gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95
96 /* The mode of the elements. */
97 scalar_mode elt_mode;
98
99 /* The value of each element if all elements are the same, or the
100 first value if the constant is a series. */
101 rtx value;
102
103 /* The value of the step if the constant is a series, null otherwise. */
104 rtx step;
105
106 /* The instruction to use to move the immediate into a vector. */
107 insn_type insn;
108
109 /* The kind of shift modifier to use, and the number of bits to shift.
110 This is (LSL, 0) if no shift is needed. */
111 modifier_type modifier;
112 unsigned int shift;
113 };
114
115 /* Construct a floating-point immediate in which each element has mode
116 ELT_MODE_IN and value VALUE_IN. */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120 modifier (LSL), shift (0)
121 {}
122
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124 and value VALUE_IN. The other parameters are as for the structure
125 fields. */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 unsigned HOST_WIDE_INT value_in,
129 insn_type insn_in, modifier_type modifier_in,
130 unsigned int shift_in)
131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136 and where element I is equal to VALUE_IN + I * STEP_IN. */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140 modifier (LSL), shift (0)
141 {}
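/* As a rough illustration of how these fields are used: an Advanced SIMD
   constant whose 32-bit elements all equal 0x00ab0000 could be described
   as elt_mode SImode, value 0xab, insn MOV, modifier LSL and shift 16,
   while a vector whose elements form the series 1, 3, 5, ... would use the
   (mode, value, step) constructor with value 1 and step 2.  The encoding
   actually chosen is decided by the immediate-classification code further
   down in this file.  */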
142
143 /* The current code model. */
144 enum aarch64_code_model aarch64_cmodel;
145
146 /* The number of 64-bit elements in an SVE vector. */
147 poly_uint16 aarch64_sve_vg;
148
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 const_tree,
157 machine_mode *, int *,
158 bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 const_tree type,
166 int misalignment,
167 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172
173 /* Major revision number of the ARM Architecture implemented by the target. */
174 unsigned aarch64_architecture_version;
175
176 /* The processor for which instructions should be scheduled. */
177 enum aarch64_processor aarch64_tune = cortexa53;
178
179 /* Mask to specify which instruction scheduling options should be used. */
180 uint64_t aarch64_tune_flags = 0;
181
182 /* Global flag for PC relative loads. */
183 bool aarch64_pcrelative_literal_loads;
184
185 /* Global flag for whether frame pointer is enabled. */
186 bool aarch64_use_frame_pointer;
187
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193
194 /* Support for command line parsing of boolean flags in the tuning
195 structures. */
196 struct aarch64_flag_desc
197 {
198 const char* name;
199 unsigned int flag;
200 };
201
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203 { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206 { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208 { "all", AARCH64_FUSE_ALL },
209 { NULL, AARCH64_FUSE_NOTHING }
210 };
211
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213 { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216 { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218 { "all", AARCH64_EXTRA_TUNE_ALL },
219 { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221
222 /* Tuning parameters. */
223
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226 {
227 1, /* hi */
228 0, /* si */
229 0, /* di */
230 1, /* ti */
231 },
232 0, /* pre_modify */
233 0, /* post_modify */
234 0, /* register_offset */
235 0, /* register_sextend */
236 0, /* register_zextend */
237 0 /* imm_offset */
238 };
239
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242 {
243 0, /* hi */
244 0, /* si */
245 0, /* di */
246 2, /* ti */
247 },
248 0, /* pre_modify */
249 0, /* post_modify */
250 1, /* register_offset */
251 1, /* register_sextend */
252 2, /* register_zextend */
253 0, /* imm_offset */
254 };
255
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258 {
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
263 },
264 1, /* pre_modify */
265 1, /* post_modify */
266 0, /* register_offset */
267 1, /* register_sextend */
268 1, /* register_zextend */
269 0, /* imm_offset */
270 };
271
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274 {
275 1, /* hi */
276 1, /* si */
277 1, /* di */
278 2, /* ti */
279 },
280 0, /* pre_modify */
281 0, /* post_modify */
282 2, /* register_offset */
283 3, /* register_sextend */
284 3, /* register_zextend */
285 0, /* imm_offset */
286 };
287
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290 {
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
295 },
296 0, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
302 };
303
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306 {
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
311 },
312 1, /* pre_modify */
313 1, /* post_modify */
314 3, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 2, /* imm_offset */
318 };
319
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322 1, /* GP2GP */
323 /* Avoid the use of slow int<->fp moves for spilling by setting
324 their cost higher than memmov_cost. */
325 5, /* GP2FP */
326 5, /* FP2GP */
327 2 /* FP2FP */
328 };
329
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332 1, /* GP2GP */
333 /* Avoid the use of slow int<->fp moves for spilling by setting
334 their cost higher than memmov_cost. */
335 5, /* GP2FP */
336 5, /* FP2GP */
337 2 /* FP2FP */
338 };
339
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342 1, /* GP2GP */
343 /* Avoid the use of slow int<->fp moves for spilling by setting
344 their cost higher than memmov_cost. */
345 5, /* GP2FP */
346 5, /* FP2GP */
347 2 /* FP2FP */
348 };
349
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352 1, /* GP2GP */
353 /* Avoid the use of slow int<->fp moves for spilling by setting
354 their cost higher than memmov_cost (the actual costs are 4 and 9). */
355 9, /* GP2FP */
356 9, /* FP2GP */
357 1 /* FP2FP */
358 };
359
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362 2, /* GP2GP */
363 2, /* GP2FP */
364 6, /* FP2GP */
365 4 /* FP2FP */
366 };
367
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370 1, /* GP2GP */
371 /* Avoid the use of slow int<->fp moves for spilling by setting
372 their cost higher than memmov_cost. */
373 8, /* GP2FP */
374 8, /* FP2GP */
375 2 /* FP2FP */
376 };
377
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380 2, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 6, /* GP2FP */
383 6, /* FP2GP */
384 4 /* FP2FP */
385 };
386
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389 1, /* GP2GP */
390 /* Avoid the use of int<->fp moves for spilling. */
391 8, /* GP2FP */
392 8, /* FP2GP */
393 4 /* FP2FP */
394 };
395
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398 1, /* GP2GP */
399 /* Avoid the use of slow int<->fp moves for spilling by setting
400 their cost higher than memmov_cost. */
401 2, /* GP2FP */
402 3, /* FP2GP */
403 2 /* FP2FP */
404 };
405
406 /* Generic costs for vector insn classes. */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409 1, /* scalar_int_stmt_cost */
410 1, /* scalar_fp_stmt_cost */
411 1, /* scalar_load_cost */
412 1, /* scalar_store_cost */
413 1, /* vec_int_stmt_cost */
414 1, /* vec_fp_stmt_cost */
415 2, /* vec_permute_cost */
416 1, /* vec_to_scalar_cost */
417 1, /* scalar_to_vec_cost */
418 1, /* vec_align_load_cost */
419 1, /* vec_unalign_load_cost */
420 1, /* vec_unalign_store_cost */
421 1, /* vec_store_cost */
422 3, /* cond_taken_branch_cost */
423 1 /* cond_not_taken_branch_cost */
424 };
425
426 /* QDF24XX costs for vector insn classes. */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429 1, /* scalar_int_stmt_cost */
430 1, /* scalar_fp_stmt_cost */
431 1, /* scalar_load_cost */
432 1, /* scalar_store_cost */
433 1, /* vec_int_stmt_cost */
434 3, /* vec_fp_stmt_cost */
435 2, /* vec_permute_cost */
436 1, /* vec_to_scalar_cost */
437 1, /* scalar_to_vec_cost */
438 1, /* vec_align_load_cost */
439 1, /* vec_unalign_load_cost */
440 1, /* vec_unalign_store_cost */
441 1, /* vec_store_cost */
442 3, /* cond_taken_branch_cost */
443 1 /* cond_not_taken_branch_cost */
444 };
445
446 /* ThunderX costs for vector insn classes. */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449 1, /* scalar_int_stmt_cost */
450 1, /* scalar_fp_stmt_cost */
451 3, /* scalar_load_cost */
452 1, /* scalar_store_cost */
453 4, /* vec_int_stmt_cost */
454 1, /* vec_fp_stmt_cost */
455 4, /* vec_permute_cost */
456 2, /* vec_to_scalar_cost */
457 2, /* scalar_to_vec_cost */
458 3, /* vec_align_load_cost */
459 5, /* vec_unalign_load_cost */
460 5, /* vec_unalign_store_cost */
461 1, /* vec_store_cost */
462 3, /* cond_taken_branch_cost */
463 3 /* cond_not_taken_branch_cost */
464 };
465
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468 1, /* scalar_int_stmt_cost */
469 1, /* scalar_fp_stmt_cost */
470 5, /* scalar_load_cost */
471 1, /* scalar_store_cost */
472 2, /* vec_int_stmt_cost */
473 2, /* vec_fp_stmt_cost */
474 2, /* vec_permute_cost */
475 3, /* vec_to_scalar_cost */
476 2, /* scalar_to_vec_cost */
477 5, /* vec_align_load_cost */
478 5, /* vec_unalign_load_cost */
479 1, /* vec_unalign_store_cost */
480 1, /* vec_store_cost */
481 1, /* cond_taken_branch_cost */
482 1 /* cond_not_taken_branch_cost */
483 };
484
485 /* Cortex-A57 costs for vector insn classes. */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488 1, /* scalar_int_stmt_cost */
489 1, /* scalar_fp_stmt_cost */
490 4, /* scalar_load_cost */
491 1, /* scalar_store_cost */
492 2, /* vec_int_stmt_cost */
493 2, /* vec_fp_stmt_cost */
494 3, /* vec_permute_cost */
495 8, /* vec_to_scalar_cost */
496 8, /* scalar_to_vec_cost */
497 4, /* vec_align_load_cost */
498 4, /* vec_unalign_load_cost */
499 1, /* vec_unalign_store_cost */
500 1, /* vec_store_cost */
501 1, /* cond_taken_branch_cost */
502 1 /* cond_not_taken_branch_cost */
503 };
504
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507 1, /* scalar_int_stmt_cost */
508 1, /* scalar_fp_stmt_cost */
509 5, /* scalar_load_cost */
510 1, /* scalar_store_cost */
511 3, /* vec_int_stmt_cost */
512 3, /* vec_fp_stmt_cost */
513 3, /* vec_permute_cost */
514 3, /* vec_to_scalar_cost */
515 3, /* scalar_to_vec_cost */
516 5, /* vec_align_load_cost */
517 5, /* vec_unalign_load_cost */
518 1, /* vec_unalign_store_cost */
519 1, /* vec_store_cost */
520 1, /* cond_taken_branch_cost */
521 1 /* cond_not_taken_branch_cost */
522 };
523
524 /* X-Gene 1 costs for vector insn classes. */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527 1, /* scalar_int_stmt_cost */
528 1, /* scalar_fp_stmt_cost */
529 5, /* scalar_load_cost */
530 1, /* scalar_store_cost */
531 2, /* vec_int_stmt_cost */
532 2, /* vec_fp_stmt_cost */
533 2, /* vec_permute_cost */
534 4, /* vec_to_scalar_cost */
535 4, /* scalar_to_vec_cost */
536 10, /* vec_align_load_cost */
537 10, /* vec_unalign_load_cost */
538 2, /* vec_unalign_store_cost */
539 2, /* vec_store_cost */
540 2, /* cond_taken_branch_cost */
541 1 /* cond_not_taken_branch_cost */
542 };
543
544 /* ThunderX2 T99 (Vulcan) costs for vector insn classes. */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547 1, /* scalar_int_stmt_cost */
548 6, /* scalar_fp_stmt_cost */
549 4, /* scalar_load_cost */
550 1, /* scalar_store_cost */
551 5, /* vec_int_stmt_cost */
552 6, /* vec_fp_stmt_cost */
553 3, /* vec_permute_cost */
554 6, /* vec_to_scalar_cost */
555 5, /* scalar_to_vec_cost */
556 8, /* vec_align_load_cost */
557 8, /* vec_unalign_load_cost */
558 4, /* vec_unalign_store_cost */
559 4, /* vec_store_cost */
560 2, /* cond_taken_branch_cost */
561 1 /* cond_not_taken_branch_cost */
562 };
563
564 /* Generic costs for branch instructions. */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567 1, /* Predictable. */
568 3 /* Unpredictable. */
569 };
570
571 /* Generic approximation modes. */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574 AARCH64_APPROX_NONE, /* division */
575 AARCH64_APPROX_NONE, /* sqrt */
576 AARCH64_APPROX_NONE /* recip_sqrt */
577 };
578
579 /* Approximation modes for Exynos M1. */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582 AARCH64_APPROX_NONE, /* division */
583 AARCH64_APPROX_ALL, /* sqrt */
584 AARCH64_APPROX_ALL /* recip_sqrt */
585 };
586
587 /* Approximation modes for X-Gene 1. */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590 AARCH64_APPROX_NONE, /* division */
591 AARCH64_APPROX_NONE, /* sqrt */
592 AARCH64_APPROX_ALL /* recip_sqrt */
593 };
594
595 /* Generic prefetch settings (which disable prefetch). */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598 0, /* num_slots */
599 -1, /* l1_cache_size */
600 -1, /* l1_cache_line_size */
601 -1, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 -1 /* default_opt_level */
605 };
606
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609 0, /* num_slots */
610 -1, /* l1_cache_size */
611 64, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
616 };
617
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620 4, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 512, /* l2_cache_size */
624 false, /* prefetch_dynamic_strides */
625 2048, /* minimum_stride */
626 3 /* default_opt_level */
627 };
628
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631 8, /* num_slots */
632 32, /* l1_cache_size */
633 128, /* l1_cache_line_size */
634 16*1024, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 3 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642 8, /* num_slots */
643 32, /* l1_cache_size */
644 128, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653 8, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 256, /* l2_cache_size */
657 true, /* prefetch_dynamic_strides */
658 -1, /* minimum_stride */
659 -1 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664 0, /* num_slots */
665 64, /* l1_cache_size */
666 64, /* l1_cache_line_size */
667 512, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 -1 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 64, /* l1_cache_line_size */
678 256, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const struct tune_params generic_tunings =
685 {
686 &cortexa57_extra_costs,
687 &generic_addrcost_table,
688 &generic_regmove_cost,
689 &generic_vector_cost,
690 &generic_branch_cost,
691 &generic_approx_modes,
692 SVE_NOT_IMPLEMENTED, /* sve_width */
693 4, /* memmov_cost */
694 2, /* issue_rate */
695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
696 "8", /* function_align. */
697 "4", /* jump_align. */
698 "8", /* loop_align. */
699 2, /* int_reassoc_width. */
700 4, /* fp_reassoc_width. */
701 1, /* vec_reassoc_width. */
702 2, /* min_div_recip_mul_sf. */
703 2, /* min_div_recip_mul_df. */
704 0, /* max_case_values. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
707 &generic_prefetch_tune
708 };
709
710 static const struct tune_params cortexa35_tunings =
711 {
712 &cortexa53_extra_costs,
713 &generic_addrcost_table,
714 &cortexa53_regmove_cost,
715 &generic_vector_cost,
716 &generic_branch_cost,
717 &generic_approx_modes,
718 SVE_NOT_IMPLEMENTED, /* sve_width */
719 4, /* memmov_cost */
720 1, /* issue_rate */
721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
723 "16", /* function_align. */
724 "4", /* jump_align. */
725 "8", /* loop_align. */
726 2, /* int_reassoc_width. */
727 4, /* fp_reassoc_width. */
728 1, /* vec_reassoc_width. */
729 2, /* min_div_recip_mul_sf. */
730 2, /* min_div_recip_mul_df. */
731 0, /* max_case_values. */
732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
734 &generic_prefetch_tune
735 };
736
737 static const struct tune_params cortexa53_tunings =
738 {
739 &cortexa53_extra_costs,
740 &generic_addrcost_table,
741 &cortexa53_regmove_cost,
742 &generic_vector_cost,
743 &generic_branch_cost,
744 &generic_approx_modes,
745 SVE_NOT_IMPLEMENTED, /* sve_width */
746 4, /* memmov_cost */
747 2, /* issue_rate */
748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
750 "16", /* function_align. */
751 "4", /* jump_align. */
752 "8", /* loop_align. */
753 2, /* int_reassoc_width. */
754 4, /* fp_reassoc_width. */
755 1, /* vec_reassoc_width. */
756 2, /* min_div_recip_mul_sf. */
757 2, /* min_div_recip_mul_df. */
758 0, /* max_case_values. */
759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
761 &generic_prefetch_tune
762 };
763
764 static const struct tune_params cortexa57_tunings =
765 {
766 &cortexa57_extra_costs,
767 &generic_addrcost_table,
768 &cortexa57_regmove_cost,
769 &cortexa57_vector_cost,
770 &generic_branch_cost,
771 &generic_approx_modes,
772 SVE_NOT_IMPLEMENTED, /* sve_width */
773 4, /* memmov_cost */
774 3, /* issue_rate */
775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
777 "16", /* function_align. */
778 "4", /* jump_align. */
779 "8", /* loop_align. */
780 2, /* int_reassoc_width. */
781 4, /* fp_reassoc_width. */
782 1, /* vec_reassoc_width. */
783 2, /* min_div_recip_mul_sf. */
784 2, /* min_div_recip_mul_df. */
785 0, /* max_case_values. */
786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
788 &generic_prefetch_tune
789 };
790
791 static const struct tune_params cortexa72_tunings =
792 {
793 &cortexa57_extra_costs,
794 &generic_addrcost_table,
795 &cortexa57_regmove_cost,
796 &cortexa57_vector_cost,
797 &generic_branch_cost,
798 &generic_approx_modes,
799 SVE_NOT_IMPLEMENTED, /* sve_width */
800 4, /* memmov_cost */
801 3, /* issue_rate */
802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
804 "16", /* function_align. */
805 "4", /* jump_align. */
806 "8", /* loop_align. */
807 2, /* int_reassoc_width. */
808 4, /* fp_reassoc_width. */
809 1, /* vec_reassoc_width. */
810 2, /* min_div_recip_mul_sf. */
811 2, /* min_div_recip_mul_df. */
812 0, /* max_case_values. */
813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
815 &generic_prefetch_tune
816 };
817
818 static const struct tune_params cortexa73_tunings =
819 {
820 &cortexa57_extra_costs,
821 &generic_addrcost_table,
822 &cortexa57_regmove_cost,
823 &cortexa57_vector_cost,
824 &generic_branch_cost,
825 &generic_approx_modes,
826 SVE_NOT_IMPLEMENTED, /* sve_width */
827 4, /* memmov_cost. */
828 2, /* issue_rate. */
829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
831 "16", /* function_align. */
832 "4", /* jump_align. */
833 "8", /* loop_align. */
834 2, /* int_reassoc_width. */
835 4, /* fp_reassoc_width. */
836 1, /* vec_reassoc_width. */
837 2, /* min_div_recip_mul_sf. */
838 2, /* min_div_recip_mul_df. */
839 0, /* max_case_values. */
840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
842 &generic_prefetch_tune
843 };
844
845
846
847 static const struct tune_params exynosm1_tunings =
848 {
849 &exynosm1_extra_costs,
850 &exynosm1_addrcost_table,
851 &exynosm1_regmove_cost,
852 &exynosm1_vector_cost,
853 &generic_branch_cost,
854 &exynosm1_approx_modes,
855 SVE_NOT_IMPLEMENTED, /* sve_width */
856 4, /* memmov_cost */
857 3, /* issue_rate */
858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
859 "4", /* function_align. */
860 "4", /* jump_align. */
861 "4", /* loop_align. */
862 2, /* int_reassoc_width. */
863 4, /* fp_reassoc_width. */
864 1, /* vec_reassoc_width. */
865 2, /* min_div_recip_mul_sf. */
866 2, /* min_div_recip_mul_df. */
867 48, /* max_case_values. */
868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
870 &exynosm1_prefetch_tune
871 };
872
873 static const struct tune_params thunderxt88_tunings =
874 {
875 &thunderx_extra_costs,
876 &generic_addrcost_table,
877 &thunderx_regmove_cost,
878 &thunderx_vector_cost,
879 &generic_branch_cost,
880 &generic_approx_modes,
881 SVE_NOT_IMPLEMENTED, /* sve_width */
882 6, /* memmov_cost */
883 2, /* issue_rate */
884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
885 "8", /* function_align. */
886 "8", /* jump_align. */
887 "8", /* loop_align. */
888 2, /* int_reassoc_width. */
889 4, /* fp_reassoc_width. */
890 1, /* vec_reassoc_width. */
891 2, /* min_div_recip_mul_sf. */
892 2, /* min_div_recip_mul_df. */
893 0, /* max_case_values. */
894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
896 &thunderxt88_prefetch_tune
897 };
898
899 static const struct tune_params thunderx_tunings =
900 {
901 &thunderx_extra_costs,
902 &generic_addrcost_table,
903 &thunderx_regmove_cost,
904 &thunderx_vector_cost,
905 &generic_branch_cost,
906 &generic_approx_modes,
907 SVE_NOT_IMPLEMENTED, /* sve_width */
908 6, /* memmov_cost */
909 2, /* issue_rate */
910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
911 "8", /* function_align. */
912 "8", /* jump_align. */
913 "8", /* loop_align. */
914 2, /* int_reassoc_width. */
915 4, /* fp_reassoc_width. */
916 1, /* vec_reassoc_width. */
917 2, /* min_div_recip_mul_sf. */
918 2, /* min_div_recip_mul_df. */
919 0, /* max_case_values. */
920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
923 &thunderx_prefetch_tune
924 };
925
926 static const struct tune_params tsv110_tunings =
927 {
928 &tsv110_extra_costs,
929 &tsv110_addrcost_table,
930 &tsv110_regmove_cost,
931 &tsv110_vector_cost,
932 &generic_branch_cost,
933 &generic_approx_modes,
934 SVE_NOT_IMPLEMENTED, /* sve_width */
935 4, /* memmov_cost */
936 4, /* issue_rate */
937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
939 "16", /* function_align. */
940 "4", /* jump_align. */
941 "8", /* loop_align. */
942 2, /* int_reassoc_width. */
943 4, /* fp_reassoc_width. */
944 1, /* vec_reassoc_width. */
945 2, /* min_div_recip_mul_sf. */
946 2, /* min_div_recip_mul_df. */
947 0, /* max_case_values. */
948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
950 &tsv110_prefetch_tune
951 };
952
953 static const struct tune_params xgene1_tunings =
954 {
955 &xgene1_extra_costs,
956 &xgene1_addrcost_table,
957 &xgene1_regmove_cost,
958 &xgene1_vector_cost,
959 &generic_branch_cost,
960 &xgene1_approx_modes,
961 SVE_NOT_IMPLEMENTED, /* sve_width */
962 6, /* memmov_cost */
963 4, /* issue_rate */
964 AARCH64_FUSE_NOTHING, /* fusible_ops */
965 "16", /* function_align. */
966 "16", /* jump_align. */
967 "16", /* loop_align. */
968 2, /* int_reassoc_width. */
969 4, /* fp_reassoc_width. */
970 1, /* vec_reassoc_width. */
971 2, /* min_div_recip_mul_sf. */
972 2, /* min_div_recip_mul_df. */
973 17, /* max_case_values. */
974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
976 &xgene1_prefetch_tune
977 };
978
979 static const struct tune_params emag_tunings =
980 {
981 &xgene1_extra_costs,
982 &xgene1_addrcost_table,
983 &xgene1_regmove_cost,
984 &xgene1_vector_cost,
985 &generic_branch_cost,
986 &xgene1_approx_modes,
987 SVE_NOT_IMPLEMENTED,
988 6, /* memmov_cost */
989 4, /* issue_rate */
990 AARCH64_FUSE_NOTHING, /* fusible_ops */
991 "16", /* function_align. */
992 "16", /* jump_align. */
993 "16", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 17, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1002 &xgene1_prefetch_tune
1003 };
1004
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007 &qdf24xx_extra_costs,
1008 &qdf24xx_addrcost_table,
1009 &qdf24xx_regmove_cost,
1010 &qdf24xx_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost */
1015 4, /* issue_rate */
1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1018 "16", /* function_align. */
1019 "8", /* jump_align. */
1020 "16", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1029 &qdf24xx_prefetch_tune
1030 };
1031
1032 /* Tuning structure for the Qualcomm Saphira core. Default to generic values
1033 for now. */
1034 static const struct tune_params saphira_tunings =
1035 {
1036 &generic_extra_costs,
1037 &generic_addrcost_table,
1038 &generic_regmove_cost,
1039 &generic_vector_cost,
1040 &generic_branch_cost,
1041 &generic_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 4, /* issue_rate */
1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1047 "16", /* function_align. */
1048 "8", /* jump_align. */
1049 "16", /* loop_align. */
1050 2, /* int_reassoc_width. */
1051 4, /* fp_reassoc_width. */
1052 1, /* vec_reassoc_width. */
1053 2, /* min_div_recip_mul_sf. */
1054 2, /* min_div_recip_mul_df. */
1055 0, /* max_case_values. */
1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1058 &generic_prefetch_tune
1059 };
1060
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063 &thunderx2t99_extra_costs,
1064 &thunderx2t99_addrcost_table,
1065 &thunderx2t99_regmove_cost,
1066 &thunderx2t99_vector_cost,
1067 &generic_branch_cost,
1068 &generic_approx_modes,
1069 SVE_NOT_IMPLEMENTED, /* sve_width */
1070 4, /* memmov_cost. */
1071 4, /* issue_rate. */
1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1074 "16", /* function_align. */
1075 "8", /* jump_align. */
1076 "16", /* loop_align. */
1077 3, /* int_reassoc_width. */
1078 2, /* fp_reassoc_width. */
1079 2, /* vec_reassoc_width. */
1080 2, /* min_div_recip_mul_sf. */
1081 2, /* min_div_recip_mul_df. */
1082 0, /* max_case_values. */
1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1085 &thunderx2t99_prefetch_tune
1086 };
1087
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090 &cortexa57_extra_costs,
1091 &generic_addrcost_table,
1092 &generic_regmove_cost,
1093 &cortexa57_vector_cost,
1094 &generic_branch_cost,
1095 &generic_approx_modes,
1096 SVE_NOT_IMPLEMENTED, /* sve_width */
1097 4, /* memmov_cost */
1098 3, /* issue_rate */
1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1100 "32:16", /* function_align. */
1101 "32:16", /* jump_align. */
1102 "32:16", /* loop_align. */
1103 2, /* int_reassoc_width. */
1104 4, /* fp_reassoc_width. */
1105 2, /* vec_reassoc_width. */
1106 2, /* min_div_recip_mul_sf. */
1107 2, /* min_div_recip_mul_df. */
1108 0, /* max_case_values. */
1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1111 &generic_prefetch_tune
1112 };
1113
1114 /* Support for fine-grained override of the tuning structures. */
1115 struct aarch64_tuning_override_function
1116 {
1117 const char* name;
1118 void (*parse_override)(const char*, struct tune_params*);
1119 };
1120
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128 { "fuse", aarch64_parse_fuse_string },
1129 { "tune", aarch64_parse_tune_string },
1130 { "sve_width", aarch64_parse_sve_width_string },
1131 { NULL, NULL }
1132 };
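/* These parsers are invoked by the -moverride handling further down in this
   file; for example an option string such as -moverride=sve_width=256
   (illustrative) would be dispatched to aarch64_parse_sve_width_string,
   which then adjusts the sve_width field of the active tune_params.  */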
1133
1134 /* A processor implementing AArch64. */
1135 struct processor
1136 {
1137 const char *const name;
1138 enum aarch64_processor ident;
1139 enum aarch64_processor sched_core;
1140 enum aarch64_arch arch;
1141 unsigned architecture_version;
1142 const uint64_t flags;
1143 const struct tune_params *const tune;
1144 };
1145
1146 /* Architectures implementing AArch64. */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154
1155 /* Processor cores implementing AArch64. */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1161 FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167
1168
1169 /* Target specification. These are populated by the -march, -mtune, -mcpu
1170 handling code or by target attributes. */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174
1175 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1176
1177 /* The current tuning set. */
1178 struct tune_params aarch64_tune_params = generic_tunings;
1179
1180 /* Table of machine attributes. */
1181 static const struct attribute_spec aarch64_attribute_table[] =
1182 {
1183 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1184 affects_type_identity, handler, exclude } */
1185 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1186 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1187 };
1188
1189 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1190
1191 /* An ISA extension in the co-processor and main instruction set space. */
1192 struct aarch64_option_extension
1193 {
1194 const char *const name;
1195 const unsigned long flags_on;
1196 const unsigned long flags_off;
1197 };
1198
1199 typedef enum aarch64_cond_code
1200 {
1201 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1202 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1203 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1204 }
1205 aarch64_cc;
1206
1207 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
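/* The XOR with 1 relies on the enumeration above being laid out in inverse
   pairs: for example AARCH64_EQ (0) maps to AARCH64_NE (1) and
   AARCH64_GE (10) maps to AARCH64_LT (11), and vice versa.  */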
1208
1209 struct aarch64_branch_protect_type
1210 {
1211 /* The type's name that the user passes to the branch-protection option
1212 string. */
1213 const char* name;
1214 /* Function to handle the protection type and set global variables.
1215 First argument is the string token corresponding with this type and the
1216 second argument is the next token in the option string.
1217 Return values:
1218 * AARCH64_PARSE_OK: Handling was successful.
1219 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1220 should print an error.
1221 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1222 own error. */
1223 enum aarch64_parse_opt_result (*handler)(char*, char*);
1224 /* A list of types that can follow this type in the option string. */
1225 const aarch64_branch_protect_type* subtypes;
1226 unsigned int num_subtypes;
1227 };
1228
1229 static enum aarch64_parse_opt_result
1230 aarch64_handle_no_branch_protection (char* str, char* rest)
1231 {
1232 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1233 aarch64_enable_bti = 0;
1234 if (rest)
1235 {
1236 error ("unexpected %<%s%> after %<%s%>", rest, str);
1237 return AARCH64_PARSE_INVALID_FEATURE;
1238 }
1239 return AARCH64_PARSE_OK;
1240 }
1241
1242 static enum aarch64_parse_opt_result
1243 aarch64_handle_standard_branch_protection (char* str, char* rest)
1244 {
1245 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1246 aarch64_ra_sign_key = AARCH64_KEY_A;
1247 aarch64_enable_bti = 1;
1248 if (rest)
1249 {
1250 error ("unexpected %<%s%> after %<%s%>", rest, str);
1251 return AARCH64_PARSE_INVALID_FEATURE;
1252 }
1253 return AARCH64_PARSE_OK;
1254 }
1255
1256 static enum aarch64_parse_opt_result
1257 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1258 char* rest ATTRIBUTE_UNUSED)
1259 {
1260 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1261 aarch64_ra_sign_key = AARCH64_KEY_A;
1262 return AARCH64_PARSE_OK;
1263 }
1264
1265 static enum aarch64_parse_opt_result
1266 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1267 char* rest ATTRIBUTE_UNUSED)
1268 {
1269 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1270 return AARCH64_PARSE_OK;
1271 }
1272
1273 static enum aarch64_parse_opt_result
1274 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1275 char* rest ATTRIBUTE_UNUSED)
1276 {
1277 aarch64_ra_sign_key = AARCH64_KEY_B;
1278 return AARCH64_PARSE_OK;
1279 }
1280
1281 static enum aarch64_parse_opt_result
1282 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1283 char* rest ATTRIBUTE_UNUSED)
1284 {
1285 aarch64_enable_bti = 1;
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1290 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1291 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1292 { NULL, NULL, NULL, 0 }
1293 };
1294
1295 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1296 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1297 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1298 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1299 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1300 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1301 { NULL, NULL, NULL, 0 }
1302 };
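/* Taken together, the tables above describe option strings such as
   -mbranch-protection=standard (BTI plus A-key signing of non-leaf
   functions) or -mbranch-protection=pac-ret+leaf+b-key (B-key signing of
   all functions, without BTI), with each '+'-separated token handled by
   the matching entry and its subtypes.  */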
1303
1304 /* The condition codes of the processor, and the inverse function. */
1305 static const char * const aarch64_condition_codes[] =
1306 {
1307 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1308 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1309 };
1310
1311 /* The preferred condition codes for SVE conditions. */
1312 static const char *const aarch64_sve_condition_codes[] =
1313 {
1314 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1315 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1316 };
1317
1318 /* Generate code for conditional branches to targets that are out of range, e.g. in functions over 1 MiB. */
1319 const char *
1320 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1321 const char * branch_format)
1322 {
1323 rtx_code_label * tmp_label = gen_label_rtx ();
1324 char label_buf[256];
1325 char buffer[128];
1326 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1327 CODE_LABEL_NUMBER (tmp_label));
1328 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1329 rtx dest_label = operands[pos_label];
1330 operands[pos_label] = tmp_label;
1331
1332 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1333 output_asm_insn (buffer, operands);
1334
1335 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1336 operands[pos_label] = dest_label;
1337 output_asm_insn (buffer, operands);
1338 return "";
1339 }
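/* For example, when a conditional branch cannot reach its target, the
   caller passes the inverted short-range branch as BRANCH_FORMAT and the
   code above emits, roughly:

       <inverted-branch>   .Ltmp
       b                   <original target>
     .Ltmp:

   where .Ltmp stands for the internally generated label.  */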
1340
1341 void
1342 aarch64_err_no_fpadvsimd (machine_mode mode)
1343 {
1344 if (TARGET_GENERAL_REGS_ONLY)
1345 if (FLOAT_MODE_P (mode))
1346 error ("%qs is incompatible with the use of floating-point types",
1347 "-mgeneral-regs-only");
1348 else
1349 error ("%qs is incompatible with the use of vector types",
1350 "-mgeneral-regs-only");
1351 else
1352 if (FLOAT_MODE_P (mode))
1353 error ("%qs feature modifier is incompatible with the use of"
1354 " floating-point types", "+nofp");
1355 else
1356 error ("%qs feature modifier is incompatible with the use of"
1357 " vector types", "+nofp");
1358 }
1359
1360 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1361 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1362 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1363 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1364 and GENERAL_REGS is lower than the memory cost (in this case the best class
1365 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1366 cost results in bad allocations with many redundant int<->FP moves which
1367 are expensive on various cores.
1368 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1369 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1370 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1371 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1372 The result of this is that it is no longer inefficient to have a higher
1373 memory move cost than the register move cost.
1374 */
1375
1376 static reg_class_t
1377 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1378 reg_class_t best_class)
1379 {
1380 machine_mode mode;
1381
1382 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1383 || !reg_class_subset_p (FP_REGS, allocno_class))
1384 return allocno_class;
1385
1386 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1387 || !reg_class_subset_p (FP_REGS, best_class))
1388 return best_class;
1389
1390 mode = PSEUDO_REGNO_MODE (regno);
1391 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1392 }
1393
1394 static unsigned int
1395 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1396 {
1397 if (GET_MODE_UNIT_SIZE (mode) == 4)
1398 return aarch64_tune_params.min_div_recip_mul_sf;
1399 return aarch64_tune_params.min_div_recip_mul_df;
1400 }
1401
1402 /* Return the reassociation width of treeop OPC with mode MODE. */
1403 static int
1404 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1405 {
1406 if (VECTOR_MODE_P (mode))
1407 return aarch64_tune_params.vec_reassoc_width;
1408 if (INTEGRAL_MODE_P (mode))
1409 return aarch64_tune_params.int_reassoc_width;
1410 /* Avoid reassociating floating point addition so we emit more FMAs. */
1411 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1412 return aarch64_tune_params.fp_reassoc_width;
1413 return 1;
1414 }
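/* Under generic_tunings above, for example, this returns a width of 2 for
   integer modes, 4 for scalar floating-point modes other than addition
   (which is left at 1 so that FMAs can still be formed) and 1 for vector
   modes.  */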
1415
1416 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1417 unsigned
1418 aarch64_dbx_register_number (unsigned regno)
1419 {
1420 if (GP_REGNUM_P (regno))
1421 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1422 else if (regno == SP_REGNUM)
1423 return AARCH64_DWARF_SP;
1424 else if (FP_REGNUM_P (regno))
1425 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1426 else if (PR_REGNUM_P (regno))
1427 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1428 else if (regno == VG_REGNUM)
1429 return AARCH64_DWARF_VG;
1430
1431 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1432 equivalent DWARF register. */
1433 return DWARF_FRAME_REGISTERS;
1434 }
1435
1436 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1437 static bool
1438 aarch64_advsimd_struct_mode_p (machine_mode mode)
1439 {
1440 return (TARGET_SIMD
1441 && (mode == OImode || mode == CImode || mode == XImode));
1442 }
1443
1444 /* Return true if MODE is an SVE predicate mode. */
1445 static bool
1446 aarch64_sve_pred_mode_p (machine_mode mode)
1447 {
1448 return (TARGET_SVE
1449 && (mode == VNx16BImode
1450 || mode == VNx8BImode
1451 || mode == VNx4BImode
1452 || mode == VNx2BImode));
1453 }
1454
1455 /* Three mutually-exclusive flags describing a vector or predicate type. */
1456 const unsigned int VEC_ADVSIMD = 1;
1457 const unsigned int VEC_SVE_DATA = 2;
1458 const unsigned int VEC_SVE_PRED = 4;
1459 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1460 a structure of 2, 3 or 4 vectors. */
1461 const unsigned int VEC_STRUCT = 8;
1462 /* Useful combinations of the above. */
1463 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1464 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1465
1466 /* Return a set of flags describing the vector properties of mode MODE.
1467 Ignore modes that are not supported by the current target. */
1468 static unsigned int
1469 aarch64_classify_vector_mode (machine_mode mode)
1470 {
1471 if (aarch64_advsimd_struct_mode_p (mode))
1472 return VEC_ADVSIMD | VEC_STRUCT;
1473
1474 if (aarch64_sve_pred_mode_p (mode))
1475 return VEC_SVE_PRED;
1476
1477 scalar_mode inner = GET_MODE_INNER (mode);
1478 if (VECTOR_MODE_P (mode)
1479 && (inner == QImode
1480 || inner == HImode
1481 || inner == HFmode
1482 || inner == SImode
1483 || inner == SFmode
1484 || inner == DImode
1485 || inner == DFmode))
1486 {
1487 if (TARGET_SVE)
1488 {
1489 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1490 return VEC_SVE_DATA;
1491 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1492 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1493 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1494 return VEC_SVE_DATA | VEC_STRUCT;
1495 }
1496
1497 /* This includes V1DF but not V1DI (which doesn't exist). */
1498 if (TARGET_SIMD
1499 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1500 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1501 return VEC_ADVSIMD;
1502 }
1503
1504 return 0;
1505 }
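/* For example, V4SImode (a 128-bit Advanced SIMD vector) is classified
   here as VEC_ADVSIMD when TARGET_SIMD is enabled, VNx4SImode (an SVE
   vector of 32-bit elements) as VEC_SVE_DATA when TARGET_SVE is enabled,
   and OImode (a structure of two 128-bit vectors) as
   VEC_ADVSIMD | VEC_STRUCT.  */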
1506
1507 /* Return true if MODE is any of the data vector modes, including
1508 structure modes. */
1509 static bool
1510 aarch64_vector_data_mode_p (machine_mode mode)
1511 {
1512 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1513 }
1514
1515 /* Return true if MODE is an SVE data vector mode; either a single vector
1516 or a structure of vectors. */
1517 static bool
1518 aarch64_sve_data_mode_p (machine_mode mode)
1519 {
1520 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1521 }
1522
1523 /* Implement target hook TARGET_ARRAY_MODE. */
1524 static opt_machine_mode
1525 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1526 {
1527 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1528 && IN_RANGE (nelems, 2, 4))
1529 return mode_for_vector (GET_MODE_INNER (mode),
1530 GET_MODE_NUNITS (mode) * nelems);
1531
1532 return opt_machine_mode ();
1533 }
1534
1535 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1536 static bool
1537 aarch64_array_mode_supported_p (machine_mode mode,
1538 unsigned HOST_WIDE_INT nelems)
1539 {
1540 if (TARGET_SIMD
1541 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1542 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1543 && (nelems >= 2 && nelems <= 4))
1544 return true;
1545
1546 return false;
1547 }
1548
1549 /* Return the SVE predicate mode to use for elements that have
1550 ELEM_NBYTES bytes, if such a mode exists. */
1551
1552 opt_machine_mode
1553 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1554 {
1555 if (TARGET_SVE)
1556 {
1557 if (elem_nbytes == 1)
1558 return VNx16BImode;
1559 if (elem_nbytes == 2)
1560 return VNx8BImode;
1561 if (elem_nbytes == 4)
1562 return VNx4BImode;
1563 if (elem_nbytes == 8)
1564 return VNx2BImode;
1565 }
1566 return opt_machine_mode ();
1567 }
1568
1569 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1570
1571 static opt_machine_mode
1572 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1573 {
1574 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1575 {
1576 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1577 machine_mode pred_mode;
1578 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1579 return pred_mode;
1580 }
1581
1582 return default_get_mask_mode (nunits, nbytes);
1583 }
1584
1585 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1586 prefer to use the first arithmetic operand as the else value if
1587 the else value doesn't matter, since that exactly matches the SVE
1588 destructive merging form. For ternary operations we could either
1589 pick the first operand and use FMAD-like instructions or the last
1590 operand and use FMLA-like instructions; the latter seems more
1591 natural. */
1592
1593 static tree
1594 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1595 {
1596 return nops == 3 ? ops[2] : ops[0];
1597 }
1598
1599 /* Implement TARGET_HARD_REGNO_NREGS. */
1600
1601 static unsigned int
1602 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1603 {
1604 /* ??? Logically we should only need to provide a value when
1605 HARD_REGNO_MODE_OK says that the combination is valid,
1606 but at the moment we need to handle all modes. Just ignore
1607 any runtime parts for registers that can't store them. */
1608 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1609 switch (aarch64_regno_regclass (regno))
1610 {
1611 case FP_REGS:
1612 case FP_LO_REGS:
1613 if (aarch64_sve_data_mode_p (mode))
1614 return exact_div (GET_MODE_SIZE (mode),
1615 BYTES_PER_SVE_VECTOR).to_constant ();
1616 return CEIL (lowest_size, UNITS_PER_VREG);
1617 case PR_REGS:
1618 case PR_LO_REGS:
1619 case PR_HI_REGS:
1620 return 1;
1621 default:
1622 return CEIL (lowest_size, UNITS_PER_WORD);
1623 }
1624 gcc_unreachable ();
1625 }
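/* For example, a TFmode value (16 bytes) occupies a single FP register but
   two general-purpose registers, while any SVE predicate mode occupies
   exactly one predicate register.  */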
1626
1627 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1628
1629 static bool
1630 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1631 {
1632 if (GET_MODE_CLASS (mode) == MODE_CC)
1633 return regno == CC_REGNUM;
1634
1635 if (regno == VG_REGNUM)
1636 /* This must have the same size as _Unwind_Word. */
1637 return mode == DImode;
1638
1639 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1640 if (vec_flags & VEC_SVE_PRED)
1641 return PR_REGNUM_P (regno);
1642
1643 if (PR_REGNUM_P (regno))
1644 return 0;
1645
1646 if (regno == SP_REGNUM)
1647 /* The purpose of comparing with ptr_mode is to support the
1648 global register variable associated with the stack pointer
1649 register via the syntax of asm ("wsp") in ILP32. */
1650 return mode == Pmode || mode == ptr_mode;
1651
1652 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1653 return mode == Pmode;
1654
1655 if (GP_REGNUM_P (regno))
1656 {
1657 if (known_le (GET_MODE_SIZE (mode), 8))
1658 return true;
1659 else if (known_le (GET_MODE_SIZE (mode), 16))
1660 return (regno & 1) == 0;
1661 }
1662 else if (FP_REGNUM_P (regno))
1663 {
1664 if (vec_flags & VEC_STRUCT)
1665 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1666 else
1667 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1668 }
1669
1670 return false;
1671 }
1672
1673 /* Return true if this is a definition of a vectorized simd function. */
1674
1675 static bool
1676 aarch64_simd_decl_p (tree fndecl)
1677 {
1678 tree fntype;
1679
1680 if (fndecl == NULL)
1681 return false;
1682 fntype = TREE_TYPE (fndecl);
1683 if (fntype == NULL)
1684 return false;
1685
1686 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1687 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1688 return true;
1689
1690 return false;
1691 }
1692
1693 /* Return the mode a register save/restore should use. DImode for integer
1694 registers, DFmode for FP registers in non-SIMD functions (they only save
1695 the bottom half of a 128 bit register), or TFmode for FP registers in
1696 SIMD functions. */
1697
1698 static machine_mode
1699 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1700 {
1701 return GP_REGNUM_P (regno)
1702 ? E_DImode
1703 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1704 }
1705
1706 /* Return true if the instruction is a call to a SIMD function, false
1707 if it is not a SIMD function or if we do not know anything about
1708 the function. */
1709
1710 static bool
1711 aarch64_simd_call_p (rtx_insn *insn)
1712 {
1713 rtx symbol;
1714 rtx call;
1715 tree fndecl;
1716
1717 gcc_assert (CALL_P (insn));
1718 call = get_call_rtx_from (insn);
1719 symbol = XEXP (XEXP (call, 0), 0);
1720 if (GET_CODE (symbol) != SYMBOL_REF)
1721 return false;
1722 fndecl = SYMBOL_REF_DECL (symbol);
1723 if (!fndecl)
1724 return false;
1725
1726 return aarch64_simd_decl_p (fndecl);
1727 }
1728
1729 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1730 a function that uses the SIMD ABI, take advantage of the extra
1731 call-preserved registers that the ABI provides. */
1732
1733 void
1734 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1735 HARD_REG_SET *return_set)
1736 {
1737 if (aarch64_simd_call_p (insn))
1738 {
1739 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1740 if (FP_SIMD_SAVED_REGNUM_P (regno))
1741 CLEAR_HARD_REG_BIT (*return_set, regno);
1742 }
1743 }
1744
1745 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1746 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1747 clobbers the top 64 bits when restoring the bottom 64 bits. */
1748
1749 static bool
1750 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1751 machine_mode mode)
1752 {
1753 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1754 return FP_REGNUM_P (regno)
1755 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1756 }
1757
1758 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1759
1760 rtx_insn *
1761 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1762 {
1763 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1764
1765 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1766 return call_1;
1767 else
1768 return call_2;
1769 }
1770
1771 /* Implement REGMODE_NATURAL_SIZE. */
1772 poly_uint64
1773 aarch64_regmode_natural_size (machine_mode mode)
1774 {
1775 /* The natural size for SVE data modes is one SVE data vector,
1776 and similarly for predicates. We can't independently modify
1777 anything smaller than that. */
1778 /* ??? For now, only do this for variable-width SVE registers.
1779 Doing it for constant-sized registers breaks lower-subreg.c. */
1780 /* ??? And once that's fixed, we should probably have similar
1781 code for Advanced SIMD. */
1782 if (!aarch64_sve_vg.is_constant ())
1783 {
1784 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1785 if (vec_flags & VEC_SVE_PRED)
1786 return BYTES_PER_SVE_PRED;
1787 if (vec_flags & VEC_SVE_DATA)
1788 return BYTES_PER_SVE_VECTOR;
1789 }
1790 return UNITS_PER_WORD;
1791 }
1792
1793 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1794 machine_mode
1795 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1796 machine_mode mode)
1797 {
1798 /* The predicate mode determines which bits are significant and
1799 which are "don't care". Decreasing the number of lanes would
1800 lose data while increasing the number of lanes would make bits
1801 unnecessarily significant. */
1802 if (PR_REGNUM_P (regno))
1803 return mode;
1804 if (known_ge (GET_MODE_SIZE (mode), 4))
1805 return mode;
1806 else
1807 return SImode;
1808 }
1809
1810 /* Return true if I's bits are consecutive ones from the MSB. */
1811 bool
1812 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1813 {
1814 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1815 }
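/* Worked example (illustrative): for I = 0xffffffffffffff00 we have
   -I = 0x100 and exact_log2 (0x100) = 8 != -1, so the function returns
   true (the top 56 bits are all ones).  For I = 0xff00ff00ff00ff00,
   -I is not a power of two, so the function returns false.  */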
1816
1817 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1818 that strcpy from constants will be faster. */
1819
1820 static HOST_WIDE_INT
1821 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1822 {
1823 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1824 return MAX (align, BITS_PER_WORD);
1825 return align;
1826 }
1827
1828 /* Return true if calls to DECL should be treated as
1829 long-calls (ie called via a register). */
1830 static bool
1831 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1832 {
1833 return false;
1834 }
1835
1836 /* Return true if calls to symbol-ref SYM should be treated as
1837 long-calls (ie called via a register). */
1838 bool
1839 aarch64_is_long_call_p (rtx sym)
1840 {
1841 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1842 }
1843
1844 /* Return true if calls to symbol-ref SYM should not go through
1845 plt stubs. */
1846
1847 bool
1848 aarch64_is_noplt_call_p (rtx sym)
1849 {
1850 const_tree decl = SYMBOL_REF_DECL (sym);
1851
1852 if (flag_pic
1853 && decl
1854 && (!flag_plt
1855 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1856 && !targetm.binds_local_p (decl))
1857 return true;
1858
1859 return false;
1860 }
1861
1862 /* Return true if the offsets to a zero/sign-extract operation
1863 represent an expression that matches an extend operation. The
1864 operands represent the parameters from
1865
1866 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1867 bool
1868 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1869 rtx extract_imm)
1870 {
1871 HOST_WIDE_INT mult_val, extract_val;
1872
1873 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1874 return false;
1875
1876 mult_val = INTVAL (mult_imm);
1877 extract_val = INTVAL (extract_imm);
1878
1879 if (extract_val > 8
1880 && extract_val < GET_MODE_BITSIZE (mode)
1881 && exact_log2 (extract_val & ~7) > 0
1882 && (extract_val & 7) <= 4
1883 && mult_val == (1 << (extract_val & 7)))
1884 return true;
1885
1886 return false;
1887 }
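/* Worked example (for illustration only): in DImode, MULT_IMM = 4 and
   EXTRACT_IMM = 34 pass all of the checks above: 34 > 8, 34 < 64,
   exact_log2 (34 & ~7) = exact_log2 (32) = 5 > 0, (34 & 7) = 2 <= 4
   and 4 == 1 << 2.  This describes extracting 34 bits of (reg * 4),
   i.e. a 32-bit extend followed by a left shift by 2.  */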
1888
1889 /* Emit an insn that's a simple single-set. Both the operands must be
1890 known to be valid. */
1891 inline static rtx_insn *
1892 emit_set_insn (rtx x, rtx y)
1893 {
1894 return emit_insn (gen_rtx_SET (x, y));
1895 }
1896
1897 /* X and Y are two things to compare using CODE. Emit the compare insn and
1898 return the rtx for register 0 in the proper mode. */
1899 rtx
1900 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1901 {
1902 machine_mode mode = SELECT_CC_MODE (code, x, y);
1903 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1904
1905 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1906 return cc_reg;
1907 }
1908
1909 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
1910
1911 static rtx
1912 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1913 machine_mode y_mode)
1914 {
1915 if (y_mode == E_QImode || y_mode == E_HImode)
1916 {
1917 if (CONST_INT_P (y))
1918 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1919 else
1920 {
1921 rtx t, cc_reg;
1922 machine_mode cc_mode;
1923
1924 t = gen_rtx_ZERO_EXTEND (SImode, y);
1925 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1926 cc_mode = CC_SWPmode;
1927 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1928 emit_set_insn (cc_reg, t);
1929 return cc_reg;
1930 }
1931 }
1932
1933 return aarch64_gen_compare_reg (code, x, y);
1934 }
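/* Reader's note (a sketch of the intent, as read from the code above):
   when Y is a QImode or HImode register, the comparison is emitted as
   COMPARE ((zero_extend:SI Y), X) rather than COMPARE (X, Y), with
   CC_SWPmode recording that the operands are in swapped order.  */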
1935
1936 /* Build the SYMBOL_REF for __tls_get_addr. */
1937
1938 static GTY(()) rtx tls_get_addr_libfunc;
1939
1940 rtx
1941 aarch64_tls_get_addr (void)
1942 {
1943 if (!tls_get_addr_libfunc)
1944 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1945 return tls_get_addr_libfunc;
1946 }
1947
1948 /* Return the TLS model to use for ADDR. */
1949
1950 static enum tls_model
1951 tls_symbolic_operand_type (rtx addr)
1952 {
1953 enum tls_model tls_kind = TLS_MODEL_NONE;
1954 if (GET_CODE (addr) == CONST)
1955 {
1956 poly_int64 addend;
1957 rtx sym = strip_offset (addr, &addend);
1958 if (GET_CODE (sym) == SYMBOL_REF)
1959 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1960 }
1961 else if (GET_CODE (addr) == SYMBOL_REF)
1962 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1963
1964 return tls_kind;
1965 }
1966
1967 /* We'll allow lo_sum's in addresses in our legitimate addresses
1968 so that combine would take care of combining addresses where
1969 necessary, but for generation purposes, we'll generate the address
1970 as :
1971 RTL Absolute
1972 tmp = hi (symbol_ref); adrp x1, foo
1973 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1974 nop
1975
1976 PIC TLS
1977 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1978 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1979 bl __tls_get_addr
1980 nop
1981
1982 Load TLS symbol, depending on TLS mechanism and TLS access model.
1983
1984 Global Dynamic - Traditional TLS:
1985 adrp tmp, :tlsgd:imm
1986 add dest, tmp, #:tlsgd_lo12:imm
1987 bl __tls_get_addr
1988
1989 Global Dynamic - TLS Descriptors:
1990 adrp dest, :tlsdesc:imm
1991 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1992 add dest, dest, #:tlsdesc_lo12:imm
1993 blr tmp
1994 mrs tp, tpidr_el0
1995 add dest, dest, tp
1996
1997 Initial Exec:
1998 mrs tp, tpidr_el0
1999 adrp tmp, :gottprel:imm
2000 ldr dest, [tmp, #:gottprel_lo12:imm]
2001 add dest, dest, tp
2002
2003 Local Exec:
2004 mrs tp, tpidr_el0
2005 add t0, tp, #:tprel_hi12:imm, lsl #12
2006 add t0, t0, #:tprel_lo12_nc:imm
2007 */
2008
2009 static void
2010 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2011 enum aarch64_symbol_type type)
2012 {
2013 switch (type)
2014 {
2015 case SYMBOL_SMALL_ABSOLUTE:
2016 {
2017 /* In ILP32, the mode of dest can be either SImode or DImode. */
2018 rtx tmp_reg = dest;
2019 machine_mode mode = GET_MODE (dest);
2020
2021 gcc_assert (mode == Pmode || mode == ptr_mode);
2022
2023 if (can_create_pseudo_p ())
2024 tmp_reg = gen_reg_rtx (mode);
2025
2026 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2027 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2028 return;
2029 }
2030
2031 case SYMBOL_TINY_ABSOLUTE:
2032 emit_insn (gen_rtx_SET (dest, imm));
2033 return;
2034
2035 case SYMBOL_SMALL_GOT_28K:
2036 {
2037 machine_mode mode = GET_MODE (dest);
2038 rtx gp_rtx = pic_offset_table_rtx;
2039 rtx insn;
2040 rtx mem;
2041
2042 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2043 here before RTL expansion. Tree IVOPTs generates RTL patterns to
2044 compute rtx costs, in which case pic_offset_table_rtx is not
2045 initialized. In that case there is no need to generate the first
2046 adrp instruction, as the final cost for a global variable access
2047 is one instruction. */
2048 if (gp_rtx != NULL)
2049 {
2050 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
2051 we use the page base as the GOT base, the first page may be
2052 wasted; in the worst case only 28K of GOT space remains).
2053
2054 The instruction sequence generated for a global variable access
2055 is:
2056
2057 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2058
2059 Only one instruction is needed, but we must initialize
2060 pic_offset_table_rtx properly. We generate the initialization insn
2061 for every global access and let CSE remove the redundant copies.
2062
2063 The final instruction sequence for multiple global variable
2064 accesses will look like:
2065
2066 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2067
2068 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2069 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2070 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2071 ... */
2072
2073 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2074 crtl->uses_pic_offset_table = 1;
2075 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2076
2077 if (mode != GET_MODE (gp_rtx))
2078 gp_rtx = gen_lowpart (mode, gp_rtx);
2079
2080 }
2081
2082 if (mode == ptr_mode)
2083 {
2084 if (mode == DImode)
2085 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2086 else
2087 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2088
2089 mem = XVECEXP (SET_SRC (insn), 0, 0);
2090 }
2091 else
2092 {
2093 gcc_assert (mode == Pmode);
2094
2095 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2096 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2097 }
2098
2099 /* The operand is expected to be a MEM. Whenever the related insn
2100 pattern changes, the code above that computes MEM should be
2101 updated. */
2102 gcc_assert (GET_CODE (mem) == MEM);
2103 MEM_READONLY_P (mem) = 1;
2104 MEM_NOTRAP_P (mem) = 1;
2105 emit_insn (insn);
2106 return;
2107 }
2108
2109 case SYMBOL_SMALL_GOT_4G:
2110 {
2111 /* In ILP32, the mode of dest can be either SImode or DImode,
2112 while the got entry is always of SImode size. The mode of
2113 dest depends on how dest is used: if dest is assigned to a
2114 pointer (e.g. in the memory), it has SImode; it may have
2115 DImode if dest is dereferenced to access the memory.
2116 This is why we have to handle three different ldr_got_small
2117 patterns here (two patterns for ILP32). */
2118
2119 rtx insn;
2120 rtx mem;
2121 rtx tmp_reg = dest;
2122 machine_mode mode = GET_MODE (dest);
2123
2124 if (can_create_pseudo_p ())
2125 tmp_reg = gen_reg_rtx (mode);
2126
2127 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2128 if (mode == ptr_mode)
2129 {
2130 if (mode == DImode)
2131 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2132 else
2133 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2134
2135 mem = XVECEXP (SET_SRC (insn), 0, 0);
2136 }
2137 else
2138 {
2139 gcc_assert (mode == Pmode);
2140
2141 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2142 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2143 }
2144
2145 gcc_assert (GET_CODE (mem) == MEM);
2146 MEM_READONLY_P (mem) = 1;
2147 MEM_NOTRAP_P (mem) = 1;
2148 emit_insn (insn);
2149 return;
2150 }
2151
2152 case SYMBOL_SMALL_TLSGD:
2153 {
2154 rtx_insn *insns;
2155 machine_mode mode = GET_MODE (dest);
2156 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2157
2158 start_sequence ();
2159 if (TARGET_ILP32)
2160 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2161 else
2162 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2163 insns = get_insns ();
2164 end_sequence ();
2165
2166 RTL_CONST_CALL_P (insns) = 1;
2167 emit_libcall_block (insns, dest, result, imm);
2168 return;
2169 }
2170
2171 case SYMBOL_SMALL_TLSDESC:
2172 {
2173 machine_mode mode = GET_MODE (dest);
2174 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2175 rtx tp;
2176
2177 gcc_assert (mode == Pmode || mode == ptr_mode);
2178
2179 /* In ILP32, the got entry is always of SImode size. Unlike
2180 small GOT, the dest is fixed at reg 0. */
2181 if (TARGET_ILP32)
2182 emit_insn (gen_tlsdesc_small_si (imm));
2183 else
2184 emit_insn (gen_tlsdesc_small_di (imm));
2185 tp = aarch64_load_tp (NULL);
2186
2187 if (mode != Pmode)
2188 tp = gen_lowpart (mode, tp);
2189
2190 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2191 if (REG_P (dest))
2192 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2193 return;
2194 }
2195
2196 case SYMBOL_SMALL_TLSIE:
2197 {
2198 /* In ILP32, the mode of dest can be either SImode or DImode,
2199 while the got entry is always of SImode size. The mode of
2200 dest depends on how dest is used: if dest is assigned to a
2201 pointer (e.g. in the memory), it has SImode; it may have
2202 DImode if dest is dereferenced to access the memory.
2203 This is why we have to handle three different tlsie_small
2204 patterns here (two patterns for ILP32). */
2205 machine_mode mode = GET_MODE (dest);
2206 rtx tmp_reg = gen_reg_rtx (mode);
2207 rtx tp = aarch64_load_tp (NULL);
2208
2209 if (mode == ptr_mode)
2210 {
2211 if (mode == DImode)
2212 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2213 else
2214 {
2215 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2216 tp = gen_lowpart (mode, tp);
2217 }
2218 }
2219 else
2220 {
2221 gcc_assert (mode == Pmode);
2222 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2223 }
2224
2225 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2226 if (REG_P (dest))
2227 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2228 return;
2229 }
2230
2231 case SYMBOL_TLSLE12:
2232 case SYMBOL_TLSLE24:
2233 case SYMBOL_TLSLE32:
2234 case SYMBOL_TLSLE48:
2235 {
2236 machine_mode mode = GET_MODE (dest);
2237 rtx tp = aarch64_load_tp (NULL);
2238
2239 if (mode != Pmode)
2240 tp = gen_lowpart (mode, tp);
2241
2242 switch (type)
2243 {
2244 case SYMBOL_TLSLE12:
2245 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2246 (dest, tp, imm));
2247 break;
2248 case SYMBOL_TLSLE24:
2249 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2250 (dest, tp, imm));
2251 break;
2252 case SYMBOL_TLSLE32:
2253 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2254 (dest, imm));
2255 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2256 (dest, dest, tp));
2257 break;
2258 case SYMBOL_TLSLE48:
2259 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2260 (dest, imm));
2261 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2262 (dest, dest, tp));
2263 break;
2264 default:
2265 gcc_unreachable ();
2266 }
2267
2268 if (REG_P (dest))
2269 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2270 return;
2271 }
2272
2273 case SYMBOL_TINY_GOT:
2274 emit_insn (gen_ldr_got_tiny (dest, imm));
2275 return;
2276
2277 case SYMBOL_TINY_TLSIE:
2278 {
2279 machine_mode mode = GET_MODE (dest);
2280 rtx tp = aarch64_load_tp (NULL);
2281
2282 if (mode == ptr_mode)
2283 {
2284 if (mode == DImode)
2285 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2286 else
2287 {
2288 tp = gen_lowpart (mode, tp);
2289 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2290 }
2291 }
2292 else
2293 {
2294 gcc_assert (mode == Pmode);
2295 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2296 }
2297
2298 if (REG_P (dest))
2299 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2300 return;
2301 }
2302
2303 default:
2304 gcc_unreachable ();
2305 }
2306 }
2307
2308 /* Emit a move from SRC to DEST. Assume that the move expanders can
2309 handle all moves if !can_create_pseudo_p (). The distinction is
2310 important because, unlike emit_move_insn, the move expanders know
2311 how to force Pmode objects into the constant pool even when the
2312 constant pool address is not itself legitimate. */
2313 static rtx
2314 aarch64_emit_move (rtx dest, rtx src)
2315 {
2316 return (can_create_pseudo_p ()
2317 ? emit_move_insn (dest, src)
2318 : emit_move_insn_1 (dest, src));
2319 }
2320
2321 /* Apply UNOPTAB to OP and store the result in DEST. */
2322
2323 static void
2324 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2325 {
2326 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2327 if (dest != tmp)
2328 emit_move_insn (dest, tmp);
2329 }
2330
2331 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2332
2333 static void
2334 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2335 {
2336 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2337 OPTAB_DIRECT);
2338 if (dest != tmp)
2339 emit_move_insn (dest, tmp);
2340 }
2341
2342 /* Split a 128-bit move operation into two 64-bit move operations,
2343 taking care to handle partial overlap of register to register
2344 copies. Special cases are needed when moving between GP regs and
2345 FP regs. SRC can be a register, constant or memory; DST a register
2346 or memory. If either operand is memory it must not have any side
2347 effects. */
2348 void
2349 aarch64_split_128bit_move (rtx dst, rtx src)
2350 {
2351 rtx dst_lo, dst_hi;
2352 rtx src_lo, src_hi;
2353
2354 machine_mode mode = GET_MODE (dst);
2355
2356 gcc_assert (mode == TImode || mode == TFmode);
2357 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2358 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2359
2360 if (REG_P (dst) && REG_P (src))
2361 {
2362 int src_regno = REGNO (src);
2363 int dst_regno = REGNO (dst);
2364
2365 /* Handle FP <-> GP regs. */
2366 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2367 {
2368 src_lo = gen_lowpart (word_mode, src);
2369 src_hi = gen_highpart (word_mode, src);
2370
2371 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2372 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2373 return;
2374 }
2375 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2376 {
2377 dst_lo = gen_lowpart (word_mode, dst);
2378 dst_hi = gen_highpart (word_mode, dst);
2379
2380 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2381 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2382 return;
2383 }
2384 }
2385
2386 dst_lo = gen_lowpart (word_mode, dst);
2387 dst_hi = gen_highpart (word_mode, dst);
2388 src_lo = gen_lowpart (word_mode, src);
2389 src_hi = gen_highpart_mode (word_mode, mode, src);
2390
2391 /* At most one pairing may overlap. */
2392 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2393 {
2394 aarch64_emit_move (dst_hi, src_hi);
2395 aarch64_emit_move (dst_lo, src_lo);
2396 }
2397 else
2398 {
2399 aarch64_emit_move (dst_lo, src_lo);
2400 aarch64_emit_move (dst_hi, src_hi);
2401 }
2402 }
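/* Ordering example (illustrative): splitting a TImode copy from
   {x0, x1} into {x1, x2} would clobber the source high part if the low
   halves were moved first, because dst_lo (x1) overlaps src_hi (x1).
   The overlap check above therefore emits the high-part move first:
   x2 = x1, then x1 = x0.  */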
2403
2404 bool
2405 aarch64_split_128bit_move_p (rtx dst, rtx src)
2406 {
2407 return (! REG_P (src)
2408 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2409 }
2410
2411 /* Split a complex SIMD combine. */
2412
2413 void
2414 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2415 {
2416 machine_mode src_mode = GET_MODE (src1);
2417 machine_mode dst_mode = GET_MODE (dst);
2418
2419 gcc_assert (VECTOR_MODE_P (dst_mode));
2420 gcc_assert (register_operand (dst, dst_mode)
2421 && register_operand (src1, src_mode)
2422 && register_operand (src2, src_mode));
2423
2424 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2425 return;
2426 }
2427
2428 /* Split a complex SIMD move. */
2429
2430 void
2431 aarch64_split_simd_move (rtx dst, rtx src)
2432 {
2433 machine_mode src_mode = GET_MODE (src);
2434 machine_mode dst_mode = GET_MODE (dst);
2435
2436 gcc_assert (VECTOR_MODE_P (dst_mode));
2437
2438 if (REG_P (dst) && REG_P (src))
2439 {
2440 gcc_assert (VECTOR_MODE_P (src_mode));
2441 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2442 }
2443 }
2444
2445 bool
2446 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2447 machine_mode ymode, rtx y)
2448 {
2449 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2450 gcc_assert (r != NULL);
2451 return rtx_equal_p (x, r);
2452 }
2453
2454
2455 static rtx
2456 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2457 {
2458 if (can_create_pseudo_p ())
2459 return force_reg (mode, value);
2460 else
2461 {
2462 gcc_assert (x);
2463 aarch64_emit_move (x, value);
2464 return x;
2465 }
2466 }
2467
2468 /* Return an all-true predicate register of mode MODE. */
2469
2470 rtx
2471 aarch64_ptrue_reg (machine_mode mode)
2472 {
2473 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2474 return force_reg (mode, CONSTM1_RTX (mode));
2475 }
2476
2477 /* Return an all-false predicate register of mode MODE. */
2478
2479 rtx
2480 aarch64_pfalse_reg (machine_mode mode)
2481 {
2482 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2483 return force_reg (mode, CONST0_RTX (mode));
2484 }
2485
2486 /* Return true if we can move VALUE into a register using a single
2487 CNT[BHWD] instruction. */
2488
2489 static bool
2490 aarch64_sve_cnt_immediate_p (poly_int64 value)
2491 {
2492 HOST_WIDE_INT factor = value.coeffs[0];
2493 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2494 return (value.coeffs[1] == factor
2495 && IN_RANGE (factor, 2, 16 * 16)
2496 && (factor & 1) == 0
2497 && factor <= 16 * (factor & -factor));
2498 }
2499
2500 /* Likewise for rtx X. */
2501
2502 bool
2503 aarch64_sve_cnt_immediate_p (rtx x)
2504 {
2505 poly_int64 value;
2506 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2507 }
2508
2509 /* Return the asm string for an instruction with a CNT-like vector size
2510 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2511 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2512 first part of the operands template (the part that comes before the
2513 vector size itself). FACTOR is the number of quadwords.
2514 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2515 If it is zero, we can use any element size. */
2516
2517 static char *
2518 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2519 unsigned int factor,
2520 unsigned int nelts_per_vq)
2521 {
2522 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2523
2524 if (nelts_per_vq == 0)
2525 /* There is some overlap in the ranges of the four CNT instructions.
2526 Here we always use the smallest possible element size, so that the
2527 multiplier is 1 wherever possible. */
2528 nelts_per_vq = factor & -factor;
2529 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2530 gcc_assert (IN_RANGE (shift, 1, 4));
2531 char suffix = "dwhb"[shift - 1];
2532
2533 factor >>= shift;
2534 unsigned int written;
2535 if (factor == 1)
2536 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2537 prefix, suffix, operands);
2538 else
2539 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2540 prefix, suffix, operands, factor);
2541 gcc_assert (written < sizeof (buffer));
2542 return buffer;
2543 }
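/* Worked example (illustrative): FACTOR = 8 with NELTS_PER_VQ = 0
   defaults to NELTS_PER_VQ = 8 (halfwords per quadword), giving
   SHIFT = 3, suffix 'h' and a residual factor of 1, so the output is
   "cnth\t<operands>".  FACTOR = 32 defaults to NELTS_PER_VQ = 32 but
   SHIFT is capped at 4, giving suffix 'b' and a residual factor of 2,
   i.e. "cntb\t<operands>, all, mul #2".  */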
2544
2545 /* Return the asm string for an instruction with a CNT-like vector size
2546 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2547 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2548 first part of the operands template (the part that comes before the
2549 vector size itself). X is the value of the vector size operand,
2550 as a polynomial integer rtx. */
2551
2552 char *
2553 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2554 rtx x)
2555 {
2556 poly_int64 value = rtx_to_poly_int64 (x);
2557 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2558 return aarch64_output_sve_cnt_immediate (prefix, operands,
2559 value.coeffs[1], 0);
2560 }
2561
2562 /* Return true if we can add VALUE to a register using a single ADDVL
2563 or ADDPL instruction. */
2564
2565 static bool
2566 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2567 {
2568 HOST_WIDE_INT factor = value.coeffs[0];
2569 if (factor == 0 || value.coeffs[1] != factor)
2570 return false;
2571 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2572 and a value of 16 is one vector width. */
2573 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2574 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2575 }
2576
2577 /* Likewise for rtx X. */
2578
2579 bool
2580 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2581 {
2582 poly_int64 value;
2583 return (poly_int_rtx_p (x, &value)
2584 && aarch64_sve_addvl_addpl_immediate_p (value));
2585 }
2586
2587 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2588 and storing the result in operand 0. */
2589
2590 char *
2591 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2592 {
2593 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2594 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2595 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2596
2597 /* Use INC or DEC if possible. */
2598 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2599 {
2600 if (aarch64_sve_cnt_immediate_p (offset_value))
2601 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2602 offset_value.coeffs[1], 0);
2603 if (aarch64_sve_cnt_immediate_p (-offset_value))
2604 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2605 -offset_value.coeffs[1], 0);
2606 }
2607
2608 int factor = offset_value.coeffs[1];
2609 if ((factor & 15) == 0)
2610 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2611 else
2612 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2613 return buffer;
2614 }
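/* Worked example (illustrative, assuming the INC/DEC shortcut above is
   not taken): an offset with coefficients (48, 48) has FACTOR = 48,
   a multiple of 16, and is printed as "addvl %x0, %x1, #3", while
   coefficients (6, 6) give FACTOR = 6 and "addpl %x0, %x1, #3".  */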
2615
2616 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2617 instruction. If it is, store the number of elements in each vector
2618 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2619 factor in *FACTOR_OUT (if nonnull). */
2620
2621 bool
2622 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2623 unsigned int *nelts_per_vq_out)
2624 {
2625 rtx elt;
2626 poly_int64 value;
2627
2628 if (!const_vec_duplicate_p (x, &elt)
2629 || !poly_int_rtx_p (elt, &value))
2630 return false;
2631
2632 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2633 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2634 /* There's no vector INCB. */
2635 return false;
2636
2637 HOST_WIDE_INT factor = value.coeffs[0];
2638 if (value.coeffs[1] != factor)
2639 return false;
2640
2641 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2642 if ((factor % nelts_per_vq) != 0
2643 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2644 return false;
2645
2646 if (factor_out)
2647 *factor_out = factor;
2648 if (nelts_per_vq_out)
2649 *nelts_per_vq_out = nelts_per_vq;
2650 return true;
2651 }
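/* Worked example (illustrative): for a VNx4SI duplicate of the
   poly_int value (16, 16), NELTS_PER_VQ is 128 / 32 = 4 and FACTOR
   is 16, which is a multiple of 4 and within [4, 64], so the constant
   is accepted; aarch64_output_sve_inc_dec_immediate below prints it
   as "incw" with "mul #4".  */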
2652
2653 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2654 instruction. */
2655
2656 bool
2657 aarch64_sve_inc_dec_immediate_p (rtx x)
2658 {
2659 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2660 }
2661
2662 /* Return the asm template for an SVE vector INC or DEC instruction.
2663 OPERANDS gives the operands before the vector count and X is the
2664 value of the vector count operand itself. */
2665
2666 char *
2667 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2668 {
2669 int factor;
2670 unsigned int nelts_per_vq;
2671 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2672 gcc_unreachable ();
2673 if (factor < 0)
2674 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2675 nelts_per_vq);
2676 else
2677 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2678 nelts_per_vq);
2679 }
2680
2681 static int
2682 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2683 scalar_int_mode mode)
2684 {
2685 int i;
2686 unsigned HOST_WIDE_INT val, val2, mask;
2687 int one_match, zero_match;
2688 int num_insns;
2689
2690 val = INTVAL (imm);
2691
2692 if (aarch64_move_imm (val, mode))
2693 {
2694 if (generate)
2695 emit_insn (gen_rtx_SET (dest, imm));
2696 return 1;
2697 }
2698
2699 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2700 (with XXXX non-zero). In that case check to see if the move can be done in
2701 a smaller mode. */
2702 val2 = val & 0xffffffff;
2703 if (mode == DImode
2704 && aarch64_move_imm (val2, SImode)
2705 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2706 {
2707 if (generate)
2708 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2709
2710 /* Check if we have to emit a second instruction by checking to see
2711 if any of the upper 32 bits of the original DI mode value is set. */
2712 if (val == val2)
2713 return 1;
2714
2715 i = (val >> 48) ? 48 : 32;
2716
2717 if (generate)
2718 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2719 GEN_INT ((val >> i) & 0xffff)));
2720
2721 return 2;
2722 }
2723
2724 if ((val >> 32) == 0 || mode == SImode)
2725 {
2726 if (generate)
2727 {
2728 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2729 if (mode == SImode)
2730 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2731 GEN_INT ((val >> 16) & 0xffff)));
2732 else
2733 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2734 GEN_INT ((val >> 16) & 0xffff)));
2735 }
2736 return 2;
2737 }
2738
2739 /* Remaining cases are all for DImode. */
2740
2741 mask = 0xffff;
2742 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2743 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2744 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2745 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2746
2747 if (zero_match != 2 && one_match != 2)
2748 {
2749 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2750 For a 64-bit bitmask try whether changing 16 bits to all ones or
2751 zeroes creates a valid bitmask. To check any repeated bitmask,
2752 try using 16 bits from the other 32-bit half of val. */
2753
2754 for (i = 0; i < 64; i += 16, mask <<= 16)
2755 {
2756 val2 = val & ~mask;
2757 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2758 break;
2759 val2 = val | mask;
2760 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2761 break;
2762 val2 = val2 & ~mask;
2763 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2764 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2765 break;
2766 }
2767 if (i != 64)
2768 {
2769 if (generate)
2770 {
2771 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2772 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2773 GEN_INT ((val >> i) & 0xffff)));
2774 }
2775 return 2;
2776 }
2777 }
2778
2779 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2780 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2781 otherwise skip zero bits. */
2782
2783 num_insns = 1;
2784 mask = 0xffff;
2785 val2 = one_match > zero_match ? ~val : val;
2786 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2787
2788 if (generate)
2789 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2790 ? (val | ~(mask << i))
2791 : (val & (mask << i)))));
2792 for (i += 16; i < 64; i += 16)
2793 {
2794 if ((val2 & (mask << i)) == 0)
2795 continue;
2796 if (generate)
2797 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2798 GEN_INT ((val >> i) & 0xffff)));
2799 num_insns ++;
2800 }
2801
2802 return num_insns;
2803 }
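/* Worked example (illustrative, not exhaustive): for the DImode value
   0x1234000056780000, the low 32 bits 0x56780000 are a valid SImode
   move immediate and bits [47:32] are zero, so the early path above
   emits two instructions, roughly:

     mov  dest, #0x56780000
     movk dest, #0x1234, lsl #48

   Values with no such structure fall through to the final loop, which
   emits a MOV followed by up to three MOVKs.  */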
2804
2805 /* Return whether imm is a 128-bit immediate which is simple enough to
2806 expand inline. */
2807 bool
2808 aarch64_mov128_immediate (rtx imm)
2809 {
2810 if (GET_CODE (imm) == CONST_INT)
2811 return true;
2812
2813 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2814
2815 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2816 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2817
2818 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2819 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2820 }
2821
2822
2823 /* Return the number of temporary registers that aarch64_add_offset_1
2824 would need to add OFFSET to a register. */
2825
2826 static unsigned int
2827 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2828 {
2829 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2830 }
2831
2832 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2833 a non-polynomial OFFSET. MODE is the mode of the addition.
2834 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2835 be set and CFA adjustments added to the generated instructions.
2836
2837 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2838 temporary if register allocation is already complete. This temporary
2839 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2840 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2841 the immediate again.
2842
2843 Since this function may be used to adjust the stack pointer, we must
2844 ensure that it cannot cause transient stack deallocation (for example
2845 by first incrementing SP and then decrementing when adjusting by a
2846 large immediate). */
2847
2848 static void
2849 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2850 rtx src, HOST_WIDE_INT offset, rtx temp1,
2851 bool frame_related_p, bool emit_move_imm)
2852 {
2853 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2854 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2855
2856 HOST_WIDE_INT moffset = abs_hwi (offset);
2857 rtx_insn *insn;
2858
2859 if (!moffset)
2860 {
2861 if (!rtx_equal_p (dest, src))
2862 {
2863 insn = emit_insn (gen_rtx_SET (dest, src));
2864 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2865 }
2866 return;
2867 }
2868
2869 /* Single instruction adjustment. */
2870 if (aarch64_uimm12_shift (moffset))
2871 {
2872 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2873 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2874 return;
2875 }
2876
2877 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2878 and either:
2879
2880 a) the offset cannot be loaded by a 16-bit move or
2881 b) there is no spare register into which we can move it. */
2882 if (moffset < 0x1000000
2883 && ((!temp1 && !can_create_pseudo_p ())
2884 || !aarch64_move_imm (moffset, mode)))
2885 {
2886 HOST_WIDE_INT low_off = moffset & 0xfff;
2887
2888 low_off = offset < 0 ? -low_off : low_off;
2889 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2890 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2891 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2892 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2893 return;
2894 }
2895
2896 /* Emit a move immediate if required and an addition/subtraction. */
2897 if (emit_move_imm)
2898 {
2899 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2900 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2901 }
2902 insn = emit_insn (offset < 0
2903 ? gen_sub3_insn (dest, src, temp1)
2904 : gen_add3_insn (dest, src, temp1));
2905 if (frame_related_p)
2906 {
2907 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2908 rtx adj = plus_constant (mode, src, offset);
2909 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2910 }
2911 }
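/* Worked example (illustrative): adding 0x123456 to the stack pointer
   with no scratch register available takes the two-addition path
   above, since the offset fits in 24 bits but is not a (possibly
   shifted) 12-bit immediate:

     add sp, sp, #0x456
     add sp, sp, #0x123000

   Both steps move SP in the same direction, so there is no transient
   deallocation.  */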
2912
2913 /* Return the number of temporary registers that aarch64_add_offset
2914 would need to move OFFSET into a register or add OFFSET to a register;
2915 ADD_P is true if we want the latter rather than the former. */
2916
2917 static unsigned int
2918 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2919 {
2920 /* This follows the same structure as aarch64_add_offset. */
2921 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2922 return 0;
2923
2924 unsigned int count = 0;
2925 HOST_WIDE_INT factor = offset.coeffs[1];
2926 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2927 poly_int64 poly_offset (factor, factor);
2928 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2929 /* Need one register for the ADDVL/ADDPL result. */
2930 count += 1;
2931 else if (factor != 0)
2932 {
2933 factor = abs (factor);
2934 if (factor > 16 * (factor & -factor))
2935 /* Need one register for the CNT result and one for the multiplication
2936 factor. If necessary, the second temporary can be reused for the
2937 constant part of the offset. */
2938 return 2;
2939 /* Need one register for the CNT result (which might then
2940 be shifted). */
2941 count += 1;
2942 }
2943 return count + aarch64_add_offset_1_temporaries (constant);
2944 }
2945
2946 /* If X can be represented as a poly_int64, return the number
2947 of temporaries that are required to add it to a register.
2948 Return -1 otherwise. */
2949
2950 int
2951 aarch64_add_offset_temporaries (rtx x)
2952 {
2953 poly_int64 offset;
2954 if (!poly_int_rtx_p (x, &offset))
2955 return -1;
2956 return aarch64_offset_temporaries (true, offset);
2957 }
2958
2959 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2960 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2961 be set and CFA adjustments added to the generated instructions.
2962
2963 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2964 temporary if register allocation is already complete. This temporary
2965 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2966 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2967 false to avoid emitting the immediate again.
2968
2969 TEMP2, if nonnull, is a second temporary register that doesn't
2970 overlap either DEST or REG.
2971
2972 Since this function may be used to adjust the stack pointer, we must
2973 ensure that it cannot cause transient stack deallocation (for example
2974 by first incrementing SP and then decrementing when adjusting by a
2975 large immediate). */
2976
2977 static void
2978 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2979 poly_int64 offset, rtx temp1, rtx temp2,
2980 bool frame_related_p, bool emit_move_imm = true)
2981 {
2982 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2983 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2984 gcc_assert (temp1 == NULL_RTX
2985 || !frame_related_p
2986 || !reg_overlap_mentioned_p (temp1, dest));
2987 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2988
2989 /* Try using ADDVL or ADDPL to add the whole value. */
2990 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2991 {
2992 rtx offset_rtx = gen_int_mode (offset, mode);
2993 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2994 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2995 return;
2996 }
2997
2998 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2999 SVE vector register, over and above the minimum size of 128 bits.
3000 This is equivalent to half the value returned by CNTD with a
3001 vector shape of ALL. */
3002 HOST_WIDE_INT factor = offset.coeffs[1];
3003 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3004
3005 /* Try using ADDVL or ADDPL to add the VG-based part. */
3006 poly_int64 poly_offset (factor, factor);
3007 if (src != const0_rtx
3008 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3009 {
3010 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3011 if (frame_related_p)
3012 {
3013 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3014 RTX_FRAME_RELATED_P (insn) = true;
3015 src = dest;
3016 }
3017 else
3018 {
3019 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3020 src = aarch64_force_temporary (mode, temp1, addr);
3021 temp1 = temp2;
3022 temp2 = NULL_RTX;
3023 }
3024 }
3025 /* Otherwise use a CNT-based sequence. */
3026 else if (factor != 0)
3027 {
3028 /* Use a subtraction if we have a negative factor. */
3029 rtx_code code = PLUS;
3030 if (factor < 0)
3031 {
3032 factor = -factor;
3033 code = MINUS;
3034 }
3035
3036 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3037 into the multiplication. */
3038 rtx val;
3039 int shift = 0;
3040 if (factor & 1)
3041 /* Use a right shift by 1. */
3042 shift = -1;
3043 else
3044 factor /= 2;
3045 HOST_WIDE_INT low_bit = factor & -factor;
3046 if (factor <= 16 * low_bit)
3047 {
3048 if (factor > 16 * 8)
3049 {
3050 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3051 the value with the minimum multiplier and shift it into
3052 position. */
3053 int extra_shift = exact_log2 (low_bit);
3054 shift += extra_shift;
3055 factor >>= extra_shift;
3056 }
3057 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3058 }
3059 else
3060 {
3061 /* Use CNTD, then multiply it by FACTOR. */
3062 val = gen_int_mode (poly_int64 (2, 2), mode);
3063 val = aarch64_force_temporary (mode, temp1, val);
3064
3065 /* Go back to using a negative multiplication factor if we have
3066 no register from which to subtract. */
3067 if (code == MINUS && src == const0_rtx)
3068 {
3069 factor = -factor;
3070 code = PLUS;
3071 }
3072 rtx coeff1 = gen_int_mode (factor, mode);
3073 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3074 val = gen_rtx_MULT (mode, val, coeff1);
3075 }
3076
3077 if (shift > 0)
3078 {
3079 /* Multiply by 1 << SHIFT. */
3080 val = aarch64_force_temporary (mode, temp1, val);
3081 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3082 }
3083 else if (shift == -1)
3084 {
3085 /* Divide by 2. */
3086 val = aarch64_force_temporary (mode, temp1, val);
3087 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3088 }
3089
3090 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3091 if (src != const0_rtx)
3092 {
3093 val = aarch64_force_temporary (mode, temp1, val);
3094 val = gen_rtx_fmt_ee (code, mode, src, val);
3095 }
3096 else if (code == MINUS)
3097 {
3098 val = aarch64_force_temporary (mode, temp1, val);
3099 val = gen_rtx_NEG (mode, val);
3100 }
3101
3102 if (constant == 0 || frame_related_p)
3103 {
3104 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3105 if (frame_related_p)
3106 {
3107 RTX_FRAME_RELATED_P (insn) = true;
3108 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3109 gen_rtx_SET (dest, plus_constant (Pmode, src,
3110 poly_offset)));
3111 }
3112 src = dest;
3113 if (constant == 0)
3114 return;
3115 }
3116 else
3117 {
3118 src = aarch64_force_temporary (mode, temp1, val);
3119 temp1 = temp2;
3120 temp2 = NULL_RTX;
3121 }
3122
3123 emit_move_imm = true;
3124 }
3125
3126 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3127 frame_related_p, emit_move_imm);
3128 }
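/* Worked example (an illustrative sketch; the exact temporaries used
   depend on the caller): for OFFSET equal to one SVE vector length
   plus 8 bytes, the coefficients are (24, 16), so FACTOR = 16 and
   CONSTANT = 8.  The VG-based part is a valid ADDVL immediate and the
   remainder is a simple addition, giving roughly:

     addvl dest, src, #1
     add   dest, dest, #8  */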
3129
3130 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3131 than a poly_int64. */
3132
3133 void
3134 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3135 rtx offset_rtx, rtx temp1, rtx temp2)
3136 {
3137 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3138 temp1, temp2, false);
3139 }
3140
3141 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3142 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3143 if TEMP1 already contains abs (DELTA). */
3144
3145 static inline void
3146 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3147 {
3148 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3149 temp1, temp2, true, emit_move_imm);
3150 }
3151
3152 /* Subtract DELTA from the stack pointer, marking the instructions
3153 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3154 if nonnull. */
3155
3156 static inline void
3157 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3158 bool emit_move_imm = true)
3159 {
3160 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3161 temp1, temp2, frame_related_p, emit_move_imm);
3162 }
3163
3164 /* Set DEST to (vec_series BASE STEP). */
3165
3166 static void
3167 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3168 {
3169 machine_mode mode = GET_MODE (dest);
3170 scalar_mode inner = GET_MODE_INNER (mode);
3171
3172 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3173 if (!aarch64_sve_index_immediate_p (base))
3174 base = force_reg (inner, base);
3175 if (!aarch64_sve_index_immediate_p (step))
3176 step = force_reg (inner, step);
3177
3178 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3179 }
3180
3181 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3182 integer of mode SRC_MODE. Return true on success. */
3183
3184 static bool
3185 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3186 rtx src)
3187 {
3188 /* If the constant is smaller than 128 bits, we can do the move
3189 using a vector of SRC_MODEs. */
3190 if (src_mode != TImode)
3191 {
3192 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3193 GET_MODE_SIZE (src_mode));
3194 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3195 emit_move_insn (gen_lowpart (dup_mode, dest),
3196 gen_const_vec_duplicate (dup_mode, src));
3197 return true;
3198 }
3199
3200 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
3201 src = force_const_mem (src_mode, src);
3202 if (!src)
3203 return false;
3204
3205 /* Make sure that the address is legitimate. */
3206 if (!aarch64_sve_ld1r_operand_p (src))
3207 {
3208 rtx addr = force_reg (Pmode, XEXP (src, 0));
3209 src = replace_equiv_address (src, addr);
3210 }
3211
3212 machine_mode mode = GET_MODE (dest);
3213 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3214 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3215 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3216 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3217 emit_insn (gen_rtx_SET (dest, src));
3218 return true;
3219 }
3220
3221 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3222 isn't a simple duplicate or series. */
3223
3224 static void
3225 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3226 {
3227 machine_mode mode = GET_MODE (src);
3228 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3229 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3230 gcc_assert (npatterns > 1);
3231
3232 if (nelts_per_pattern == 1)
3233 {
3234 /* The constant is a repeating sequence of at least two elements,
3235 where the repeating elements occupy no more than 128 bits.
3236 Get an integer representation of the replicated value. */
3237 scalar_int_mode int_mode;
3238 if (BYTES_BIG_ENDIAN)
3239 /* For now, always use LD1RQ to load the value on big-endian
3240 targets, since the handling of smaller integers includes a
3241 subreg that is semantically an element reverse. */
3242 int_mode = TImode;
3243 else
3244 {
3245 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3246 gcc_assert (int_bits <= 128);
3247 int_mode = int_mode_for_size (int_bits, 0).require ();
3248 }
3249 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3250 if (int_value
3251 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3252 return;
3253 }
3254
3255 /* Expand each pattern individually. */
3256 rtx_vector_builder builder;
3257 auto_vec<rtx, 16> vectors (npatterns);
3258 for (unsigned int i = 0; i < npatterns; ++i)
3259 {
3260 builder.new_vector (mode, 1, nelts_per_pattern);
3261 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3262 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3263 vectors.quick_push (force_reg (mode, builder.build ()));
3264 }
3265
3266 /* Use permutes to interleave the separate vectors. */
3267 while (npatterns > 1)
3268 {
3269 npatterns /= 2;
3270 for (unsigned int i = 0; i < npatterns; ++i)
3271 {
3272 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3273 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3274 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3275 vectors[i] = tmp;
3276 }
3277 }
3278 gcc_assert (vectors[0] == dest);
3279 }
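/* Interleaving example (illustrative): with NPATTERNS = 4 and
   per-pattern vectors V0..V3, the loop above first forms
   ZIP1 (V0, V2) and ZIP1 (V1, V3), then zips those two results into
   DEST, so that element J of pattern P ends up in lane 4 * J + P,
   which is the element order of the original constant.  */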
3280
3281 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
3282 is a pattern that can be used to set DEST to a replicated scalar
3283 element. */
3284
3285 void
3286 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3287 rtx (*gen_vec_duplicate) (rtx, rtx))
3288 {
3289 machine_mode mode = GET_MODE (dest);
3290
3291 /* Check on what type of symbol it is. */
3292 scalar_int_mode int_mode;
3293 if ((GET_CODE (imm) == SYMBOL_REF
3294 || GET_CODE (imm) == LABEL_REF
3295 || GET_CODE (imm) == CONST
3296 || GET_CODE (imm) == CONST_POLY_INT)
3297 && is_a <scalar_int_mode> (mode, &int_mode))
3298 {
3299 rtx mem;
3300 poly_int64 offset;
3301 HOST_WIDE_INT const_offset;
3302 enum aarch64_symbol_type sty;
3303
3304 /* If we have (const (plus symbol offset)), separate out the offset
3305 before we start classifying the symbol. */
3306 rtx base = strip_offset (imm, &offset);
3307
3308 /* We must always add an offset involving VL separately, rather than
3309 folding it into the relocation. */
3310 if (!offset.is_constant (&const_offset))
3311 {
3312 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3313 emit_insn (gen_rtx_SET (dest, imm));
3314 else
3315 {
3316 /* Do arithmetic on 32-bit values if the result is smaller
3317 than that. */
3318 if (partial_subreg_p (int_mode, SImode))
3319 {
3320 /* It is invalid to do symbol calculations in modes
3321 narrower than SImode. */
3322 gcc_assert (base == const0_rtx);
3323 dest = gen_lowpart (SImode, dest);
3324 int_mode = SImode;
3325 }
3326 if (base != const0_rtx)
3327 {
3328 base = aarch64_force_temporary (int_mode, dest, base);
3329 aarch64_add_offset (int_mode, dest, base, offset,
3330 NULL_RTX, NULL_RTX, false);
3331 }
3332 else
3333 aarch64_add_offset (int_mode, dest, base, offset,
3334 dest, NULL_RTX, false);
3335 }
3336 return;
3337 }
3338
3339 sty = aarch64_classify_symbol (base, const_offset);
3340 switch (sty)
3341 {
3342 case SYMBOL_FORCE_TO_MEM:
3343 if (const_offset != 0
3344 && targetm.cannot_force_const_mem (int_mode, imm))
3345 {
3346 gcc_assert (can_create_pseudo_p ());
3347 base = aarch64_force_temporary (int_mode, dest, base);
3348 aarch64_add_offset (int_mode, dest, base, const_offset,
3349 NULL_RTX, NULL_RTX, false);
3350 return;
3351 }
3352
3353 mem = force_const_mem (ptr_mode, imm);
3354 gcc_assert (mem);
3355
3356 /* If we aren't generating PC relative literals, then
3357 we need to expand the literal pool access carefully.
3358 This is something that needs to be done in a number
3359 of places, so could well live as a separate function. */
3360 if (!aarch64_pcrelative_literal_loads)
3361 {
3362 gcc_assert (can_create_pseudo_p ());
3363 base = gen_reg_rtx (ptr_mode);
3364 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3365 if (ptr_mode != Pmode)
3366 base = convert_memory_address (Pmode, base);
3367 mem = gen_rtx_MEM (ptr_mode, base);
3368 }
3369
3370 if (int_mode != ptr_mode)
3371 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3372
3373 emit_insn (gen_rtx_SET (dest, mem));
3374
3375 return;
3376
3377 case SYMBOL_SMALL_TLSGD:
3378 case SYMBOL_SMALL_TLSDESC:
3379 case SYMBOL_SMALL_TLSIE:
3380 case SYMBOL_SMALL_GOT_28K:
3381 case SYMBOL_SMALL_GOT_4G:
3382 case SYMBOL_TINY_GOT:
3383 case SYMBOL_TINY_TLSIE:
3384 if (const_offset != 0)
3385 {
3386 gcc_assert(can_create_pseudo_p ());
3387 base = aarch64_force_temporary (int_mode, dest, base);
3388 aarch64_add_offset (int_mode, dest, base, const_offset,
3389 NULL_RTX, NULL_RTX, false);
3390 return;
3391 }
3392 /* FALLTHRU */
3393
3394 case SYMBOL_SMALL_ABSOLUTE:
3395 case SYMBOL_TINY_ABSOLUTE:
3396 case SYMBOL_TLSLE12:
3397 case SYMBOL_TLSLE24:
3398 case SYMBOL_TLSLE32:
3399 case SYMBOL_TLSLE48:
3400 aarch64_load_symref_appropriately (dest, imm, sty);
3401 return;
3402
3403 default:
3404 gcc_unreachable ();
3405 }
3406 }
3407
3408 if (!CONST_INT_P (imm))
3409 {
3410 rtx base, step, value;
3411 if (GET_CODE (imm) == HIGH
3412 || aarch64_simd_valid_immediate (imm, NULL))
3413 emit_insn (gen_rtx_SET (dest, imm));
3414 else if (const_vec_series_p (imm, &base, &step))
3415 aarch64_expand_vec_series (dest, base, step);
3416 else if (const_vec_duplicate_p (imm, &value))
3417 {
3418 /* If the constant is out of range of an SVE vector move,
3419 load it from memory if we can, otherwise move it into
3420 a register and use a DUP. */
3421 scalar_mode inner_mode = GET_MODE_INNER (mode);
3422 rtx op = force_const_mem (inner_mode, value);
3423 if (!op)
3424 op = force_reg (inner_mode, value);
3425 else if (!aarch64_sve_ld1r_operand_p (op))
3426 {
3427 rtx addr = force_reg (Pmode, XEXP (op, 0));
3428 op = replace_equiv_address (op, addr);
3429 }
3430 emit_insn (gen_vec_duplicate (dest, op));
3431 }
3432 else if (GET_CODE (imm) == CONST_VECTOR
3433 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3434 aarch64_expand_sve_const_vector (dest, imm);
3435 else
3436 {
3437 rtx mem = force_const_mem (mode, imm);
3438 gcc_assert (mem);
3439 emit_move_insn (dest, mem);
3440 }
3441
3442 return;
3443 }
3444
3445 aarch64_internal_mov_immediate (dest, imm, true,
3446 as_a <scalar_int_mode> (mode));
3447 }
3448
3449 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3450 that is known to contain PTRUE. */
3451
3452 void
3453 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3454 {
3455 expand_operand ops[3];
3456 machine_mode mode = GET_MODE (dest);
3457 create_output_operand (&ops[0], dest, mode);
3458 create_input_operand (&ops[1], pred, GET_MODE(pred));
3459 create_input_operand (&ops[2], src, mode);
3460 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3461 }
3462
3463 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3464 operand is in memory. In this case we need to use the predicated LD1
3465 and ST1 instead of LDR and STR, both for correctness on big-endian
3466 targets and because LD1 and ST1 support a wider range of addressing modes.
3467 PRED_MODE is the mode of the predicate.
3468
3469 See the comment at the head of aarch64-sve.md for details about the
3470 big-endian handling. */
3471
3472 void
3473 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3474 {
3475 machine_mode mode = GET_MODE (dest);
3476 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3477 if (!register_operand (src, mode)
3478 && !register_operand (dest, mode))
3479 {
3480 rtx tmp = gen_reg_rtx (mode);
3481 if (MEM_P (src))
3482 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3483 else
3484 emit_move_insn (tmp, src);
3485 src = tmp;
3486 }
3487 aarch64_emit_sve_pred_move (dest, ptrue, src);
3488 }
3489
3490 /* Called only on big-endian targets. See whether an SVE vector move
3491 from SRC to DEST is effectively a REV[BHW] instruction, because at
3492 least one operand is a subreg of an SVE vector that has wider or
3493 narrower elements. Return true and emit the instruction if so.
3494
3495 For example:
3496
3497 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3498
3499 represents a VIEW_CONVERT between the following vectors, viewed
3500 in memory order:
3501
3502 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3503 R1: { [0], [1], [2], [3], ... }
3504
3505 The high part of lane X in R2 should therefore correspond to lane X*2
3506 of R1, but the register representations are:
3507
3508 msb lsb
3509 R2: ...... [1].high [1].low [0].high [0].low
3510 R1: ...... [3] [2] [1] [0]
3511
3512 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3513 We therefore need a reverse operation to swap the high and low values
3514 around.
3515
3516 This is purely an optimization. Without it we would spill the
3517 subreg operand to the stack in one mode and reload it in the
3518 other mode, which has the same effect as the REV. */
3519
3520 bool
3521 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3522 {
3523 gcc_assert (BYTES_BIG_ENDIAN);
3524 if (GET_CODE (dest) == SUBREG)
3525 dest = SUBREG_REG (dest);
3526 if (GET_CODE (src) == SUBREG)
3527 src = SUBREG_REG (src);
3528
3529 /* The optimization handles two single SVE REGs with different element
3530 sizes. */
3531 if (!REG_P (dest)
3532 || !REG_P (src)
3533 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3534 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3535 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3536 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3537 return false;
3538
3539 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3540 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
3541 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3542 UNSPEC_REV_SUBREG);
3543 emit_insn (gen_rtx_SET (dest, unspec));
3544 return true;
3545 }
3546
3547 /* Return a copy of X with mode MODE, without changing its other
3548 attributes. Unlike gen_lowpart, this doesn't care whether the
3549 mode change is valid. */
3550
3551 static rtx
3552 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3553 {
3554 if (GET_MODE (x) == mode)
3555 return x;
3556
3557 x = shallow_copy_rtx (x);
3558 set_mode_and_regno (x, mode, REGNO (x));
3559 return x;
3560 }
3561
3562 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3563 operands. */
3564
3565 void
3566 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3567 {
3568 /* Decide which REV operation we need. The mode with narrower elements
3569 determines the mode of the operands and the mode with the wider
3570 elements determines the reverse width. */
3571 machine_mode mode_with_wider_elts = GET_MODE (dest);
3572 machine_mode mode_with_narrower_elts = GET_MODE (src);
3573 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3574 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3575 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3576
3577 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3578 unsigned int unspec;
3579 if (wider_bytes == 8)
3580 unspec = UNSPEC_REV64;
3581 else if (wider_bytes == 4)
3582 unspec = UNSPEC_REV32;
3583 else if (wider_bytes == 2)
3584 unspec = UNSPEC_REV16;
3585 else
3586 gcc_unreachable ();
3587 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3588
3589 /* Emit:
3590
3591 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3592 UNSPEC_MERGE_PTRUE))
3593
3594 with the appropriate modes. */
3595 ptrue = gen_lowpart (pred_mode, ptrue);
3596 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3597 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3598 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3599 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3600 UNSPEC_MERGE_PTRUE);
3601 emit_insn (gen_rtx_SET (dest, src));
3602 }
3603
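/* As a concrete illustration of the splitter above (a sketch, assuming
   big-endian and the mode names used earlier in this file): for

     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))

   the narrower-element mode is VNx16QI and the wider-element mode is
   VNx8HI, so wider_bytes == 2 selects UNSPEC_REV16 and PRED_MODE becomes
   the predicate mode for 2-byte elements.  Both operands are recast to
   VNx16QI and the emitted insn has the shape

     (set (reg:VNx16QI R1)
          (unspec:VNx16QI
            [(reg ptrue)
             (unspec:VNx16QI [(reg:VNx16QI R2)] UNSPEC_REV16)]
            UNSPEC_MERGE_PTRUE))

   i.e. a predicated byte reverse within each halfword.  */
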
3604 static bool
3605 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3606 tree exp ATTRIBUTE_UNUSED)
3607 {
3608 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3609 return false;
3610
3611 return true;
3612 }
3613
3614 /* Implement TARGET_PASS_BY_REFERENCE. */
3615
3616 static bool
3617 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3618 machine_mode mode,
3619 const_tree type,
3620 bool named ATTRIBUTE_UNUSED)
3621 {
3622 HOST_WIDE_INT size;
3623 machine_mode dummymode;
3624 int nregs;
3625
3626 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3627 if (mode == BLKmode && type)
3628 size = int_size_in_bytes (type);
3629 else
3630 /* No frontends can create types with variable-sized modes, so we
3631 shouldn't be asked to pass or return them. */
3632 size = GET_MODE_SIZE (mode).to_constant ();
3633
3634 /* Aggregates are passed by reference based on their size. */
3635 if (type && AGGREGATE_TYPE_P (type))
3636 {
3637 size = int_size_in_bytes (type);
3638 }
3639
3640 /* Variable sized arguments are always passed by reference. */
3641 if (size < 0)
3642 return true;
3643
3644 /* Can this be a candidate to be passed in fp/simd register(s)? */
3645 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3646 &dummymode, &nregs,
3647 NULL))
3648 return false;
3649
3650 /* Arguments which are variable sized or larger than 2 registers are
3651 passed by reference unless they are a homogeneous floating-point
3652 aggregate. */
3653 return size > 2 * UNITS_PER_WORD;
3654 }
3655
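/* With UNITS_PER_WORD == 8 the size test above amounts to a 16-byte
   cutoff: a plain 24-byte structure, for example, is passed by reference,
   whereas a homogeneous floating-point aggregate such as (illustrative)

     struct hfa { double a, b, c, d; };

   is 32 bytes but is caught by the call-or-return-candidate check first
   and is therefore still passed in SIMD/FP registers, one per member.  */
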
3656 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3657 static bool
3658 aarch64_return_in_msb (const_tree valtype)
3659 {
3660 machine_mode dummy_mode;
3661 int dummy_int;
3662
3663 /* Never happens in little-endian mode. */
3664 if (!BYTES_BIG_ENDIAN)
3665 return false;
3666
3667 /* Only composite types smaller than or equal to 16 bytes can
3668 be potentially returned in registers. */
3669 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3670 || int_size_in_bytes (valtype) <= 0
3671 || int_size_in_bytes (valtype) > 16)
3672 return false;
3673
3674 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3675 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3676 is always passed/returned in the least significant bits of fp/simd
3677 register(s). */
3678 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3679 &dummy_mode, &dummy_int, NULL))
3680 return false;
3681
3682 return true;
3683 }
3684
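/* In other words, on big-endian targets a small composite return value
   that is not an HFA/HVA -- say a 3-byte structure -- is placed so that
   its memory image occupies the most significant bytes of x0;
   aarch64_function_value below widens the mode to a whole number of
   words for exactly this case.  */
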
3685 /* Implement TARGET_FUNCTION_VALUE.
3686 Define how to find the value returned by a function. */
3687
3688 static rtx
3689 aarch64_function_value (const_tree type, const_tree func,
3690 bool outgoing ATTRIBUTE_UNUSED)
3691 {
3692 machine_mode mode;
3693 int unsignedp;
3694 int count;
3695 machine_mode ag_mode;
3696
3697 mode = TYPE_MODE (type);
3698 if (INTEGRAL_TYPE_P (type))
3699 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3700
3701 if (aarch64_return_in_msb (type))
3702 {
3703 HOST_WIDE_INT size = int_size_in_bytes (type);
3704
3705 if (size % UNITS_PER_WORD != 0)
3706 {
3707 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3708 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3709 }
3710 }
3711
3712 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3713 &ag_mode, &count, NULL))
3714 {
3715 if (!aarch64_composite_type_p (type, mode))
3716 {
3717 gcc_assert (count == 1 && mode == ag_mode);
3718 return gen_rtx_REG (mode, V0_REGNUM);
3719 }
3720 else
3721 {
3722 int i;
3723 rtx par;
3724
3725 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3726 for (i = 0; i < count; i++)
3727 {
3728 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3729 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3730 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3731 XVECEXP (par, 0, i) = tmp;
3732 }
3733 return par;
3734 }
3735 }
3736 else
3737 return gen_rtx_REG (mode, R0_REGNUM);
3738 }
3739
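/* For example, a homogeneous floating-point aggregate of two floats
   (struct { float a, b; }, illustrative) takes the composite branch above
   with AG_MODE == SFmode and COUNT == 2, producing a PARALLEL of
   (reg:SF v0) at offset 0 and (reg:SF v1) at offset 4, i.e. the value
   comes back in s0 and s1.  A 128-bit integer instead falls through to
   the final return and comes back in x0/x1 (a TImode REG at R0).  */
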
3740 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3741 Return true if REGNO is the number of a hard register in which the values
3742 of called function may come back. */
3743
3744 static bool
3745 aarch64_function_value_regno_p (const unsigned int regno)
3746 {
3747 /* A maximum of 16 bytes can be returned in the general registers. Examples
3748 of 16-byte return values are: 128-bit integers and 16-byte small
3749 structures (excluding homogeneous floating-point aggregates). */
3750 if (regno == R0_REGNUM || regno == R1_REGNUM)
3751 return true;
3752
3753 /* Up to four fp/simd registers can return a function value, e.g. a
3754 homogeneous floating-point aggregate having four members. */
3755 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3756 return TARGET_FLOAT;
3757
3758 return false;
3759 }
3760
3761 /* Implement TARGET_RETURN_IN_MEMORY.
3762
3763 If the type T of the result of a function is such that
3764 void func (T arg)
3765 would require that arg be passed as a value in a register (or set of
3766 registers) according to the parameter passing rules, then the result
3767 is returned in the same registers as would be used for such an
3768 argument. */
3769
3770 static bool
3771 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3772 {
3773 HOST_WIDE_INT size;
3774 machine_mode ag_mode;
3775 int count;
3776
3777 if (!AGGREGATE_TYPE_P (type)
3778 && TREE_CODE (type) != COMPLEX_TYPE
3779 && TREE_CODE (type) != VECTOR_TYPE)
3780 /* Simple scalar types are always returned in registers. */
3781 return false;
3782
3783 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3784 type,
3785 &ag_mode,
3786 &count,
3787 NULL))
3788 return false;
3789
3790 /* Types larger than 2 registers are returned in memory. */
3791 size = int_size_in_bytes (type);
3792 return (size < 0 || size > 2 * UNITS_PER_WORD);
3793 }
3794
3795 static bool
3796 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3797 const_tree type, int *nregs)
3798 {
3799 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3800 return aarch64_vfp_is_call_or_return_candidate (mode,
3801 type,
3802 &pcum->aapcs_vfp_rmode,
3803 nregs,
3804 NULL);
3805 }
3806
3807 /* Given MODE and TYPE of a function argument, return the alignment in
3808 bits. The idea is to suppress any stronger alignment requested by
3809 the user and opt for the natural alignment (specified in AAPCS64 \S
3810 4.1). ABI_BREAK is set to true if the alignment was incorrectly
3811 calculated in versions of GCC prior to GCC-9. This is a helper
3812 function for local use only. */
3813
3814 static unsigned int
3815 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3816 bool *abi_break)
3817 {
3818 *abi_break = false;
3819 if (!type)
3820 return GET_MODE_ALIGNMENT (mode);
3821
3822 if (integer_zerop (TYPE_SIZE (type)))
3823 return 0;
3824
3825 gcc_assert (TYPE_MODE (type) == mode);
3826
3827 if (!AGGREGATE_TYPE_P (type))
3828 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3829
3830 if (TREE_CODE (type) == ARRAY_TYPE)
3831 return TYPE_ALIGN (TREE_TYPE (type));
3832
3833 unsigned int alignment = 0;
3834 unsigned int bitfield_alignment = 0;
3835 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3836 if (TREE_CODE (field) == FIELD_DECL)
3837 {
3838 alignment = std::max (alignment, DECL_ALIGN (field));
3839 if (DECL_BIT_FIELD_TYPE (field))
3840 bitfield_alignment
3841 = std::max (bitfield_alignment,
3842 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3843 }
3844
3845 if (bitfield_alignment > alignment)
3846 {
3847 *abi_break = true;
3848 return bitfield_alignment;
3849 }
3850
3851 return alignment;
3852 }
3853
3854 /* Layout a function argument according to the AAPCS64 rules. The rule
3855 numbers refer to the rule numbers in the AAPCS64. */
3856
3857 static void
3858 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3859 const_tree type,
3860 bool named ATTRIBUTE_UNUSED)
3861 {
3862 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3863 int ncrn, nvrn, nregs;
3864 bool allocate_ncrn, allocate_nvrn;
3865 HOST_WIDE_INT size;
3866 bool abi_break;
3867
3868 /* We need to do this once per argument. */
3869 if (pcum->aapcs_arg_processed)
3870 return;
3871
3872 pcum->aapcs_arg_processed = true;
3873
3874 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3875 if (type)
3876 size = int_size_in_bytes (type);
3877 else
3878 /* No frontends can create types with variable-sized modes, so we
3879 shouldn't be asked to pass or return them. */
3880 size = GET_MODE_SIZE (mode).to_constant ();
3881 size = ROUND_UP (size, UNITS_PER_WORD);
3882
3883 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3884 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3885 mode,
3886 type,
3887 &nregs);
3888
3889 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3890 The following code thus handles passing by SIMD/FP registers first. */
3891
3892 nvrn = pcum->aapcs_nvrn;
3893
3894 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3895 and homogeneous short-vector aggregates (HVA). */
3896 if (allocate_nvrn)
3897 {
3898 if (!TARGET_FLOAT)
3899 aarch64_err_no_fpadvsimd (mode);
3900
3901 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3902 {
3903 pcum->aapcs_nextnvrn = nvrn + nregs;
3904 if (!aarch64_composite_type_p (type, mode))
3905 {
3906 gcc_assert (nregs == 1);
3907 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3908 }
3909 else
3910 {
3911 rtx par;
3912 int i;
3913 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3914 for (i = 0; i < nregs; i++)
3915 {
3916 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3917 V0_REGNUM + nvrn + i);
3918 rtx offset = gen_int_mode
3919 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3920 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3921 XVECEXP (par, 0, i) = tmp;
3922 }
3923 pcum->aapcs_reg = par;
3924 }
3925 return;
3926 }
3927 else
3928 {
3929 /* C.3 NSRN is set to 8. */
3930 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3931 goto on_stack;
3932 }
3933 }
3934
3935 ncrn = pcum->aapcs_ncrn;
3936 nregs = size / UNITS_PER_WORD;
3937
3938 /* C6 - C9, though the sign and zero extension semantics are
3939 handled elsewhere. This is the case where the argument fits
3940 entirely in general registers. */
3941 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3942 {
3943 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3944
3945 /* C.8 if the argument has an alignment of 16 then the NGRN is
3946 rounded up to the next even number. */
3947 if (nregs == 2
3948 && ncrn % 2
3949 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3950 comparison is there because for > 16 * BITS_PER_UNIT
3951 alignment nregs should be > 2 and therefore it should be
3952 passed by reference rather than value. */
3953 && (aarch64_function_arg_alignment (mode, type, &abi_break)
3954 == 16 * BITS_PER_UNIT))
3955 {
3956 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3957 inform (input_location, "parameter passing for argument of type "
3958 "%qT changed in GCC 9.1", type);
3959 ++ncrn;
3960 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3961 }
3962
3963 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3964 A reg is still generated for it, but the caller should be smart
3965 enough not to use it. */
3966 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3967 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3968 else
3969 {
3970 rtx par;
3971 int i;
3972
3973 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3974 for (i = 0; i < nregs; i++)
3975 {
3976 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3977 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3978 GEN_INT (i * UNITS_PER_WORD));
3979 XVECEXP (par, 0, i) = tmp;
3980 }
3981 pcum->aapcs_reg = par;
3982 }
3983
3984 pcum->aapcs_nextncrn = ncrn + nregs;
3985 return;
3986 }
3987
3988 /* C.11 */
3989 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3990
3991 /* The argument is passed on stack; record the needed number of words for
3992 this argument and align the total size if necessary. */
3993 on_stack:
3994 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3995
3996 if (aarch64_function_arg_alignment (mode, type, &abi_break)
3997 == 16 * BITS_PER_UNIT)
3998 {
3999 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4000 if (pcum->aapcs_stack_size != new_size)
4001 {
4002 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4003 inform (input_location, "parameter passing for argument of type "
4004 "%qT changed in GCC 9.1", type);
4005 pcum->aapcs_stack_size = new_size;
4006 }
4007 }
4008 return;
4009 }
4010
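/* A worked example of rule C.8 above (illustrative): for

     void f (int x, __int128 y);

   X is allocated w0, leaving NGRN == 1, but Y requires 16-byte alignment,
   so NGRN is rounded up to 2 and Y is passed in x2/x3 rather than x1/x2.
   The same alignment test in the on_stack path rounds the stack argument
   area up to a 16-byte boundary when such an argument spills to the
   stack.  */
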
4011 /* Implement TARGET_FUNCTION_ARG. */
4012
4013 static rtx
4014 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4015 const_tree type, bool named)
4016 {
4017 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4018 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4019
4020 if (mode == VOIDmode)
4021 return NULL_RTX;
4022
4023 aarch64_layout_arg (pcum_v, mode, type, named);
4024 return pcum->aapcs_reg;
4025 }
4026
4027 void
4028 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4029 const_tree fntype ATTRIBUTE_UNUSED,
4030 rtx libname ATTRIBUTE_UNUSED,
4031 const_tree fndecl ATTRIBUTE_UNUSED,
4032 unsigned n_named ATTRIBUTE_UNUSED)
4033 {
4034 pcum->aapcs_ncrn = 0;
4035 pcum->aapcs_nvrn = 0;
4036 pcum->aapcs_nextncrn = 0;
4037 pcum->aapcs_nextnvrn = 0;
4038 pcum->pcs_variant = ARM_PCS_AAPCS64;
4039 pcum->aapcs_reg = NULL_RTX;
4040 pcum->aapcs_arg_processed = false;
4041 pcum->aapcs_stack_words = 0;
4042 pcum->aapcs_stack_size = 0;
4043
4044 if (!TARGET_FLOAT
4045 && fndecl && TREE_PUBLIC (fndecl)
4046 && fntype && fntype != error_mark_node)
4047 {
4048 const_tree type = TREE_TYPE (fntype);
4049 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4050 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4051 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4052 &mode, &nregs, NULL))
4053 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4054 }
4055 return;
4056 }
4057
4058 static void
4059 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4060 machine_mode mode,
4061 const_tree type,
4062 bool named)
4063 {
4064 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4065 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4066 {
4067 aarch64_layout_arg (pcum_v, mode, type, named);
4068 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4069 != (pcum->aapcs_stack_words != 0));
4070 pcum->aapcs_arg_processed = false;
4071 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4072 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4073 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4074 pcum->aapcs_stack_words = 0;
4075 pcum->aapcs_reg = NULL_RTX;
4076 }
4077 }
4078
4079 bool
4080 aarch64_function_arg_regno_p (unsigned regno)
4081 {
4082 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4083 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4084 }
4085
4086 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4087 PARM_BOUNDARY bits of alignment, but will be given anything up
4088 to STACK_BOUNDARY bits if the type requires it. This makes sure
4089 that both before and after the layout of each argument, the Next
4090 Stacked Argument Address (NSAA) will have a minimum alignment of
4091 8 bytes. */
4092
4093 static unsigned int
4094 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4095 {
4096 bool abi_break;
4097 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4098 &abi_break);
4099 if (abi_break && warn_psabi)
4100 inform (input_location, "parameter passing for argument of type "
4101 "%qT changed in GCC 9.1", type);
4102
4103 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4104 }
4105
4106 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4107
4108 static fixed_size_mode
4109 aarch64_get_reg_raw_mode (int regno)
4110 {
4111 if (TARGET_SVE && FP_REGNUM_P (regno))
4112 /* Don't use the SVE part of the register for __builtin_apply and
4113 __builtin_return. The SVE registers aren't used by the normal PCS,
4114 so using them there would be a waste of time. The PCS extensions
4115 for SVE types are fundamentally incompatible with the
4116 __builtin_return/__builtin_apply interface. */
4117 return as_a <fixed_size_mode> (V16QImode);
4118 return default_get_reg_raw_mode (regno);
4119 }
4120
4121 /* Implement TARGET_FUNCTION_ARG_PADDING.
4122
4123 Small aggregate types are placed in the lowest memory address.
4124
4125 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4126
4127 static pad_direction
4128 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4129 {
4130 /* On little-endian targets, the least significant byte of every stack
4131 argument is passed at the lowest byte address of the stack slot. */
4132 if (!BYTES_BIG_ENDIAN)
4133 return PAD_UPWARD;
4134
4135 /* Otherwise, integral, floating-point and pointer types are padded downward:
4136 the least significant byte of a stack argument is passed at the highest
4137 byte address of the stack slot. */
4138 if (type
4139 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4140 || POINTER_TYPE_P (type))
4141 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4142 return PAD_DOWNWARD;
4143
4144 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4145 return PAD_UPWARD;
4146 }
4147
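/* Concretely, on a big-endian target with 8-byte stack slots: a scalar
   short is padded downward, so its two bytes sit at the highest-addressed
   end of the slot with the least significant byte at the highest address,
   whereas a two-byte structure is padded upward and sits at the
   lowest-addressed end with the padding after it.  Little-endian targets
   always pad upward.  */
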
4148 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4149
4150 It specifies the padding for the last element (which may also be the
4151 only element) of a block move between registers and memory. Viewing
4152 the block as it sits in memory, padding upward means that the last
4153 element is padded after its most significant byte, while with downward
4154 padding the last element is padded on its least significant byte
4155 side.
4156
4157 Small aggregates and small complex types are always padded
4158 upwards.
4159
4160 We don't need to worry about homogeneous floating-point or
4161 short-vector aggregates; their move is not affected by the
4162 padding direction determined here. Regardless of endianness,
4163 each element of such an aggregate is put in the least
4164 significant bits of a fp/simd register.
4165
4166 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4167 register has useful data, and return the opposite if the most
4168 significant byte does. */
4169
4170 bool
4171 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4172 bool first ATTRIBUTE_UNUSED)
4173 {
4174
4175 /* Small composite types are always padded upward. */
4176 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4177 {
4178 HOST_WIDE_INT size;
4179 if (type)
4180 size = int_size_in_bytes (type);
4181 else
4182 /* No frontends can create types with variable-sized modes, so we
4183 shouldn't be asked to pass or return them. */
4184 size = GET_MODE_SIZE (mode).to_constant ();
4185 if (size < 2 * UNITS_PER_WORD)
4186 return true;
4187 }
4188
4189 /* Otherwise, use the default padding. */
4190 return !BYTES_BIG_ENDIAN;
4191 }
4192
4193 static scalar_int_mode
4194 aarch64_libgcc_cmp_return_mode (void)
4195 {
4196 return SImode;
4197 }
4198
4199 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4200
4201 /* We use the 12-bit shifted immediate arithmetic instructions so values
4202 must be multiples of (1 << 12), i.e. 4096. */
4203 #define ARITH_FACTOR 4096
4204
4205 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4206 #error Cannot use simple address calculation for stack probing
4207 #endif
4208
4209 /* The pair of scratch registers used for stack probing. */
4210 #define PROBE_STACK_FIRST_REG R9_REGNUM
4211 #define PROBE_STACK_SECOND_REG R10_REGNUM
4212
4213 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4214 inclusive. These are offsets from the current stack pointer. */
4215
4216 static void
4217 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4218 {
4219 HOST_WIDE_INT size;
4220 if (!poly_size.is_constant (&size))
4221 {
4222 sorry ("stack probes for SVE frames");
4223 return;
4224 }
4225
4226 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4227
4228 /* See the same assertion on PROBE_INTERVAL above. */
4229 gcc_assert ((first % ARITH_FACTOR) == 0);
4230
4231 /* See if we have a constant small number of probes to generate. If so,
4232 that's the easy case. */
4233 if (size <= PROBE_INTERVAL)
4234 {
4235 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4236
4237 emit_set_insn (reg1,
4238 plus_constant (Pmode,
4239 stack_pointer_rtx, -(first + base)));
4240 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4241 }
4242
4243 /* The run-time loop is made up of 8 insns in the generic case while the
4244 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
4245 else if (size <= 4 * PROBE_INTERVAL)
4246 {
4247 HOST_WIDE_INT i, rem;
4248
4249 emit_set_insn (reg1,
4250 plus_constant (Pmode,
4251 stack_pointer_rtx,
4252 -(first + PROBE_INTERVAL)));
4253 emit_stack_probe (reg1);
4254
4255 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4256 it exceeds SIZE. If only two probes are needed, this will not
4257 generate any code. Then probe at FIRST + SIZE. */
4258 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4259 {
4260 emit_set_insn (reg1,
4261 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4262 emit_stack_probe (reg1);
4263 }
4264
4265 rem = size - (i - PROBE_INTERVAL);
4266 if (rem > 256)
4267 {
4268 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4269
4270 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4271 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4272 }
4273 else
4274 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4275 }
4276
4277 /* Otherwise, do the same as above, but in a loop. Note that we must be
4278 extra careful with variables wrapping around because we might be at
4279 the very top (or the very bottom) of the address space and we have
4280 to be able to handle this case properly; in particular, we use an
4281 equality test for the loop condition. */
4282 else
4283 {
4284 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4285
4286 /* Step 1: round SIZE to the previous multiple of the interval. */
4287
4288 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4289
4290
4291 /* Step 2: compute initial and final value of the loop counter. */
4292
4293 /* TEST_ADDR = SP + FIRST. */
4294 emit_set_insn (reg1,
4295 plus_constant (Pmode, stack_pointer_rtx, -first));
4296
4297 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4298 HOST_WIDE_INT adjustment = - (first + rounded_size);
4299 if (! aarch64_uimm12_shift (adjustment))
4300 {
4301 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4302 true, Pmode);
4303 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4304 }
4305 else
4306 emit_set_insn (reg2,
4307 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4308
4309 /* Step 3: the loop
4310
4311 do
4312 {
4313 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4314 probe at TEST_ADDR
4315 }
4316 while (TEST_ADDR != LAST_ADDR)
4317
4318 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4319 until it is equal to ROUNDED_SIZE. */
4320
4321 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4322
4323
4324 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4325 that SIZE is equal to ROUNDED_SIZE. */
4326
4327 if (size != rounded_size)
4328 {
4329 HOST_WIDE_INT rem = size - rounded_size;
4330
4331 if (rem > 256)
4332 {
4333 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4334
4335 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4336 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4337 }
4338 else
4339 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4340 }
4341 }
4342
4343 /* Make sure nothing is scheduled before we are done. */
4344 emit_insn (gen_blockage ());
4345 }
4346
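/* For example, with PROBE_INTERVAL == 4096, FIRST == 0 and a constant
   SIZE of 12288, the middle branch above is taken and the emitted code is
   roughly (x9 being PROBE_STACK_FIRST_REG):

        sub     x9, sp, #4096
        str     xzr, [x9]
        sub     x9, x9, #4096
        str     xzr, [x9]
        sub     x9, x9, #4096
        str     xzr, [x9]

   i.e. one probe per page, at sp-4096, sp-8192 and sp-12288.  */
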
4347 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4348 absolute addresses. */
4349
4350 const char *
4351 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4352 {
4353 static int labelno = 0;
4354 char loop_lab[32];
4355 rtx xops[2];
4356
4357 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4358
4359 /* Loop. */
4360 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4361
4362 HOST_WIDE_INT stack_clash_probe_interval
4363 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4364
4365 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4366 xops[0] = reg1;
4367 HOST_WIDE_INT interval;
4368 if (flag_stack_clash_protection)
4369 interval = stack_clash_probe_interval;
4370 else
4371 interval = PROBE_INTERVAL;
4372
4373 gcc_assert (aarch64_uimm12_shift (interval));
4374 xops[1] = GEN_INT (interval);
4375
4376 output_asm_insn ("sub\t%0, %0, %1", xops);
4377
4378 /* If doing stack clash protection then we probe up by the ABI specified
4379 amount. We do this because we're dropping full pages at a time in the
4380 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4381 if (flag_stack_clash_protection)
4382 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4383 else
4384 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4385
4386 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4387 by this amount for each iteration. */
4388 output_asm_insn ("str\txzr, [%0, %1]", xops);
4389
4390 /* Test if TEST_ADDR == LAST_ADDR. */
4391 xops[1] = reg2;
4392 output_asm_insn ("cmp\t%0, %1", xops);
4393
4394 /* Branch. */
4395 fputs ("\tb.ne\t", asm_out_file);
4396 assemble_name_raw (asm_out_file, loop_lab);
4397 fputc ('\n', asm_out_file);
4398
4399 return "";
4400 }
4401
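/* With stack-clash protection disabled, the loop printed by this function
   therefore looks roughly like the following (assuming the x9/x10 scratch
   pair chosen by aarch64_emit_probe_stack_range):

.LPSRL0:
        sub     x9, x9, 4096
        str     xzr, [x9, 0]
        cmp     x9, x10
        b.ne    .LPSRL0

   With stack-clash protection enabled, the sub uses the configured guard
   size and the probe is written STACK_CLASH_CALLER_GUARD bytes above the
   new x9 rather than at offset 0.  */
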
4402 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4403 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4404 of GUARD_SIZE. When a probe is emitted it is done at most
4405 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4406 at most MIN_PROBE_THRESHOLD. By the end of this function
4407 BASE = BASE - ADJUSTMENT. */
4408
4409 const char *
4410 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4411 rtx min_probe_threshold, rtx guard_size)
4412 {
4413 /* This function is not allowed to use any instruction generation function
4414 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4415 so instead emit the code you want using output_asm_insn. */
4416 gcc_assert (flag_stack_clash_protection);
4417 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4418 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4419
4420 /* The minimum required allocation before the residual requires probing. */
4421 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4422
4423 /* Clamp the value down to the nearest value that can be used with a cmp. */
4424 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4425 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4426
4427 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4428 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4429
4430 static int labelno = 0;
4431 char loop_start_lab[32];
4432 char loop_end_lab[32];
4433 rtx xops[2];
4434
4435 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4436 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4437
4438 /* Emit loop start label. */
4439 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4440
4441 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4442 xops[0] = adjustment;
4443 xops[1] = probe_offset_value_rtx;
4444 output_asm_insn ("cmp\t%0, %1", xops);
4445
4446 /* Branch to end if not enough adjustment to probe. */
4447 fputs ("\tb.lt\t", asm_out_file);
4448 assemble_name_raw (asm_out_file, loop_end_lab);
4449 fputc ('\n', asm_out_file);
4450
4451 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4452 xops[0] = base;
4453 xops[1] = probe_offset_value_rtx;
4454 output_asm_insn ("sub\t%0, %0, %1", xops);
4455
4456 /* Probe at BASE. */
4457 xops[1] = const0_rtx;
4458 output_asm_insn ("str\txzr, [%0, %1]", xops);
4459
4460 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4461 xops[0] = adjustment;
4462 xops[1] = probe_offset_value_rtx;
4463 output_asm_insn ("sub\t%0, %0, %1", xops);
4464
4465 /* Branch to start if still more bytes to allocate. */
4466 fputs ("\tb\t", asm_out_file);
4467 assemble_name_raw (asm_out_file, loop_start_lab);
4468 fputc ('\n', asm_out_file);
4469
4470 /* Loop exit: the remaining adjustment needs no probe. */
4471 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4472
4473 /* BASE = BASE - ADJUSTMENT. */
4474 xops[0] = base;
4475 xops[1] = adjustment;
4476 output_asm_insn ("sub\t%0, %0, %1", xops);
4477 return "";
4478 }
4479
4480 /* Determine whether a frame chain needs to be generated. */
4481 static bool
4482 aarch64_needs_frame_chain (void)
4483 {
4484 /* Force a frame chain for EH returns so the return address is at FP+8. */
4485 if (frame_pointer_needed || crtl->calls_eh_return)
4486 return true;
4487
4488 /* A leaf function cannot have calls or write LR. */
4489 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4490
4491 /* Don't use a frame chain in leaf functions if leaf frame pointers
4492 are disabled. */
4493 if (flag_omit_leaf_frame_pointer && is_leaf)
4494 return false;
4495
4496 return aarch64_use_frame_pointer;
4497 }
4498
4499 /* Mark the registers that need to be saved by the callee and calculate
4500 the size of the callee-saved registers area and frame record (both FP
4501 and LR may be omitted). */
4502 static void
4503 aarch64_layout_frame (void)
4504 {
4505 HOST_WIDE_INT offset = 0;
4506 int regno, last_fp_reg = INVALID_REGNUM;
4507 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4508
4509 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4510
4511 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4512 the mid-end is doing. */
4513 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4514
4515 #define SLOT_NOT_REQUIRED (-2)
4516 #define SLOT_REQUIRED (-1)
4517
4518 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4519 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4520
4521 /* If this is a SIMD function that is not a leaf (i.e. it makes calls),
4522 we assume that at least one of those calls is to a non-simd function
4523 and thus we must save V8 to V23 in the prologue.
4524
4525 if (simd_function && !crtl->is_leaf)
4526 {
4527 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4528 if (FP_SIMD_SAVED_REGNUM_P (regno))
4529 df_set_regs_ever_live (regno, true);
4530 }
4531
4532 /* First mark all the registers that really need to be saved... */
4533 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4534 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4535
4536 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4537 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4538
4539 /* ... that includes the eh data registers (if needed)... */
4540 if (crtl->calls_eh_return)
4541 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4542 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4543 = SLOT_REQUIRED;
4544
4545 /* ... and any callee saved register that dataflow says is live. */
4546 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4547 if (df_regs_ever_live_p (regno)
4548 && (regno == R30_REGNUM
4549 || !call_used_regs[regno]))
4550 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4551
4552 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4553 if (df_regs_ever_live_p (regno)
4554 && (!call_used_regs[regno]
4555 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4556 {
4557 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4558 last_fp_reg = regno;
4559 }
4560
4561 if (cfun->machine->frame.emit_frame_chain)
4562 {
4563 /* FP and LR are placed in the linkage record. */
4564 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4565 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4566 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4567 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4568 offset = 2 * UNITS_PER_WORD;
4569 }
4570
4571 /* With stack-clash, LR must be saved in non-leaf functions. */
4572 gcc_assert (crtl->is_leaf
4573 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4574 != SLOT_NOT_REQUIRED));
4575
4576 /* Now assign stack slots for them. */
4577 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4578 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4579 {
4580 cfun->machine->frame.reg_offset[regno] = offset;
4581 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4582 cfun->machine->frame.wb_candidate1 = regno;
4583 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4584 cfun->machine->frame.wb_candidate2 = regno;
4585 offset += UNITS_PER_WORD;
4586 }
4587
4588 HOST_WIDE_INT max_int_offset = offset;
4589 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4590 bool has_align_gap = offset != max_int_offset;
4591
4592 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4593 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4594 {
4595 /* If there is an alignment gap between integer and fp callee-saves,
4596 allocate the last fp register to it if possible. */
4597 if (regno == last_fp_reg
4598 && has_align_gap
4599 && !simd_function
4600 && (offset & 8) == 0)
4601 {
4602 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4603 break;
4604 }
4605
4606 cfun->machine->frame.reg_offset[regno] = offset;
4607 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4608 cfun->machine->frame.wb_candidate1 = regno;
4609 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4610 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4611 cfun->machine->frame.wb_candidate2 = regno;
4612 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4613 }
4614
4615 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4616
4617 cfun->machine->frame.saved_regs_size = offset;
4618
4619 HOST_WIDE_INT varargs_and_saved_regs_size
4620 = offset + cfun->machine->frame.saved_varargs_size;
4621
4622 cfun->machine->frame.hard_fp_offset
4623 = aligned_upper_bound (varargs_and_saved_regs_size
4624 + get_frame_size (),
4625 STACK_BOUNDARY / BITS_PER_UNIT);
4626
4627 /* Both these values are already aligned. */
4628 gcc_assert (multiple_p (crtl->outgoing_args_size,
4629 STACK_BOUNDARY / BITS_PER_UNIT));
4630 cfun->machine->frame.frame_size
4631 = (cfun->machine->frame.hard_fp_offset
4632 + crtl->outgoing_args_size);
4633
4634 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4635
4636 cfun->machine->frame.initial_adjust = 0;
4637 cfun->machine->frame.final_adjust = 0;
4638 cfun->machine->frame.callee_adjust = 0;
4639 cfun->machine->frame.callee_offset = 0;
4640
4641 HOST_WIDE_INT max_push_offset = 0;
4642 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4643 max_push_offset = 512;
4644 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4645 max_push_offset = 256;
4646
4647 HOST_WIDE_INT const_size, const_fp_offset;
4648 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4649 && const_size < max_push_offset
4650 && known_eq (crtl->outgoing_args_size, 0))
4651 {
4652 /* Simple, small frame with no outgoing arguments:
4653 stp reg1, reg2, [sp, -frame_size]!
4654 stp reg3, reg4, [sp, 16] */
4655 cfun->machine->frame.callee_adjust = const_size;
4656 }
4657 else if (known_lt (crtl->outgoing_args_size
4658 + cfun->machine->frame.saved_regs_size, 512)
4659 && !(cfun->calls_alloca
4660 && known_lt (cfun->machine->frame.hard_fp_offset,
4661 max_push_offset)))
4662 {
4663 /* Frame with small outgoing arguments:
4664 sub sp, sp, frame_size
4665 stp reg1, reg2, [sp, outgoing_args_size]
4666 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4667 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4668 cfun->machine->frame.callee_offset
4669 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4670 }
4671 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4672 && const_fp_offset < max_push_offset)
4673 {
4674 /* Frame with large outgoing arguments but a small local area:
4675 stp reg1, reg2, [sp, -hard_fp_offset]!
4676 stp reg3, reg4, [sp, 16]
4677 sub sp, sp, outgoing_args_size */
4678 cfun->machine->frame.callee_adjust = const_fp_offset;
4679 cfun->machine->frame.final_adjust
4680 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4681 }
4682 else
4683 {
4684 /* Frame with large local area and outgoing arguments using frame pointer:
4685 sub sp, sp, hard_fp_offset
4686 stp x29, x30, [sp, 0]
4687 add x29, sp, 0
4688 stp reg3, reg4, [sp, 16]
4689 sub sp, sp, outgoing_args_size */
4690 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4691 cfun->machine->frame.final_adjust
4692 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4693 }
4694
4695 cfun->machine->frame.laid_out = true;
4696 }
4697
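/* A worked example of the selection above: a function that needs a frame
   chain, also saves x19 and x20, has 16 bytes of locals and no outgoing
   arguments ends up with reg_offsets x29:0, x30:8, x19:16, x20:24, so
   saved_regs_size == 32, hard_fp_offset == 48 and frame_size == 48.
   Since 48 < max_push_offset and there are no outgoing arguments, the
   first case applies and the whole allocation is folded into the
   write-back of the first store pair (callee_adjust == 48), matching the
   "stp reg1, reg2, [sp, -frame_size]!" pattern shown in its comment.  */
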
4698 /* Return true if the register REGNO is saved on entry to
4699 the current function. */
4700
4701 static bool
4702 aarch64_register_saved_on_entry (int regno)
4703 {
4704 return cfun->machine->frame.reg_offset[regno] >= 0;
4705 }
4706
4707 /* Return the next register, from REGNO up to LIMIT, that the callee
4708 needs to save. */
4709
4710 static unsigned
4711 aarch64_next_callee_save (unsigned regno, unsigned limit)
4712 {
4713 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4714 regno ++;
4715 return regno;
4716 }
4717
4718 /* Push the register number REGNO of mode MODE to the stack with write-back
4719 adjusting the stack by ADJUSTMENT. */
4720
4721 static void
4722 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4723 HOST_WIDE_INT adjustment)
4724 {
4725 rtx base_rtx = stack_pointer_rtx;
4726 rtx insn, reg, mem;
4727
4728 reg = gen_rtx_REG (mode, regno);
4729 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4730 plus_constant (Pmode, base_rtx, -adjustment));
4731 mem = gen_frame_mem (mode, mem);
4732
4733 insn = emit_move_insn (mem, reg);
4734 RTX_FRAME_RELATED_P (insn) = 1;
4735 }
4736
4737 /* Generate and return an instruction to store the pair of registers
4738 REG and REG2 of mode MODE to location BASE with write-back adjusting
4739 the stack location BASE by ADJUSTMENT. */
4740
4741 static rtx
4742 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4743 HOST_WIDE_INT adjustment)
4744 {
4745 switch (mode)
4746 {
4747 case E_DImode:
4748 return gen_storewb_pairdi_di (base, base, reg, reg2,
4749 GEN_INT (-adjustment),
4750 GEN_INT (UNITS_PER_WORD - adjustment));
4751 case E_DFmode:
4752 return gen_storewb_pairdf_di (base, base, reg, reg2,
4753 GEN_INT (-adjustment),
4754 GEN_INT (UNITS_PER_WORD - adjustment));
4755 case E_TFmode:
4756 return gen_storewb_pairtf_di (base, base, reg, reg2,
4757 GEN_INT (-adjustment),
4758 GEN_INT (UNITS_PER_VREG - adjustment));
4759 default:
4760 gcc_unreachable ();
4761 }
4762 }
4763
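/* For instance, called with MODE == DImode, BASE == the stack pointer,
   REG == x29, REG2 == x30 and ADJUSTMENT == 48, the returned insn
   corresponds to the pre-indexed store pair

        stp     x29, x30, [sp, -48]!

   which both saves the pair and performs the initial stack adjustment.  */
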
4764 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4765 stack pointer by ADJUSTMENT. */
4766
4767 static void
4768 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4769 {
4770 rtx_insn *insn;
4771 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4772
4773 if (regno2 == INVALID_REGNUM)
4774 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4775
4776 rtx reg1 = gen_rtx_REG (mode, regno1);
4777 rtx reg2 = gen_rtx_REG (mode, regno2);
4778
4779 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4780 reg2, adjustment));
4781 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4782 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4783 RTX_FRAME_RELATED_P (insn) = 1;
4784 }
4785
4786 /* Load the pair of registers REG and REG2 of mode MODE from stack location
4787 BASE, adjusting BASE by ADJUSTMENT afterwards. */
4788
4789 static rtx
4790 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4791 HOST_WIDE_INT adjustment)
4792 {
4793 switch (mode)
4794 {
4795 case E_DImode:
4796 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4797 GEN_INT (UNITS_PER_WORD));
4798 case E_DFmode:
4799 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4800 GEN_INT (UNITS_PER_WORD));
4801 case E_TFmode:
4802 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4803 GEN_INT (UNITS_PER_VREG));
4804 default:
4805 gcc_unreachable ();
4806 }
4807 }
4808
4809 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4810 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4811 into CFI_OPS. */
4812
4813 static void
4814 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4815 rtx *cfi_ops)
4816 {
4817 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4818 rtx reg1 = gen_rtx_REG (mode, regno1);
4819
4820 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4821
4822 if (regno2 == INVALID_REGNUM)
4823 {
4824 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4825 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4826 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4827 }
4828 else
4829 {
4830 rtx reg2 = gen_rtx_REG (mode, regno2);
4831 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4832 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4833 reg2, adjustment));
4834 }
4835 }
4836
4837 /* Generate and return a store pair instruction of mode MODE to store
4838 register REG1 to MEM1 and register REG2 to MEM2. */
4839
4840 static rtx
4841 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4842 rtx reg2)
4843 {
4844 switch (mode)
4845 {
4846 case E_DImode:
4847 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4848
4849 case E_DFmode:
4850 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4851
4852 case E_TFmode:
4853 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4854
4855 default:
4856 gcc_unreachable ();
4857 }
4858 }
4859
4860 /* Generate and return a load pair instruction of mode MODE to load register
4861 REG1 from MEM1 and register REG2 from MEM2. */
4862
4863 static rtx
4864 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4865 rtx mem2)
4866 {
4867 switch (mode)
4868 {
4869 case E_DImode:
4870 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4871
4872 case E_DFmode:
4873 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4874
4875 case E_TFmode:
4876 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4877
4878 default:
4879 gcc_unreachable ();
4880 }
4881 }
4882
4883 /* Return TRUE if return address signing should be enabled for the current
4884 function, otherwise return FALSE. */
4885
4886 bool
4887 aarch64_return_address_signing_enabled (void)
4888 {
4889 /* This function should only be called after the frame has been laid out. */
4890 gcc_assert (cfun->machine->frame.laid_out);
4891
4892 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4893 if its LR is pushed onto stack. */
4894 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4895 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4896 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4897 }
4898
4899 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
4900 bool
4901 aarch64_bti_enabled (void)
4902 {
4903 return (aarch64_enable_bti == 1);
4904 }
4905
4906 /* Emit code to save the callee-saved registers from register number START
4907 to LIMIT to the stack at the location starting at offset START_OFFSET,
4908 skipping any write-back candidates if SKIP_WB is true. */
4909
4910 static void
4911 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4912 unsigned start, unsigned limit, bool skip_wb)
4913 {
4914 rtx_insn *insn;
4915 unsigned regno;
4916 unsigned regno2;
4917
4918 for (regno = aarch64_next_callee_save (start, limit);
4919 regno <= limit;
4920 regno = aarch64_next_callee_save (regno + 1, limit))
4921 {
4922 rtx reg, mem;
4923 poly_int64 offset;
4924 int offset_diff;
4925
4926 if (skip_wb
4927 && (regno == cfun->machine->frame.wb_candidate1
4928 || regno == cfun->machine->frame.wb_candidate2))
4929 continue;
4930
4931 if (cfun->machine->reg_is_wrapped_separately[regno])
4932 continue;
4933
4934 reg = gen_rtx_REG (mode, regno);
4935 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4936 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4937 offset));
4938
4939 regno2 = aarch64_next_callee_save (regno + 1, limit);
4940 offset_diff = cfun->machine->frame.reg_offset[regno2]
4941 - cfun->machine->frame.reg_offset[regno];
4942
4943 if (regno2 <= limit
4944 && !cfun->machine->reg_is_wrapped_separately[regno2]
4945 && known_eq (GET_MODE_SIZE (mode), offset_diff))
4946 {
4947 rtx reg2 = gen_rtx_REG (mode, regno2);
4948 rtx mem2;
4949
4950 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4951 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4952 offset));
4953 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4954 reg2));
4955
4956 /* The first part of a frame-related parallel insn is
4957 always assumed to be relevant to the frame
4958 calculations; subsequent parts are only
4959 frame-related if explicitly marked. */
4960 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4961 regno = regno2;
4962 }
4963 else
4964 insn = emit_move_insn (mem, reg);
4965
4966 RTX_FRAME_RELATED_P (insn) = 1;
4967 }
4968 }
4969
4970 /* Emit code to restore the callee registers of mode MODE from register
4971 number START up to and including LIMIT. Restore from the stack offset
4972 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4973 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4974
4975 static void
4976 aarch64_restore_callee_saves (machine_mode mode,
4977 poly_int64 start_offset, unsigned start,
4978 unsigned limit, bool skip_wb, rtx *cfi_ops)
4979 {
4980 rtx base_rtx = stack_pointer_rtx;
4981 unsigned regno;
4982 unsigned regno2;
4983 poly_int64 offset;
4984
4985 for (regno = aarch64_next_callee_save (start, limit);
4986 regno <= limit;
4987 regno = aarch64_next_callee_save (regno + 1, limit))
4988 {
4989 if (cfun->machine->reg_is_wrapped_separately[regno])
4990 continue;
4991
4992 rtx reg, mem;
4993 int offset_diff;
4994
4995 if (skip_wb
4996 && (regno == cfun->machine->frame.wb_candidate1
4997 || regno == cfun->machine->frame.wb_candidate2))
4998 continue;
4999
5000 reg = gen_rtx_REG (mode, regno);
5001 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5002 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5003
5004 regno2 = aarch64_next_callee_save (regno + 1, limit);
5005 offset_diff = cfun->machine->frame.reg_offset[regno2]
5006 - cfun->machine->frame.reg_offset[regno];
5007
5008 if (regno2 <= limit
5009 && !cfun->machine->reg_is_wrapped_separately[regno2]
5010 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5011 {
5012 rtx reg2 = gen_rtx_REG (mode, regno2);
5013 rtx mem2;
5014
5015 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5016 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5017 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5018
5019 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5020 regno = regno2;
5021 }
5022 else
5023 emit_move_insn (reg, mem);
5024 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5025 }
5026 }
5027
5028 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5029 of MODE. */
5030
5031 static inline bool
5032 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5033 {
5034 HOST_WIDE_INT multiple;
5035 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5036 && IN_RANGE (multiple, -8, 7));
5037 }
5038
5039 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5040 of MODE. */
5041
5042 static inline bool
5043 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5044 {
5045 HOST_WIDE_INT multiple;
5046 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5047 && IN_RANGE (multiple, 0, 63));
5048 }
5049
5050 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5051 of MODE. */
5052
5053 bool
5054 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5055 {
5056 HOST_WIDE_INT multiple;
5057 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5058 && IN_RANGE (multiple, -64, 63));
5059 }
5060
5061 /* Return true if OFFSET is a signed 9-bit value. */
5062
5063 bool
5064 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5065 poly_int64 offset)
5066 {
5067 HOST_WIDE_INT const_offset;
5068 return (offset.is_constant (&const_offset)
5069 && IN_RANGE (const_offset, -256, 255));
5070 }
5071
5072 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5073 of MODE. */
5074
5075 static inline bool
5076 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5077 {
5078 HOST_WIDE_INT multiple;
5079 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5080 && IN_RANGE (multiple, -256, 255));
5081 }
5082
5083 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5084 of MODE. */
5085
5086 static inline bool
5087 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5088 {
5089 HOST_WIDE_INT multiple;
5090 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5091 && IN_RANGE (multiple, 0, 4095));
5092 }
5093
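/* For example, with MODE == DImode (8-byte units) the predicates above
   cover the following byte-offset ranges, which broadly correspond to the
   AArch64 addressing-mode limits they are used to validate:

     offset_4bit_signed_scaled_p:            -64 .. 56, multiples of 8
     offset_6bit_unsigned_scaled_p:            0 .. 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p:   -512 .. 504, multiples of 8
                                            (LDP/STP immediates)
     aarch64_offset_9bit_signed_unscaled_p: -256 .. 255, any byte offset
     offset_9bit_signed_scaled_p:          -2048 .. 2040, multiples of 8
     offset_12bit_unsigned_scaled_p:           0 .. 32760, multiples of 8
                                            (LDR/STR unsigned offset)  */
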
5094 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5095
5096 static sbitmap
5097 aarch64_get_separate_components (void)
5098 {
5099 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5100 bitmap_clear (components);
5101
5102 /* The registers we need saved to the frame. */
5103 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5104 if (aarch64_register_saved_on_entry (regno))
5105 {
5106 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5107 if (!frame_pointer_needed)
5108 offset += cfun->machine->frame.frame_size
5109 - cfun->machine->frame.hard_fp_offset;
5110 /* Check that we can access the stack slot of the register with one
5111 direct load with no adjustments needed. */
5112 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5113 bitmap_set_bit (components, regno);
5114 }
5115
5116 /* Don't mess with the hard frame pointer. */
5117 if (frame_pointer_needed)
5118 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5119
5120 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5121 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5122 /* If registers have been chosen to be stored/restored with
5123 writeback don't interfere with them to avoid having to output explicit
5124 stack adjustment instructions. */
5125 if (reg2 != INVALID_REGNUM)
5126 bitmap_clear_bit (components, reg2);
5127 if (reg1 != INVALID_REGNUM)
5128 bitmap_clear_bit (components, reg1);
5129
5130 bitmap_clear_bit (components, LR_REGNUM);
5131 bitmap_clear_bit (components, SP_REGNUM);
5132
5133 return components;
5134 }
5135
5136 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5137
5138 static sbitmap
5139 aarch64_components_for_bb (basic_block bb)
5140 {
5141 bitmap in = DF_LIVE_IN (bb);
5142 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5143 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5144 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5145
5146 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5147 bitmap_clear (components);
5148
5149 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5150 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5151 if ((!call_used_regs[regno]
5152 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5153 && (bitmap_bit_p (in, regno)
5154 || bitmap_bit_p (gen, regno)
5155 || bitmap_bit_p (kill, regno)))
5156 {
5157 unsigned regno2, offset, offset2;
5158 bitmap_set_bit (components, regno);
5159
5160 /* If there is a callee-save at an adjacent offset, add it as well
5161 to increase the use of LDP/STP. */
5162 offset = cfun->machine->frame.reg_offset[regno];
5163 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5164
5165 if (regno2 <= LAST_SAVED_REGNUM)
5166 {
5167 offset2 = cfun->machine->frame.reg_offset[regno2];
5168 if ((offset & ~8) == (offset2 & ~8))
5169 bitmap_set_bit (components, regno2);
5170 }
5171 }
5172
5173 return components;
5174 }
5175
5176 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5177 Nothing to do for aarch64. */
5178
5179 static void
5180 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5181 {
5182 }
5183
5184 /* Return the next set bit in BMP from START onwards. Return the total number
5185 of bits in BMP if no set bit is found at or after START. */
5186
5187 static unsigned int
5188 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5189 {
5190 unsigned int nbits = SBITMAP_SIZE (bmp);
5191 if (start == nbits)
5192 return start;
5193
5194 gcc_assert (start < nbits);
5195 for (unsigned int i = start; i < nbits; i++)
5196 if (bitmap_bit_p (bmp, i))
5197 return i;
5198
5199 return nbits;
5200 }
5201
5202 /* Do the work for aarch64_emit_prologue_components and
5203 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5204 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5205 for these components or the epilogue sequence. That is, it determines
5206 whether we should emit stores or loads and what kind of CFA notes to attach
5207 to the insns. Otherwise the logic for the two sequences is very
5208 similar. */
5209
5210 static void
5211 aarch64_process_components (sbitmap components, bool prologue_p)
5212 {
5213 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5214 ? HARD_FRAME_POINTER_REGNUM
5215 : STACK_POINTER_REGNUM);
5216
5217 unsigned last_regno = SBITMAP_SIZE (components);
5218 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5219 rtx_insn *insn = NULL;
5220
5221 while (regno != last_regno)
5222 {
5223 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5224 so DFmode for the vector registers is enough. For simd functions
5225 we want to save the low 128 bits. */
5226 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5227
5228 rtx reg = gen_rtx_REG (mode, regno);
5229 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5230 if (!frame_pointer_needed)
5231 offset += cfun->machine->frame.frame_size
5232 - cfun->machine->frame.hard_fp_offset;
5233 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5234 rtx mem = gen_frame_mem (mode, addr);
5235
5236 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5237 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5238 /* No more registers to handle after REGNO.
5239 Emit a single save/restore and exit. */
5240 if (regno2 == last_regno)
5241 {
5242 insn = emit_insn (set);
5243 RTX_FRAME_RELATED_P (insn) = 1;
5244 if (prologue_p)
5245 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5246 else
5247 add_reg_note (insn, REG_CFA_RESTORE, reg);
5248 break;
5249 }
5250
5251 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5252 /* The next register is not of the same class or its offset is not
5253 mergeable with the current one into a pair. */
5254 if (!satisfies_constraint_Ump (mem)
5255 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5256 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5257 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5258 GET_MODE_SIZE (mode)))
5259 {
5260 insn = emit_insn (set);
5261 RTX_FRAME_RELATED_P (insn) = 1;
5262 if (prologue_p)
5263 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5264 else
5265 add_reg_note (insn, REG_CFA_RESTORE, reg);
5266
5267 regno = regno2;
5268 continue;
5269 }
5270
5271 /* REGNO2 can be saved/restored in a pair with REGNO. */
5272 rtx reg2 = gen_rtx_REG (mode, regno2);
5273 if (!frame_pointer_needed)
5274 offset2 += cfun->machine->frame.frame_size
5275 - cfun->machine->frame.hard_fp_offset;
5276 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5277 rtx mem2 = gen_frame_mem (mode, addr2);
5278 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5279 : gen_rtx_SET (reg2, mem2);
5280
5281 if (prologue_p)
5282 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5283 else
5284 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5285
5286 RTX_FRAME_RELATED_P (insn) = 1;
5287 if (prologue_p)
5288 {
5289 add_reg_note (insn, REG_CFA_OFFSET, set);
5290 add_reg_note (insn, REG_CFA_OFFSET, set2);
5291 }
5292 else
5293 {
5294 add_reg_note (insn, REG_CFA_RESTORE, reg);
5295 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5296 }
5297
5298 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5299 }
5300 }
5301
5302 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5303
5304 static void
5305 aarch64_emit_prologue_components (sbitmap components)
5306 {
5307 aarch64_process_components (components, true);
5308 }
5309
5310 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5311
5312 static void
5313 aarch64_emit_epilogue_components (sbitmap components)
5314 {
5315 aarch64_process_components (components, false);
5316 }
5317
5318 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5319
5320 static void
5321 aarch64_set_handled_components (sbitmap components)
5322 {
5323 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5324 if (bitmap_bit_p (components, regno))
5325 cfun->machine->reg_is_wrapped_separately[regno] = true;
5326 }
5327
5328 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5329 determine the probe offset for alloca. */
5330
5331 static HOST_WIDE_INT
5332 aarch64_stack_clash_protection_alloca_probe_range (void)
5333 {
5334 return STACK_CLASH_CALLER_GUARD;
5335 }
5336
5337
5338 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5339 registers. If POLY_SIZE is not large enough to require a probe this function
5340 will only adjust the stack. When allocating the stack space
5341 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5342 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5343 arguments. If we are, then we ensure that any allocation larger than the
5344 ABI-defined buffer needs a probe, so that the invariant of having a 1KB
5345 buffer is maintained.
5346
5347 We emit barriers after each stack adjustment to prevent optimizations from
5348 breaking the invariant that we never drop the stack more than a page. This
5349 invariant is needed to make it easier to correctly handle asynchronous
5350 events, e.g. if we were to drop the stack by more than a page and then
5351 issue several probes to catch up, a signal taken somewhere in between
5352 would leave the handler not knowing the state of the stack and unable to
5353 make any assumption about which pages have been probed. */
5354
5355 static void
5356 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5357 poly_int64 poly_size,
5358 bool frame_related_p,
5359 bool final_adjustment_p)
5360 {
5361 HOST_WIDE_INT guard_size
5362 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5363 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5364 /* When doing the final adjustment for the outgoing argument size we can't
5365 assume that LR was saved at position 0. So subtract its offset from the
5366 ABI safe buffer so that we don't accidentally allow an adjustment that
5367 would result in an allocation larger than the ABI buffer without
5368 probing. */
5369 HOST_WIDE_INT min_probe_threshold
5370 = final_adjustment_p
5371 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5372 : guard_size - guard_used_by_caller;
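  /* For example, with the default 64kB guard and the 1kB caller-reserved
     buffer described in the frame-layout comment further down, the non-final
     case gives a threshold of 64kB - 1kB == 63kB, matching the "up to 63KB
     without probing" figure quoted there.  */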
5373
5374 poly_int64 frame_size = cfun->machine->frame.frame_size;
5375
5376 /* We should always have a positive probe threshold. */
5377 gcc_assert (min_probe_threshold > 0);
5378
5379 if (flag_stack_clash_protection && !final_adjustment_p)
5380 {
5381 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5382 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5383
5384 if (known_eq (frame_size, 0))
5385 {
5386 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5387 }
5388 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5389 && known_lt (final_adjust, guard_used_by_caller))
5390 {
5391 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5392 }
5393 }
5394
5395 /* If SIZE is not large enough to require probing, just adjust the stack and
5396 exit. */
5397 if (known_lt (poly_size, min_probe_threshold)
5398 || !flag_stack_clash_protection)
5399 {
5400 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5401 return;
5402 }
5403
5404 HOST_WIDE_INT size;
5405 /* Handle the SVE non-constant case first. */
5406 if (!poly_size.is_constant (&size))
5407 {
5408 if (dump_file)
5409 {
5410 fprintf (dump_file, "Stack clash SVE prologue: ");
5411 print_dec (poly_size, dump_file);
5412 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5413 }
5414
5415 /* First calculate the number of bytes we're actually spilling. */
5416 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5417 poly_size, temp1, temp2, false, true);
5418
5419 rtx_insn *insn = get_last_insn ();
5420
5421 if (frame_related_p)
5422 {
5423 /* This is done to provide unwinding information for the stack
5424 adjustments we're about to do; however, to prevent the optimizers
5425 from removing the R11 move and leaving the CFA note (which would be
5426 very wrong) we tie the old and new stack pointer together.
5427 The tie will expand to nothing but the optimizers will not touch
5428 the instruction. */
5429 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5430 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5431 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5432
5433 /* We want the CFA independent of the stack pointer for the
5434 duration of the loop. */
5435 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5436 RTX_FRAME_RELATED_P (insn) = 1;
5437 }
5438
5439 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5440 rtx guard_const = gen_int_mode (guard_size, Pmode);
5441
5442 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5443 stack_pointer_rtx, temp1,
5444 probe_const, guard_const));
5445
5446 /* Now reset the CFA register if needed. */
5447 if (frame_related_p)
5448 {
5449 add_reg_note (insn, REG_CFA_DEF_CFA,
5450 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5451 gen_int_mode (poly_size, Pmode)));
5452 RTX_FRAME_RELATED_P (insn) = 1;
5453 }
5454
5455 return;
5456 }
5457
5458 if (dump_file)
5459 fprintf (dump_file,
5460 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5461 " bytes, probing will be required.\n", size);
5462
5463 /* Round size down to a multiple of guard_size, and calculate the
5464 residual as the difference between the original size and the rounded
5465 size. */
5466 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5467 HOST_WIDE_INT residual = size - rounded_size;
5468
5469 /* We can handle a small number of allocations/probes inline. Otherwise
5470 punt to a loop. */
5471 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5472 {
5473 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5474 {
5475 aarch64_sub_sp (NULL, temp2, guard_size, true);
5476 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5477 guard_used_by_caller));
5478 emit_insn (gen_blockage ());
5479 }
5480 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5481 }
5482 else
5483 {
5484 /* Compute the ending address. */
5485 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5486 temp1, NULL, false, true);
5487 rtx_insn *insn = get_last_insn ();
5488
5489 /* For the initial allocation, we don't have a frame pointer
5490 set up, so we always need CFI notes. If we're doing the
5491 final allocation, then we may have a frame pointer, in which
5492 case it is the CFA, otherwise we need CFI notes.
5493
5494 We can determine which allocation we are doing by looking at
5495 the value of FRAME_RELATED_P since the final allocations are not
5496 frame related. */
5497 if (frame_related_p)
5498 {
5499 /* We want the CFA independent of the stack pointer for the
5500 duration of the loop. */
5501 add_reg_note (insn, REG_CFA_DEF_CFA,
5502 plus_constant (Pmode, temp1, rounded_size));
5503 RTX_FRAME_RELATED_P (insn) = 1;
5504 }
5505
5506 /* This allocates and probes the stack. Note that this re-uses some of
5507 the existing Ada stack protection code. However, we are guaranteed not
5508 to enter the non-loop or residual branches of that code.
5509
5510 The non-loop part won't be entered because if our allocation amount
5511 doesn't require a loop, the case above would handle it.
5512
5513 The residual amount won't be entered because TEMP1 is a multiple of
5514 the allocation size. The residual will always be 0. As such, the only
5515 part we are actually using from that code is the loop setup. The
5516 actual probing is done in aarch64_output_probe_stack_range. */
5517 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5518 stack_pointer_rtx, temp1));
5519
5520 /* Now reset the CFA register if needed. */
5521 if (frame_related_p)
5522 {
5523 add_reg_note (insn, REG_CFA_DEF_CFA,
5524 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5525 RTX_FRAME_RELATED_P (insn) = 1;
5526 }
5527
5528 emit_insn (gen_blockage ());
5529 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5530 }
5531
5532 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5533 be probed. This maintains the requirement that each page is probed at
5534 least once. For initial probing we probe only if the allocation is
5535 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5536 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5537 GUARD_SIZE. This guarantees that for any allocation large enough to
5538 trigger a probe here we emit at least one, and that for any allocation
5539 too small for this code to emit anything, the page will already have
5540 been probed by the save of FP/LR, either by this function or by one of
5541 its callees. If we don't have any callees then we won't have more stack
5542 adjustments and so are still safe. */
5543 if (residual)
5544 {
5545 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5546 /* If we're doing final adjustments, and we've done any full page
5547 allocations then any residual needs to be probed. */
5548 if (final_adjustment_p && rounded_size != 0)
5549 min_probe_threshold = 0;
5550 /* If doing a small final adjustment, we always probe at offset 0.
5551 This is done to avoid issues when LR is not at position 0 or when
5552 the final adjustment is smaller than the probing offset. */
5553 else if (final_adjustment_p && rounded_size == 0)
5554 residual_probe_offset = 0;
5555
5556 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5557 if (residual >= min_probe_threshold)
5558 {
5559 if (dump_file)
5560 fprintf (dump_file,
5561 "Stack clash AArch64 prologue residuals: "
5562 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5563 "\n", residual);
5564
5565 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5566 residual_probe_offset));
5567 emit_insn (gen_blockage ());
5568 }
5569 }
5570 }
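/* A rough sketch of what the inline unrolled case above might generate,
   assuming the default 64kB guard and 1kB caller buffer, for a constant
   allocation of 2 * 64kB + 512 bytes (illustrative only, not captured
   compiler output):

     sub   sp, sp, #65536
     str   xzr, [sp, 1024]
     sub   sp, sp, #65536
     str   xzr, [sp, 1024]
     sub   sp, sp, #512

   with the final 512-byte residual left unprobed because it is below
   MIN_PROBE_THRESHOLD.  */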
5571
5572 /* Return 1 if the register is used by the epilogue. We need to say the
5573 return register is used, but only after epilogue generation is complete.
5574 Note that in the case of sibcalls, the values "used by the epilogue" are
5575 considered live at the start of the called function.
5576
5577 For SIMD functions we need to return 1 for FP registers that are saved and
5578 restored by a function but are not zero in call_used_regs. If we do not do
5579 this, optimizations may remove the restore of the register. */
5580
5581 int
5582 aarch64_epilogue_uses (int regno)
5583 {
5584 if (epilogue_completed)
5585 {
5586 if (regno == LR_REGNUM)
5587 return 1;
5588 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5589 return 1;
5590 }
5591 return 0;
5592 }
5593
5594 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5595 is saved at BASE + OFFSET. */
5596
5597 static void
5598 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5599 rtx base, poly_int64 offset)
5600 {
5601 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5602 add_reg_note (insn, REG_CFA_EXPRESSION,
5603 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5604 }
5605
5606 /* AArch64 stack frames generated by this compiler look like:
5607
5608 +-------------------------------+
5609 | |
5610 | incoming stack arguments |
5611 | |
5612 +-------------------------------+
5613 | | <-- incoming stack pointer (aligned)
5614 | callee-allocated save area |
5615 | for register varargs |
5616 | |
5617 +-------------------------------+
5618 | local variables | <-- frame_pointer_rtx
5619 | |
5620 +-------------------------------+
5621 | padding | \
5622 +-------------------------------+ |
5623 | callee-saved registers | | frame.saved_regs_size
5624 +-------------------------------+ |
5625 | LR' | |
5626 +-------------------------------+ |
5627 | FP' | / <- hard_frame_pointer_rtx (aligned)
5628 +-------------------------------+
5629 | dynamic allocation |
5630 +-------------------------------+
5631 | padding |
5632 +-------------------------------+
5633 | outgoing stack arguments | <-- arg_pointer
5634 | |
5635 +-------------------------------+
5636 | | <-- stack_pointer_rtx (aligned)
5637
5638 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5639 but leave frame_pointer_rtx and hard_frame_pointer_rtx
5640 unchanged.
5641
5642 By default for stack-clash we assume the guard is at least 64KB, but this
5643 value is configurable to either 4KB or 64KB. We also force the guard size to
5644 be the same as the probing interval and both values are kept in sync.
5645
5646 With those assumptions the callee can allocate up to 63KB (or 3KB depending
5647 on the guard size) of stack space without probing.
5648
5649 When probing is needed, we emit a probe at the start of the prologue
5650 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5651
5652 We have to track how much space has been allocated; the only stores
5653 to the stack that we track as implicit probes are the FP/LR stores.
5654
5655 For outgoing arguments we probe if the size is larger than 1KB, such that
5656 the ABI specified buffer is maintained for the next callee.
5657
5658 The following registers are reserved during frame layout and should not be
5659 used for any other purpose:
5660
5661 - r11: Used by stack clash protection when SVE is enabled.
5662 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5663 - r14 and r15: Used for speculation tracking.
5664 - r16(IP0), r17(IP1): Used by indirect tailcalls.
5665 - r30(LR), r29(FP): Used by standard frame layout.
5666
5667 These registers must be avoided in frame layout related code unless the
5668 explicit intention is to interact with one of the features listed above. */
5669
5670 /* Generate the prologue instructions for entry into a function.
5671 Establish the stack frame by decreasing the stack pointer with a
5672 properly calculated size and, if necessary, create a frame record
5673 filled with the values of LR and previous frame pointer. The
5674 current FP is also set up if it is in use. */
5675
5676 void
5677 aarch64_expand_prologue (void)
5678 {
5679 poly_int64 frame_size = cfun->machine->frame.frame_size;
5680 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5681 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5682 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5683 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5684 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5685 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5686 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5687 rtx_insn *insn;
5688
5689 /* Sign return address for functions. */
5690 if (aarch64_return_address_signing_enabled ())
5691 {
5692 switch (aarch64_ra_sign_key)
5693 {
5694 case AARCH64_KEY_A:
5695 insn = emit_insn (gen_paciasp ());
5696 break;
5697 case AARCH64_KEY_B:
5698 insn = emit_insn (gen_pacibsp ());
5699 break;
5700 default:
5701 gcc_unreachable ();
5702 }
5703 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5704 RTX_FRAME_RELATED_P (insn) = 1;
5705 }
5706
5707 if (flag_stack_usage_info)
5708 current_function_static_stack_size = constant_lower_bound (frame_size);
5709
5710 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5711 {
5712 if (crtl->is_leaf && !cfun->calls_alloca)
5713 {
5714 if (maybe_gt (frame_size, PROBE_INTERVAL)
5715 && maybe_gt (frame_size, get_stack_check_protect ()))
5716 aarch64_emit_probe_stack_range (get_stack_check_protect (),
5717 (frame_size
5718 - get_stack_check_protect ()));
5719 }
5720 else if (maybe_gt (frame_size, 0))
5721 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5722 }
5723
5724 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5725 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5726
5727 /* In theory we should never have both an initial adjustment
5728 and a callee save adjustment. Verify that is the case since the
5729 code below does not handle it for -fstack-clash-protection. */
5730 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5731
5732 /* Will only probe if the initial adjustment is larger than the guard
5733 less the amount of the guard reserved for use by the caller's
5734 outgoing args. */
5735 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5736 true, false);
5737
5738 if (callee_adjust != 0)
5739 aarch64_push_regs (reg1, reg2, callee_adjust);
5740
5741 if (emit_frame_chain)
5742 {
5743 poly_int64 reg_offset = callee_adjust;
5744 if (callee_adjust == 0)
5745 {
5746 reg1 = R29_REGNUM;
5747 reg2 = R30_REGNUM;
5748 reg_offset = callee_offset;
5749 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5750 }
5751 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5752 stack_pointer_rtx, callee_offset,
5753 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5754 if (frame_pointer_needed && !frame_size.is_constant ())
5755 {
5756 /* Variable-sized frames need to describe the save slot
5757 address using DW_CFA_expression rather than DW_CFA_offset.
5758 This means that, without taking further action, the
5759 locations of the registers that we've already saved would
5760 remain based on the stack pointer even after we redefine
5761 the CFA based on the frame pointer. We therefore need new
5762 DW_CFA_expressions to re-express the save slots with addresses
5763 based on the frame pointer. */
5764 rtx_insn *insn = get_last_insn ();
5765 gcc_assert (RTX_FRAME_RELATED_P (insn));
5766
5767 /* Add an explicit CFA definition if this was previously
5768 implicit. */
5769 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5770 {
5771 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5772 callee_offset);
5773 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5774 gen_rtx_SET (hard_frame_pointer_rtx, src));
5775 }
5776
5777 /* Change the save slot expressions for the registers that
5778 we've already saved. */
5779 reg_offset -= callee_offset;
5780 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5781 reg_offset + UNITS_PER_WORD);
5782 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5783 reg_offset);
5784 }
5785 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5786 }
5787
5788 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5789 callee_adjust != 0 || emit_frame_chain);
5790 if (aarch64_simd_decl_p (cfun->decl))
5791 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5792 callee_adjust != 0 || emit_frame_chain);
5793 else
5794 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5795 callee_adjust != 0 || emit_frame_chain);
5796
5797 /* We may need to probe the final adjustment if it is larger than the guard
5798 that is assumed by the callee. */
5799 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5800 !frame_pointer_needed, true);
5801 }
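/* A hedged sketch of what the code above typically produces for a small
   constant-sized frame that needs a frame chain and one extra callee-save,
   with no stack-clash probing required:

     stp   x29, x30, [sp, -32]!   callee_adjust push of FP/LR
     mov   x29, sp                establish the frame chain
     str   x19, [sp, 16]          remaining callee-saves
     sub   sp, sp, #16            final_adjust for outgoing arguments  */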
5802
5803 /* Return TRUE if we can use a simple_return insn.
5804
5805 This function checks whether the callee-saved stack is empty, which
5806 means no restore actions are needed. The pro_and_epilogue pass will use
5807 this to check whether the shrink-wrapping optimization is feasible. */
5808
5809 bool
5810 aarch64_use_return_insn_p (void)
5811 {
5812 if (!reload_completed)
5813 return false;
5814
5815 if (crtl->profile)
5816 return false;
5817
5818 return known_eq (cfun->machine->frame.frame_size, 0);
5819 }
5820
5821 /* Return false for non-leaf SIMD functions in order to avoid
5822 shrink-wrapping them. Doing this will lose the necessary
5823 save/restore of FP registers. */
5824
5825 bool
5826 aarch64_use_simple_return_insn_p (void)
5827 {
5828 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5829 return false;
5830
5831 return true;
5832 }
5833
5834 /* Generate the epilogue instructions for returning from a function.
5835 This is almost exactly the reverse of the prolog sequence, except
5836 that we need to insert barriers to avoid scheduling loads that read
5837 from a deallocated stack, and we optimize the unwind records by
5838 emitting them all together if possible. */
5839 void
5840 aarch64_expand_epilogue (bool for_sibcall)
5841 {
5842 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5843 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5844 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5845 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5846 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5847 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5848 rtx cfi_ops = NULL;
5849 rtx_insn *insn;
5850 /* A stack clash protection prologue may not have left EP0_REGNUM or
5851 EP1_REGNUM in a usable state. The same is true for allocations
5852 with an SVE component, since we then need both temporary registers
5853 for each allocation. For stack clash we are in a usable state if
5854 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
5855 HOST_WIDE_INT guard_size
5856 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5857 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5858
5859 /* We can re-use the registers when the allocation amount is smaller than
5860 guard_size - guard_used_by_caller because we won't be doing any probes
5861 then. In such situations the register should remain live with the correct
5862 value. */
5863 bool can_inherit_p = (initial_adjust.is_constant ()
5864 && final_adjust.is_constant ())
5865 && (!flag_stack_clash_protection
5866 || known_lt (initial_adjust,
5867 guard_size - guard_used_by_caller));
5868
5869 /* We need a memory barrier to prevent reads from the deallocated stack. */
5870 bool need_barrier_p
5871 = maybe_ne (get_frame_size ()
5872 + cfun->machine->frame.saved_varargs_size, 0);
5873
5874 /* Emit a barrier to prevent loads from a deallocated stack. */
5875 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5876 || cfun->calls_alloca
5877 || crtl->calls_eh_return)
5878 {
5879 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5880 need_barrier_p = false;
5881 }
5882
5883 /* Restore the stack pointer from the frame pointer if it may not
5884 be the same as the stack pointer. */
5885 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5886 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5887 if (frame_pointer_needed
5888 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5889 /* If writeback is used when restoring callee-saves, the CFA
5890 is restored on the instruction doing the writeback. */
5891 aarch64_add_offset (Pmode, stack_pointer_rtx,
5892 hard_frame_pointer_rtx, -callee_offset,
5893 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5894 else
5895 /* The case where we need to re-use the register here is very rare, so
5896 avoid the complicated condition and just always emit a move if the
5897 immediate doesn't fit. */
5898 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5899
5900 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5901 callee_adjust != 0, &cfi_ops);
5902 if (aarch64_simd_decl_p (cfun->decl))
5903 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5904 callee_adjust != 0, &cfi_ops);
5905 else
5906 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5907 callee_adjust != 0, &cfi_ops);
5908
5909 if (need_barrier_p)
5910 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5911
5912 if (callee_adjust != 0)
5913 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5914
5915 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5916 {
5917 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5918 insn = get_last_insn ();
5919 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5920 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5921 RTX_FRAME_RELATED_P (insn) = 1;
5922 cfi_ops = NULL;
5923 }
5924
5925 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
5926 restrict the emit_move optimization to leaf functions. */
5927 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5928 (!can_inherit_p || !crtl->is_leaf
5929 || df_regs_ever_live_p (EP0_REGNUM)));
5930
5931 if (cfi_ops)
5932 {
5933 /* Emit delayed restores and reset the CFA to be SP. */
5934 insn = get_last_insn ();
5935 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5936 REG_NOTES (insn) = cfi_ops;
5937 RTX_FRAME_RELATED_P (insn) = 1;
5938 }
5939
5940 /* We prefer to emit the combined return/authenticate instruction RETAA,
5941 however there are three cases in which we must instead emit an explicit
5942 authentication instruction.
5943
5944 1) Sibcalls don't return in a normal way, so if we're about to call one
5945 we must authenticate.
5946
5947 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5948 generating code for !TARGET_ARMV8_3 we can't use it and must
5949 explicitly authenticate.
5950
5951 3) On an eh_return path we make extra stack adjustments to update the
5952 canonical frame address to be the exception handler's CFA. We want
5953 to authenticate using the CFA of the function which calls eh_return.
5954 */
5955 if (aarch64_return_address_signing_enabled ()
5956 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5957 {
5958 switch (aarch64_ra_sign_key)
5959 {
5960 case AARCH64_KEY_A:
5961 insn = emit_insn (gen_autiasp ());
5962 break;
5963 case AARCH64_KEY_B:
5964 insn = emit_insn (gen_autibsp ());
5965 break;
5966 default:
5967 gcc_unreachable ();
5968 }
5969 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5970 RTX_FRAME_RELATED_P (insn) = 1;
5971 }
5972
5973 /* Stack adjustment for exception handler. */
5974 if (crtl->calls_eh_return && !for_sibcall)
5975 {
5976 /* We need to unwind the stack by the offset computed by
5977 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5978 to be SP; letting the CFA move during this adjustment
5979 is just as correct as retaining the CFA from the body
5980 of the function. Therefore, do nothing special. */
5981 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5982 }
5983
5984 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5985 if (!for_sibcall)
5986 emit_jump_insn (ret_rtx);
5987 }
5988
5989 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5990 normally or return to a previous frame after unwinding.
5991
5992 An EH return uses a single shared return sequence. The epilogue is
5993 exactly like a normal epilogue except that it has an extra input
5994 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5995 that must be applied after the frame has been destroyed. An extra label
5996 is inserted before the epilogue which initializes this register to zero,
5997 and this is the entry point for a normal return.
5998
5999 An actual EH return updates the return address, initializes the stack
6000 adjustment and jumps directly into the epilogue (bypassing the zeroing
6001 of the adjustment). Since the return address is typically saved on the
6002 stack when a function makes a call, the saved LR must be updated outside
6003 the epilogue.
6004
6005 This poses problems as the store is generated well before the epilogue,
6006 so the offset of LR is not known yet. Also optimizations will remove the
6007 store as it appears dead, even after the epilogue is generated (as the
6008 base or offset for loading LR is different in many cases).
6009
6010 To avoid these problems this implementation forces the frame pointer
6011 in eh_return functions so that the location of LR is fixed and known early.
6012 It also marks the store volatile, so no optimization is permitted to
6013 remove the store. */
6014 rtx
6015 aarch64_eh_return_handler_rtx (void)
6016 {
6017 rtx tmp = gen_frame_mem (Pmode,
6018 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6019
6020 /* Mark the store volatile, so no optimization is permitted to remove it. */
6021 MEM_VOLATILE_P (tmp) = true;
6022 return tmp;
6023 }
6024
6025 /* Output code to add DELTA to the first argument, and then jump
6026 to FUNCTION. Used for C++ multiple inheritance. */
6027 static void
6028 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6029 HOST_WIDE_INT delta,
6030 HOST_WIDE_INT vcall_offset,
6031 tree function)
6032 {
6033 /* The this pointer is always in x0. Note that this differs from
6034 Arm where the this pointer may be bumped to r1 if r0 is required
6035 to return a pointer to an aggregate. On AArch64 a result value
6036 pointer will be in x8. */
6037 int this_regno = R0_REGNUM;
6038 rtx this_rtx, temp0, temp1, addr, funexp;
6039 rtx_insn *insn;
6040 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6041
6042 if (aarch64_bti_enabled ())
6043 emit_insn (gen_bti_c());
6044
6045 reload_completed = 1;
6046 emit_note (NOTE_INSN_PROLOGUE_END);
6047
6048 this_rtx = gen_rtx_REG (Pmode, this_regno);
6049 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6050 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6051
6052 if (vcall_offset == 0)
6053 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6054 else
6055 {
6056 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6057
6058 addr = this_rtx;
6059 if (delta != 0)
6060 {
6061 if (delta >= -256 && delta < 256)
6062 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6063 plus_constant (Pmode, this_rtx, delta));
6064 else
6065 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6066 temp1, temp0, false);
6067 }
6068
6069 if (Pmode == ptr_mode)
6070 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6071 else
6072 aarch64_emit_move (temp0,
6073 gen_rtx_ZERO_EXTEND (Pmode,
6074 gen_rtx_MEM (ptr_mode, addr)));
6075
6076 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6077 addr = plus_constant (Pmode, temp0, vcall_offset);
6078 else
6079 {
6080 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6081 Pmode);
6082 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6083 }
6084
6085 if (Pmode == ptr_mode)
6086 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6087 else
6088 aarch64_emit_move (temp1,
6089 gen_rtx_SIGN_EXTEND (Pmode,
6090 gen_rtx_MEM (ptr_mode, addr)));
6091
6092 emit_insn (gen_add2_insn (this_rtx, temp1));
6093 }
6094
6095 /* Generate a tail call to the target function. */
6096 if (!TREE_USED (function))
6097 {
6098 assemble_external (function);
6099 TREE_USED (function) = 1;
6100 }
6101 funexp = XEXP (DECL_RTL (function), 0);
6102 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6103 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6104 SIBLING_CALL_P (insn) = 1;
6105
6106 insn = get_insns ();
6107 shorten_branches (insn);
6108
6109 assemble_start_function (thunk, fnname);
6110 final_start_function (insn, file, 1);
6111 final (insn, file, 1);
6112 final_end_function ();
6113 assemble_end_function (thunk, fnname);
6114
6115 /* Stop pretending to be a post-reload pass. */
6116 reload_completed = 0;
6117 }
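/* For the common case of a zero VCALL_OFFSET and a small DELTA, the thunk
   emitted above amounts to no more than (sketch):

     add   x0, x0, #delta
     b     <target function>  */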
6118
6119 static bool
6120 aarch64_tls_referenced_p (rtx x)
6121 {
6122 if (!TARGET_HAVE_TLS)
6123 return false;
6124 subrtx_iterator::array_type array;
6125 FOR_EACH_SUBRTX (iter, array, x, ALL)
6126 {
6127 const_rtx x = *iter;
6128 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6129 return true;
6130 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6131 TLS offsets, not real symbol references. */
6132 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6133 iter.skip_subrtxes ();
6134 }
6135 return false;
6136 }
6137
6138
6139 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6140 a left shift of 0 or 12 bits. */
6141 bool
6142 aarch64_uimm12_shift (HOST_WIDE_INT val)
6143 {
6144 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6145 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6146 );
6147 }
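/* As a worked example of the test above: 0xabc and 0xabc000 are both
   accepted, since their set bits lie entirely within bits [11:0] or
   bits [23:12] respectively, whereas 0xabc0 is rejected because its set
   bits straddle the two fields.  */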
6148
6149 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6150 that can be created with a left shift of 0 or 12. */
6151 static HOST_WIDE_INT
6152 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6153 {
6154 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6155 handle correctly. */
6156 gcc_assert ((val & 0xffffff) == val);
6157
6158 if (((val & 0xfff) << 0) == val)
6159 return val;
6160
6161 return val & (0xfff << 12);
6162 }
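/* For example, for VAL == 0x123456 the low-field test fails, so the
   function returns 0x123000, i.e. VAL with its low 12 bits cleared.  */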
6163
6164 /* Return true if val is an immediate that can be loaded into a
6165 register by a MOVZ instruction. */
6166 static bool
6167 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6168 {
6169 if (GET_MODE_SIZE (mode) > 4)
6170 {
6171 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6172 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6173 return 1;
6174 }
6175 else
6176 {
6177 /* Ignore sign extension. */
6178 val &= (HOST_WIDE_INT) 0xffffffff;
6179 }
6180 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6181 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6182 }
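/* For instance, in DImode both 0x1234 and 0x123400000000 are MOVZ-encodable
   (a single 16-bit chunk at a 16-bit-aligned position), while 0x12340001 is
   not.  In SImode the value is first truncated to 32 bits, so only chunk
   positions 0 and 16 remain.  */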
6183
6184 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6185 64-bit (DImode) integer. */
6186
6187 static unsigned HOST_WIDE_INT
6188 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6189 {
6190 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6191 while (size < 64)
6192 {
6193 val &= (HOST_WIDE_INT_1U << size) - 1;
6194 val |= val << size;
6195 size *= 2;
6196 }
6197 return val;
6198 }
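/* For example, replicating the QImode value 0xf0 gives
   0xf0f0f0f0f0f0f0f0, and replicating the HImode value 0x00ff gives
   0x00ff00ff00ff00ff.  */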
6199
6200 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6201
6202 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6203 {
6204 0x0000000100000001ull,
6205 0x0001000100010001ull,
6206 0x0101010101010101ull,
6207 0x1111111111111111ull,
6208 0x5555555555555555ull,
6209 };
6210
6211
6212 /* Return true if val is a valid bitmask immediate. */
6213
6214 bool
6215 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6216 {
6217 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6218 int bits;
6219
6220 /* Check for a single sequence of one bits and return quickly if so.
6221 The special cases of all ones and all zeroes return false. */
6222 val = aarch64_replicate_bitmask_imm (val_in, mode);
6223 tmp = val + (val & -val);
6224
6225 if (tmp == (tmp & -tmp))
6226 return (val + 1) > 1;
6227
6228 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6229 if (mode == SImode)
6230 val = (val << 32) | (val & 0xffffffff);
6231
6232 /* Invert if the immediate doesn't start with a zero bit - this means we
6233 only need to search for sequences of one bits. */
6234 if (val & 1)
6235 val = ~val;
6236
6237 /* Find the first set bit and set tmp to val with the first sequence of one
6238 bits removed. Return success if there is a single sequence of ones. */
6239 first_one = val & -val;
6240 tmp = val & (val + first_one);
6241
6242 if (tmp == 0)
6243 return true;
6244
6245 /* Find the next set bit and compute the difference in bit position. */
6246 next_one = tmp & -tmp;
6247 bits = clz_hwi (first_one) - clz_hwi (next_one);
6248 mask = val ^ tmp;
6249
6250 /* Check the bit position difference is a power of 2, and that the first
6251 sequence of one bits fits within 'bits' bits. */
6252 if ((mask >> bits) != 0 || bits != (bits & -bits))
6253 return false;
6254
6255 /* Check the sequence of one bits is repeated 64/bits times. */
6256 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6257 }
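/* Worked example for the check above: 0x00ff00ff00ff00ff is accepted.  Its
   runs of ones are 8 bits wide and repeat every 16 bits, so after the
   inversion and the removal of the first run we get BITS == 16 and
   MASK == 0xff00, and MASK * 0x0001000100010001 reproduces the value being
   tested (after the inversion step).  By contrast 0x00ff00ff00ff00fe is
   rejected because its runs of ones do not all have the same width.  */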
6258
6259 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6260 Assumed precondition: VAL_IN is not zero. */
6261
6262 unsigned HOST_WIDE_INT
6263 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6264 {
6265 int lowest_bit_set = ctz_hwi (val_in);
6266 int highest_bit_set = floor_log2 (val_in);
6267 gcc_assert (val_in != 0);
6268
6269 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6270 (HOST_WIDE_INT_1U << lowest_bit_set));
6271 }
6272
6273 /* Create a constant in which the bits outside the range from the lowest
6274 set bit to the highest set bit of VAL_IN are set to 1. */
6275
6276 unsigned HOST_WIDE_INT
6277 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6278 {
6279 return val_in | ~aarch64_and_split_imm1 (val_in);
6280 }
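/* Worked example for the two helpers above: VAL_IN == 0x00f00800 is not
   itself a bitmask immediate.  aarch64_and_split_imm1 returns 0x00fff800
   (a solid run of ones from bit 11 up to bit 23) and aarch64_and_split_imm2
   returns 0xfffffffffff00fff; both of these are valid bitmask immediates and
   their AND reproduces VAL_IN, which is the property aarch64_and_bitmask_imm
   below relies on.  */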
6281
6282 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6283
6284 bool
6285 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6286 {
6287 scalar_int_mode int_mode;
6288 if (!is_a <scalar_int_mode> (mode, &int_mode))
6289 return false;
6290
6291 if (aarch64_bitmask_imm (val_in, int_mode))
6292 return false;
6293
6294 if (aarch64_move_imm (val_in, int_mode))
6295 return false;
6296
6297 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6298
6299 return aarch64_bitmask_imm (imm2, int_mode);
6300 }
6301
6302 /* Return true if val is an immediate that can be loaded into a
6303 register in a single instruction. */
6304 bool
6305 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6306 {
6307 scalar_int_mode int_mode;
6308 if (!is_a <scalar_int_mode> (mode, &int_mode))
6309 return false;
6310
6311 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6312 return 1;
6313 return aarch64_bitmask_imm (val, int_mode);
6314 }
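/* For instance, 0x0000ffff00000000 can be loaded with a single MOVZ,
   0xffff0000ffffffff with a single MOVN (its complement is MOVZ-encodable),
   and 0x00ffffffffffff00 (one run of 48 ones) with a bitmask MOV, i.e. ORR
   with XZR.  */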
6315
6316 static bool
6317 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6318 {
6319 rtx base, offset;
6320
6321 if (GET_CODE (x) == HIGH)
6322 return true;
6323
6324 /* There's no way to calculate VL-based values using relocations. */
6325 subrtx_iterator::array_type array;
6326 FOR_EACH_SUBRTX (iter, array, x, ALL)
6327 if (GET_CODE (*iter) == CONST_POLY_INT)
6328 return true;
6329
6330 split_const (x, &base, &offset);
6331 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6332 {
6333 if (aarch64_classify_symbol (base, INTVAL (offset))
6334 != SYMBOL_FORCE_TO_MEM)
6335 return true;
6336 else
6337 /* Avoid generating a 64-bit relocation in ILP32; leave it
6338 to aarch64_expand_mov_immediate to handle it properly. */
6339 return mode != ptr_mode;
6340 }
6341
6342 return aarch64_tls_referenced_p (x);
6343 }
6344
6345 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6346 The expansion for a table switch is quite expensive due to the number
6347 of instructions, the table lookup and the hard-to-predict indirect jump.
6348 When optimizing for speed, and -O3 enabled, use the per-core tuning if
6349 set, otherwise use tables for > 16 cases as a tradeoff between size and
6350 performance. When optimizing for size, use the default setting. */
6351
6352 static unsigned int
6353 aarch64_case_values_threshold (void)
6354 {
6355 /* Use the specified limit for the number of cases before using jump
6356 tables at higher optimization levels. */
6357 if (optimize > 2
6358 && selected_cpu->tune->max_case_values != 0)
6359 return selected_cpu->tune->max_case_values;
6360 else
6361 return optimize_size ? default_case_values_threshold () : 17;
6362 }
6363
6364 /* Return true if register REGNO is a valid index register.
6365 STRICT_P is true if REG_OK_STRICT is in effect. */
6366
6367 bool
6368 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6369 {
6370 if (!HARD_REGISTER_NUM_P (regno))
6371 {
6372 if (!strict_p)
6373 return true;
6374
6375 if (!reg_renumber)
6376 return false;
6377
6378 regno = reg_renumber[regno];
6379 }
6380 return GP_REGNUM_P (regno);
6381 }
6382
6383 /* Return true if register REGNO is a valid base register.
6384 STRICT_P is true if REG_OK_STRICT is in effect. */
6385
6386 bool
6387 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6388 {
6389 if (!HARD_REGISTER_NUM_P (regno))
6390 {
6391 if (!strict_p)
6392 return true;
6393
6394 if (!reg_renumber)
6395 return false;
6396
6397 regno = reg_renumber[regno];
6398 }
6399
6400 /* The fake registers will be eliminated to either the stack or
6401 hard frame pointer, both of which are usually valid base registers.
6402 Reload deals with the cases where the eliminated form isn't valid. */
6403 return (GP_REGNUM_P (regno)
6404 || regno == SP_REGNUM
6405 || regno == FRAME_POINTER_REGNUM
6406 || regno == ARG_POINTER_REGNUM);
6407 }
6408
6409 /* Return true if X is a valid base register.
6410 STRICT_P is true if REG_OK_STRICT is in effect. */
6411
6412 static bool
6413 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6414 {
6415 if (!strict_p
6416 && GET_CODE (x) == SUBREG
6417 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6418 x = SUBREG_REG (x);
6419
6420 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6421 }
6422
6423 /* Return true if address offset is a valid index. If it is, fill in INFO
6424 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6425
6426 static bool
6427 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6428 machine_mode mode, bool strict_p)
6429 {
6430 enum aarch64_address_type type;
6431 rtx index;
6432 int shift;
6433
6434 /* (reg:P) */
6435 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6436 && GET_MODE (x) == Pmode)
6437 {
6438 type = ADDRESS_REG_REG;
6439 index = x;
6440 shift = 0;
6441 }
6442 /* (sign_extend:DI (reg:SI)) */
6443 else if ((GET_CODE (x) == SIGN_EXTEND
6444 || GET_CODE (x) == ZERO_EXTEND)
6445 && GET_MODE (x) == DImode
6446 && GET_MODE (XEXP (x, 0)) == SImode)
6447 {
6448 type = (GET_CODE (x) == SIGN_EXTEND)
6449 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6450 index = XEXP (x, 0);
6451 shift = 0;
6452 }
6453 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6454 else if (GET_CODE (x) == MULT
6455 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6456 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6457 && GET_MODE (XEXP (x, 0)) == DImode
6458 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6459 && CONST_INT_P (XEXP (x, 1)))
6460 {
6461 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6462 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6463 index = XEXP (XEXP (x, 0), 0);
6464 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6465 }
6466 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6467 else if (GET_CODE (x) == ASHIFT
6468 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6469 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6470 && GET_MODE (XEXP (x, 0)) == DImode
6471 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6472 && CONST_INT_P (XEXP (x, 1)))
6473 {
6474 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6475 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6476 index = XEXP (XEXP (x, 0), 0);
6477 shift = INTVAL (XEXP (x, 1));
6478 }
6479 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6480 else if ((GET_CODE (x) == SIGN_EXTRACT
6481 || GET_CODE (x) == ZERO_EXTRACT)
6482 && GET_MODE (x) == DImode
6483 && GET_CODE (XEXP (x, 0)) == MULT
6484 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6485 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6486 {
6487 type = (GET_CODE (x) == SIGN_EXTRACT)
6488 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6489 index = XEXP (XEXP (x, 0), 0);
6490 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6491 if (INTVAL (XEXP (x, 1)) != 32 + shift
6492 || INTVAL (XEXP (x, 2)) != 0)
6493 shift = -1;
6494 }
6495 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6496 (const_int 0xffffffff<<shift)) */
6497 else if (GET_CODE (x) == AND
6498 && GET_MODE (x) == DImode
6499 && GET_CODE (XEXP (x, 0)) == MULT
6500 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6501 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6502 && CONST_INT_P (XEXP (x, 1)))
6503 {
6504 type = ADDRESS_REG_UXTW;
6505 index = XEXP (XEXP (x, 0), 0);
6506 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6507 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6508 shift = -1;
6509 }
6510 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6511 else if ((GET_CODE (x) == SIGN_EXTRACT
6512 || GET_CODE (x) == ZERO_EXTRACT)
6513 && GET_MODE (x) == DImode
6514 && GET_CODE (XEXP (x, 0)) == ASHIFT
6515 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6516 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6517 {
6518 type = (GET_CODE (x) == SIGN_EXTRACT)
6519 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6520 index = XEXP (XEXP (x, 0), 0);
6521 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6522 if (INTVAL (XEXP (x, 1)) != 32 + shift
6523 || INTVAL (XEXP (x, 2)) != 0)
6524 shift = -1;
6525 }
6526 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6527 (const_int 0xffffffff<<shift)) */
6528 else if (GET_CODE (x) == AND
6529 && GET_MODE (x) == DImode
6530 && GET_CODE (XEXP (x, 0)) == ASHIFT
6531 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6532 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6533 && CONST_INT_P (XEXP (x, 1)))
6534 {
6535 type = ADDRESS_REG_UXTW;
6536 index = XEXP (XEXP (x, 0), 0);
6537 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6538 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6539 shift = -1;
6540 }
6541 /* (mult:P (reg:P) (const_int scale)) */
6542 else if (GET_CODE (x) == MULT
6543 && GET_MODE (x) == Pmode
6544 && GET_MODE (XEXP (x, 0)) == Pmode
6545 && CONST_INT_P (XEXP (x, 1)))
6546 {
6547 type = ADDRESS_REG_REG;
6548 index = XEXP (x, 0);
6549 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6550 }
6551 /* (ashift:P (reg:P) (const_int shift)) */
6552 else if (GET_CODE (x) == ASHIFT
6553 && GET_MODE (x) == Pmode
6554 && GET_MODE (XEXP (x, 0)) == Pmode
6555 && CONST_INT_P (XEXP (x, 1)))
6556 {
6557 type = ADDRESS_REG_REG;
6558 index = XEXP (x, 0);
6559 shift = INTVAL (XEXP (x, 1));
6560 }
6561 else
6562 return false;
6563
6564 if (!strict_p
6565 && GET_CODE (index) == SUBREG
6566 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6567 index = SUBREG_REG (index);
6568
6569 if (aarch64_sve_data_mode_p (mode))
6570 {
6571 if (type != ADDRESS_REG_REG
6572 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6573 return false;
6574 }
6575 else
6576 {
6577 if (shift != 0
6578 && !(IN_RANGE (shift, 1, 3)
6579 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6580 return false;
6581 }
6582
6583 if (REG_P (index)
6584 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6585 {
6586 info->type = type;
6587 info->offset = index;
6588 info->shift = shift;
6589 return true;
6590 }
6591
6592 return false;
6593 }
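/* The classifications above correspond to the following assembly addressing
   forms (#S standing for the shift recorded in INFO->shift):
     ADDRESS_REG_REG    [Xn, Xm{, LSL #S}]
     ADDRESS_REG_SXTW   [Xn, Wm, SXTW {#S}]
     ADDRESS_REG_UXTW   [Xn, Wm, UXTW {#S}]  */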
6594
6595 /* Return true if MODE is one of the modes for which we
6596 support LDP/STP operations. */
6597
6598 static bool
6599 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6600 {
6601 return mode == SImode || mode == DImode
6602 || mode == SFmode || mode == DFmode
6603 || (aarch64_vector_mode_supported_p (mode)
6604 && (known_eq (GET_MODE_SIZE (mode), 8)
6605 || (known_eq (GET_MODE_SIZE (mode), 16)
6606 && (aarch64_tune_params.extra_tuning_flags
6607 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6608 }
6609
6610 /* Return true if REGNO is a virtual pointer register, or an eliminable
6611 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6612 include stack_pointer or hard_frame_pointer. */
6613 static bool
6614 virt_or_elim_regno_p (unsigned regno)
6615 {
6616 return ((regno >= FIRST_VIRTUAL_REGISTER
6617 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6618 || regno == FRAME_POINTER_REGNUM
6619 || regno == ARG_POINTER_REGNUM);
6620 }
6621
6622 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6623 If it is, fill in INFO appropriately. STRICT_P is true if
6624 REG_OK_STRICT is in effect. */
6625
6626 bool
6627 aarch64_classify_address (struct aarch64_address_info *info,
6628 rtx x, machine_mode mode, bool strict_p,
6629 aarch64_addr_query_type type)
6630 {
6631 enum rtx_code code = GET_CODE (x);
6632 rtx op0, op1;
6633 poly_int64 offset;
6634
6635 HOST_WIDE_INT const_size;
6636
6637 /* On BE, we use load/store pair for all large int mode load/stores.
6638 TI/TFmode may also use a load/store pair. */
6639 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6640 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6641 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6642 || type == ADDR_QUERY_LDP_STP_N
6643 || mode == TImode
6644 || mode == TFmode
6645 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6646
6647 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6648 corresponds to the actual size of the memory being loaded/stored and the
6649 mode used for the address check is half of that. */
6650 if (type == ADDR_QUERY_LDP_STP_N
6651 && known_eq (GET_MODE_SIZE (mode), 16))
6652 mode = DFmode;
6653
6654 bool allow_reg_index_p = (!load_store_pair_p
6655 && (known_lt (GET_MODE_SIZE (mode), 16)
6656 || vec_flags == VEC_ADVSIMD
6657 || vec_flags & VEC_SVE_DATA));
6658
6659 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6660 [Rn, #offset, MUL VL]. */
6661 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6662 && (code != REG && code != PLUS))
6663 return false;
6664
6665 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6666 REG addressing. */
6667 if (advsimd_struct_p
6668 && !BYTES_BIG_ENDIAN
6669 && (code != POST_INC && code != REG))
6670 return false;
6671
6672 gcc_checking_assert (GET_MODE (x) == VOIDmode
6673 || SCALAR_INT_MODE_P (GET_MODE (x)));
6674
6675 switch (code)
6676 {
6677 case REG:
6678 case SUBREG:
6679 info->type = ADDRESS_REG_IMM;
6680 info->base = x;
6681 info->offset = const0_rtx;
6682 info->const_offset = 0;
6683 return aarch64_base_register_rtx_p (x, strict_p);
6684
6685 case PLUS:
6686 op0 = XEXP (x, 0);
6687 op1 = XEXP (x, 1);
6688
6689 if (! strict_p
6690 && REG_P (op0)
6691 && virt_or_elim_regno_p (REGNO (op0))
6692 && poly_int_rtx_p (op1, &offset))
6693 {
6694 info->type = ADDRESS_REG_IMM;
6695 info->base = op0;
6696 info->offset = op1;
6697 info->const_offset = offset;
6698
6699 return true;
6700 }
6701
6702 if (maybe_ne (GET_MODE_SIZE (mode), 0)
6703 && aarch64_base_register_rtx_p (op0, strict_p)
6704 && poly_int_rtx_p (op1, &offset))
6705 {
6706 info->type = ADDRESS_REG_IMM;
6707 info->base = op0;
6708 info->offset = op1;
6709 info->const_offset = offset;
6710
6711 /* TImode and TFmode values are allowed in both pairs of X
6712 registers and individual Q registers. The available
6713 address modes are:
6714 X,X: 7-bit signed scaled offset
6715 Q: 9-bit signed offset
6716 We conservatively require an offset representable in either mode.
6717 When performing the check for pairs of X registers i.e. LDP/STP
6718 pass down DImode since that is the natural size of the LDP/STP
6719 instruction memory accesses. */
6720 if (mode == TImode || mode == TFmode)
6721 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6722 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6723 || offset_12bit_unsigned_scaled_p (mode, offset)));
6724
6725 /* A 7-bit offset check because OImode will emit an ldp/stp
6726 instruction (only big endian will get here).
6727 For ldp/stp instructions, the offset is scaled for the size of a
6728 single element of the pair. */
6729 if (mode == OImode)
6730 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6731
6732 /* Three 9/12-bit offset checks because CImode will emit three
6733 ldr/str instructions (only big endian will get here). */
6734 if (mode == CImode)
6735 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6736 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6737 offset + 32)
6738 || offset_12bit_unsigned_scaled_p (V16QImode,
6739 offset + 32)));
6740
6741 /* Two 7-bit offset checks because XImode will emit two ldp/stp
6742 instructions (only big endian will get here). */
6743 if (mode == XImode)
6744 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6745 && aarch64_offset_7bit_signed_scaled_p (TImode,
6746 offset + 32));
6747
6748 /* Make "m" use the LD1 offset range for SVE data modes, so
6749 that pre-RTL optimizers like ivopts will work with that range
6750 instead of the wider LDR/STR range. */
6751 if (vec_flags == VEC_SVE_DATA)
6752 return (type == ADDR_QUERY_M
6753 ? offset_4bit_signed_scaled_p (mode, offset)
6754 : offset_9bit_signed_scaled_p (mode, offset));
6755
6756 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6757 {
6758 poly_int64 end_offset = (offset
6759 + GET_MODE_SIZE (mode)
6760 - BYTES_PER_SVE_VECTOR);
6761 return (type == ADDR_QUERY_M
6762 ? offset_4bit_signed_scaled_p (mode, offset)
6763 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6764 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6765 end_offset)));
6766 }
6767
6768 if (vec_flags == VEC_SVE_PRED)
6769 return offset_9bit_signed_scaled_p (mode, offset);
6770
6771 if (load_store_pair_p)
6772 return ((known_eq (GET_MODE_SIZE (mode), 4)
6773 || known_eq (GET_MODE_SIZE (mode), 8)
6774 || known_eq (GET_MODE_SIZE (mode), 16))
6775 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6776 else
6777 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6778 || offset_12bit_unsigned_scaled_p (mode, offset));
6779 }
6780
6781 if (allow_reg_index_p)
6782 {
6783 /* Look for base + (scaled/extended) index register. */
6784 if (aarch64_base_register_rtx_p (op0, strict_p)
6785 && aarch64_classify_index (info, op1, mode, strict_p))
6786 {
6787 info->base = op0;
6788 return true;
6789 }
6790 if (aarch64_base_register_rtx_p (op1, strict_p)
6791 && aarch64_classify_index (info, op0, mode, strict_p))
6792 {
6793 info->base = op1;
6794 return true;
6795 }
6796 }
6797
6798 return false;
6799
6800 case POST_INC:
6801 case POST_DEC:
6802 case PRE_INC:
6803 case PRE_DEC:
6804 info->type = ADDRESS_REG_WB;
6805 info->base = XEXP (x, 0);
6806 info->offset = NULL_RTX;
6807 return aarch64_base_register_rtx_p (info->base, strict_p);
6808
6809 case POST_MODIFY:
6810 case PRE_MODIFY:
6811 info->type = ADDRESS_REG_WB;
6812 info->base = XEXP (x, 0);
6813 if (GET_CODE (XEXP (x, 1)) == PLUS
6814 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6815 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6816 && aarch64_base_register_rtx_p (info->base, strict_p))
6817 {
6818 info->offset = XEXP (XEXP (x, 1), 1);
6819 info->const_offset = offset;
6820
6821 /* TImode and TFmode values are allowed in both pairs of X
6822 registers and individual Q registers. The available
6823 address modes are:
6824 X,X: 7-bit signed scaled offset
6825 Q: 9-bit signed offset
6826 We conservatively require an offset representable in either mode.
6827 */
6828 if (mode == TImode || mode == TFmode)
6829 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6830 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6831
6832 if (load_store_pair_p)
6833 return ((known_eq (GET_MODE_SIZE (mode), 4)
6834 || known_eq (GET_MODE_SIZE (mode), 8)
6835 || known_eq (GET_MODE_SIZE (mode), 16))
6836 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6837 else
6838 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6839 }
6840 return false;
6841
6842 case CONST:
6843 case SYMBOL_REF:
6844 case LABEL_REF:
6845 /* load literal: pc-relative constant pool entry. Only supported
6846 for SI mode or larger. */
6847 info->type = ADDRESS_SYMBOLIC;
6848
6849 if (!load_store_pair_p
6850 && GET_MODE_SIZE (mode).is_constant (&const_size)
6851 && const_size >= 4)
6852 {
6853 rtx sym, addend;
6854
6855 split_const (x, &sym, &addend);
6856 return ((GET_CODE (sym) == LABEL_REF
6857 || (GET_CODE (sym) == SYMBOL_REF
6858 && CONSTANT_POOL_ADDRESS_P (sym)
6859 && aarch64_pcrelative_literal_loads)));
6860 }
6861 return false;
6862
6863 case LO_SUM:
6864 info->type = ADDRESS_LO_SUM;
6865 info->base = XEXP (x, 0);
6866 info->offset = XEXP (x, 1);
6867 if (allow_reg_index_p
6868 && aarch64_base_register_rtx_p (info->base, strict_p))
6869 {
6870 rtx sym, offs;
6871 split_const (info->offset, &sym, &offs);
6872 if (GET_CODE (sym) == SYMBOL_REF
6873 && (aarch64_classify_symbol (sym, INTVAL (offs))
6874 == SYMBOL_SMALL_ABSOLUTE))
6875 {
6876 /* The symbol and offset must be aligned to the access size. */
6877 unsigned int align;
6878
6879 if (CONSTANT_POOL_ADDRESS_P (sym))
6880 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6881 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6882 {
6883 tree exp = SYMBOL_REF_DECL (sym);
6884 align = TYPE_ALIGN (TREE_TYPE (exp));
6885 align = aarch64_constant_alignment (exp, align);
6886 }
6887 else if (SYMBOL_REF_DECL (sym))
6888 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6889 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6890 && SYMBOL_REF_BLOCK (sym) != NULL)
6891 align = SYMBOL_REF_BLOCK (sym)->alignment;
6892 else
6893 align = BITS_PER_UNIT;
6894
6895 poly_int64 ref_size = GET_MODE_SIZE (mode);
6896 if (known_eq (ref_size, 0))
6897 ref_size = GET_MODE_SIZE (DImode);
6898
6899 return (multiple_p (INTVAL (offs), ref_size)
6900 && multiple_p (align / BITS_PER_UNIT, ref_size));
6901 }
6902 }
6903 return false;
6904
6905 default:
6906 return false;
6907 }
6908 }
6909
6910 /* Return true if the address X is valid for a PRFM instruction.
6911 STRICT_P is true if we should do strict checking with
6912 aarch64_classify_address. */
6913
6914 bool
6915 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6916 {
6917 struct aarch64_address_info addr;
6918
6919 /* PRFM accepts the same addresses as DImode... */
6920 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6921 if (!res)
6922 return false;
6923
6924 /* ... except writeback forms. */
6925 return addr.type != ADDRESS_REG_WB;
6926 }
6927
6928 bool
6929 aarch64_symbolic_address_p (rtx x)
6930 {
6931 rtx offset;
6932
6933 split_const (x, &x, &offset);
6934 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6935 }
6936
6937 /* Classify the base of symbolic expression X. */
6938
6939 enum aarch64_symbol_type
6940 aarch64_classify_symbolic_expression (rtx x)
6941 {
6942 rtx offset;
6943
6944 split_const (x, &x, &offset);
6945 return aarch64_classify_symbol (x, INTVAL (offset));
6946 }
6947
6948
6949 /* Return TRUE if X is a legitimate address for accessing memory in
6950 mode MODE. */
6951 static bool
6952 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6953 {
6954 struct aarch64_address_info addr;
6955
6956 return aarch64_classify_address (&addr, x, mode, strict_p);
6957 }
6958
6959 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6960 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6961 bool
6962 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6963 aarch64_addr_query_type type)
6964 {
6965 struct aarch64_address_info addr;
6966
6967 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6968 }
6969
6970 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6971
6972 static bool
6973 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6974 poly_int64 orig_offset,
6975 machine_mode mode)
6976 {
6977 HOST_WIDE_INT size;
6978 if (GET_MODE_SIZE (mode).is_constant (&size))
6979 {
6980 HOST_WIDE_INT const_offset, second_offset;
6981
6982 /* A general SVE offset is A * VQ + B. Remove the A component from
6983 coefficient 0 in order to get the constant B. */
6984 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6985
6986 /* Split an out-of-range address displacement into a base and
6987 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6988 range otherwise to increase opportunities for sharing the base
6989 address of different sizes. Unaligned accesses use the signed
6990 9-bit range, TImode/TFmode use the intersection of signed
6991 scaled 7-bit and signed 9-bit offset. */
6992 if (mode == TImode || mode == TFmode)
6993 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6994 else if ((const_offset & (size - 1)) != 0)
6995 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6996 else
6997 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6998
6999 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7000 return false;
7001
7002 /* Split the offset into second_offset and the rest. */
7003 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7004 *offset2 = gen_int_mode (second_offset, Pmode);
7005 return true;
7006 }
7007 else
7008 {
7009 /* Get the mode we should use as the basis of the range. For structure
7010 modes this is the mode of one vector. */
7011 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7012 machine_mode step_mode
7013 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7014
7015 /* Get the "mul vl" multiplier we'd like to use. */
7016 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7017 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7018 if (vec_flags & VEC_SVE_DATA)
7019 /* LDR supports a 9-bit range, but the move patterns for
7020 structure modes require all vectors to be in range of the
7021 same base. The simplest way of accommodating that while still
7022 promoting reuse of anchor points between different modes is
7023 to use an 8-bit range unconditionally. */
7024 vnum = ((vnum + 128) & 255) - 128;
7025 else
7026 /* Predicates are only handled singly, so we might as well use
7027 the full range. */
7028 vnum = ((vnum + 256) & 511) - 256;
7029 if (vnum == 0)
7030 return false;
7031
7032 /* Convert the "mul vl" multiplier into a byte offset. */
7033 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7034 if (known_eq (second_offset, orig_offset))
7035 return false;
7036
7037 /* Split the offset into second_offset and the rest. */
7038 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7039 *offset2 = gen_int_mode (second_offset, Pmode);
7040 return true;
7041 }
7042 }
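/* A minimal standalone sketch of the fixed-size splitting rules above.
   The helper name is hypothetical and nothing else refers to it; it only
   restates the three masks used above: TImode/TFmode take the intersection
   of the scaled 7-bit and signed 9-bit ranges, unaligned offsets take the
   signed 9-bit range, and aligned offsets take a 4KB or 16KB range.  */
static HOST_WIDE_INT ATTRIBUTE_UNUSED
example_fixed_size_split (HOST_WIDE_INT const_offset, int size,
                          bool ti_or_tf_p)
{
  if (ti_or_tf_p)
    return ((const_offset + 0x100) & 0x1f8) - 0x100;
  if ((const_offset & (size - 1)) != 0)
    return ((const_offset + 0x100) & 0x1ff) - 0x100;
  return const_offset & (size < 4 ? 0xfff : 0x3ffc);
}

/* For an aligned 8-byte access at offset 0x12340 this keeps 0x2340 with
   the access (offset2 above) and folds the remaining 0x10000 into the
   shared base address (offset1 above).  */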
7043
7044 /* Return the binary representation of floating point constant VALUE in INTVAL.
7045 If the value cannot be converted, return false without setting INTVAL.
7046 The conversion is done in the mode of VALUE. */
7047 bool
7048 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7049 {
7050
7051 /* We make a general exception for 0. */
7052 if (aarch64_float_const_zero_rtx_p (value))
7053 {
7054 *intval = 0;
7055 return true;
7056 }
7057
7058 scalar_float_mode mode;
7059 if (GET_CODE (value) != CONST_DOUBLE
7060 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7061 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7062 /* Only support up to DF mode. */
7063 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7064 return false;
7065
7066 unsigned HOST_WIDE_INT ival = 0;
7067
7068 long res[2];
7069 real_to_target (res,
7070 CONST_DOUBLE_REAL_VALUE (value),
7071 REAL_MODE_FORMAT (mode));
7072
7073 if (mode == DFmode)
7074 {
7075 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7076 ival = zext_hwi (res[order], 32);
7077 ival |= (zext_hwi (res[1 - order], 32) << 32);
7078 }
7079 else
7080 ival = zext_hwi (res[0], 32);
7081
7082 *intval = ival;
7083 return true;
7084 }
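/* For intuition only: the function above does for GCC's internal
   REAL_VALUE_TYPE constants what this host-side sketch does for a plain
   'double' (hypothetical helper, assumes the host double is IEEE
   binary64 and that HOST_WIDE_INT is 64 bits wide).  */
static unsigned HOST_WIDE_INT ATTRIBUTE_UNUSED
example_host_double_bits (double d)
{
  unsigned HOST_WIDE_INT bits = 0;
  /* memcpy is the well-defined way to reinterpret the representation.  */
  memcpy (&bits, &d, sizeof (d));
  return bits;
}

/* example_host_double_bits (1.0) is 0x3ff0000000000000; constants with
   such simple bit images are exactly the ones that are cheaper to build
   with MOV/MOVK and FMOV than to load from the constant pool.  */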
7085
7086 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7087 single MOV(+MOVK) followed by an FMOV. */
7088 bool
7089 aarch64_float_const_rtx_p (rtx x)
7090 {
7091 machine_mode mode = GET_MODE (x);
7092 if (mode == VOIDmode)
7093 return false;
7094
7095 /* Determine whether it's cheaper to write float constants as
7096 mov/movk pairs rather than loading them with ldr/adrp pairs. */
7097 unsigned HOST_WIDE_INT ival;
7098
7099 if (GET_CODE (x) == CONST_DOUBLE
7100 && SCALAR_FLOAT_MODE_P (mode)
7101 && aarch64_reinterpret_float_as_int (x, &ival))
7102 {
7103 scalar_int_mode imode = (mode == HFmode
7104 ? SImode
7105 : int_mode_for_mode (mode).require ());
7106 int num_instr = aarch64_internal_mov_immediate
7107 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7108 return num_instr < 3;
7109 }
7110
7111 return false;
7112 }
7113
7114 /* Return TRUE if rtx X is the immediate constant 0.0. */
7115 bool
7116 aarch64_float_const_zero_rtx_p (rtx x)
7117 {
7118 if (GET_MODE (x) == VOIDmode)
7119 return false;
7120
7121 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7122 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7123 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7124 }
7125
7126 /* Return TRUE if rtx X is an immediate constant that fits in a single
7127 MOVI immediate operation. */
7128 bool
7129 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7130 {
7131 if (!TARGET_SIMD)
7132 return false;
7133
7134 machine_mode vmode;
7135 scalar_int_mode imode;
7136 unsigned HOST_WIDE_INT ival;
7137
7138 if (GET_CODE (x) == CONST_DOUBLE
7139 && SCALAR_FLOAT_MODE_P (mode))
7140 {
7141 if (!aarch64_reinterpret_float_as_int (x, &ival))
7142 return false;
7143
7144 /* We make a general exception for 0. */
7145 if (aarch64_float_const_zero_rtx_p (x))
7146 return true;
7147
7148 imode = int_mode_for_mode (mode).require ();
7149 }
7150 else if (GET_CODE (x) == CONST_INT
7151 && is_a <scalar_int_mode> (mode, &imode))
7152 ival = INTVAL (x);
7153 else
7154 return false;
7155
7156 /* Use a 64-bit container mode for everything except DI/DF mode, where we
7157 use a 128-bit vector mode. */
7158 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7159
7160 vmode = aarch64_simd_container_mode (imode, width);
7161 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7162
7163 return aarch64_simd_valid_immediate (v_op, NULL);
7164 }
7165
7166
7167 /* Return the fixed registers used for condition codes. */
7168
7169 static bool
7170 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7171 {
7172 *p1 = CC_REGNUM;
7173 *p2 = INVALID_REGNUM;
7174 return true;
7175 }
7176
7177 /* This function is used by the call expanders of the machine description.
7178 RESULT is the register in which the result is returned. It's NULL for
7179 "call" and "sibcall".
7180 MEM is the location of the function call.
7181 SIBCALL indicates whether this function call is a normal call or a sibling call.
7182 A different pattern is generated accordingly. */
7183
7184 void
7185 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7186 {
7187 rtx call, callee, tmp;
7188 rtvec vec;
7189 machine_mode mode;
7190
7191 gcc_assert (MEM_P (mem));
7192 callee = XEXP (mem, 0);
7193 mode = GET_MODE (callee);
7194 gcc_assert (mode == Pmode);
7195
7196 /* Decide if we should generate indirect calls by loading the
7197 address of the callee into a register before performing
7198 the branch-and-link. */
7199 if (SYMBOL_REF_P (callee)
7200 ? (aarch64_is_long_call_p (callee)
7201 || aarch64_is_noplt_call_p (callee))
7202 : !REG_P (callee))
7203 XEXP (mem, 0) = force_reg (mode, callee);
7204
7205 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7206
7207 if (result != NULL_RTX)
7208 call = gen_rtx_SET (result, call);
7209
7210 if (sibcall)
7211 tmp = ret_rtx;
7212 else
7213 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7214
7215 vec = gen_rtvec (2, call, tmp);
7216 call = gen_rtx_PARALLEL (VOIDmode, vec);
7217
7218 aarch64_emit_call_insn (call);
7219 }
7220
7221 /* Emit call insn with PAT and do aarch64-specific handling. */
7222
7223 void
7224 aarch64_emit_call_insn (rtx pat)
7225 {
7226 rtx insn = emit_call_insn (pat);
7227
7228 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7229 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7230 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7231 }
7232
7233 machine_mode
7234 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7235 {
7236 machine_mode mode_x = GET_MODE (x);
7237 rtx_code code_x = GET_CODE (x);
7238
7239 /* All floating point compares return CCFP if it is an equality or
7240 unordered comparison, and CCFPE otherwise. */
7241 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7242 {
7243 switch (code)
7244 {
7245 case EQ:
7246 case NE:
7247 case UNORDERED:
7248 case ORDERED:
7249 case UNLT:
7250 case UNLE:
7251 case UNGT:
7252 case UNGE:
7253 case UNEQ:
7254 return CCFPmode;
7255
7256 case LT:
7257 case LE:
7258 case GT:
7259 case GE:
7260 case LTGT:
7261 return CCFPEmode;
7262
7263 default:
7264 gcc_unreachable ();
7265 }
7266 }
7267
7268 /* Equality comparisons of short modes against zero can be performed
7269 using the TST instruction with the appropriate bitmask. */
7270 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7271 && (code == EQ || code == NE)
7272 && (mode_x == HImode || mode_x == QImode))
7273 return CC_NZmode;
7274
7275 /* Similarly, comparisons of zero_extends from shorter modes can
7276 be performed using an ANDS with an immediate mask. */
7277 if (y == const0_rtx && code_x == ZERO_EXTEND
7278 && (mode_x == SImode || mode_x == DImode)
7279 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7280 && (code == EQ || code == NE))
7281 return CC_NZmode;
7282
7283 if ((mode_x == SImode || mode_x == DImode)
7284 && y == const0_rtx
7285 && (code == EQ || code == NE || code == LT || code == GE)
7286 && (code_x == PLUS || code_x == MINUS || code_x == AND
7287 || code_x == NEG
7288 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7289 && CONST_INT_P (XEXP (x, 2)))))
7290 return CC_NZmode;
7291
7292 /* A compare with a shifted operand. Because of canonicalization,
7293 the comparison will have to be swapped when we emit the assembly
7294 code. */
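/* For instance, a compare of (ashift x 2) with y can only use the
   shifted-register form as "cmp y, x, lsl 2", which performs the
   subtraction the other way round, so a GE test must be emitted as LE
   (see the E_CC_SWPmode entries in aarch64_get_condition_code_1).  */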
7295 if ((mode_x == SImode || mode_x == DImode)
7296 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7297 && (code_x == ASHIFT || code_x == ASHIFTRT
7298 || code_x == LSHIFTRT
7299 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7300 return CC_SWPmode;
7301
7302 /* Similarly for a negated operand, but we can only do this for
7303 equalities. */
7304 if ((mode_x == SImode || mode_x == DImode)
7305 && (REG_P (y) || GET_CODE (y) == SUBREG)
7306 && (code == EQ || code == NE)
7307 && code_x == NEG)
7308 return CC_Zmode;
7309
7310 /* A test for unsigned overflow from an addition. */
7311 if ((mode_x == DImode || mode_x == TImode)
7312 && (code == LTU || code == GEU)
7313 && code_x == PLUS
7314 && rtx_equal_p (XEXP (x, 0), y))
7315 return CC_Cmode;
7316
7317 /* A test for unsigned overflow from an add with carry. */
7318 if ((mode_x == DImode || mode_x == TImode)
7319 && (code == LTU || code == GEU)
7320 && code_x == PLUS
7321 && CONST_SCALAR_INT_P (y)
7322 && (rtx_mode_t (y, mode_x)
7323 == (wi::shwi (1, mode_x)
7324 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7325 return CC_ADCmode;
7326
7327 /* A test for signed overflow. */
7328 if ((mode_x == DImode || mode_x == TImode)
7329 && code == NE
7330 && code_x == PLUS
7331 && GET_CODE (y) == SIGN_EXTEND)
7332 return CC_Vmode;
7333
7334 /* For everything else, return CCmode. */
7335 return CCmode;
7336 }
7337
7338 static int
7339 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7340
7341 int
7342 aarch64_get_condition_code (rtx x)
7343 {
7344 machine_mode mode = GET_MODE (XEXP (x, 0));
7345 enum rtx_code comp_code = GET_CODE (x);
7346
7347 if (GET_MODE_CLASS (mode) != MODE_CC)
7348 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7349 return aarch64_get_condition_code_1 (mode, comp_code);
7350 }
7351
7352 static int
7353 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7354 {
7355 switch (mode)
7356 {
7357 case E_CCFPmode:
7358 case E_CCFPEmode:
7359 switch (comp_code)
7360 {
7361 case GE: return AARCH64_GE;
7362 case GT: return AARCH64_GT;
7363 case LE: return AARCH64_LS;
7364 case LT: return AARCH64_MI;
7365 case NE: return AARCH64_NE;
7366 case EQ: return AARCH64_EQ;
7367 case ORDERED: return AARCH64_VC;
7368 case UNORDERED: return AARCH64_VS;
7369 case UNLT: return AARCH64_LT;
7370 case UNLE: return AARCH64_LE;
7371 case UNGT: return AARCH64_HI;
7372 case UNGE: return AARCH64_PL;
7373 default: return -1;
7374 }
7375 break;
7376
7377 case E_CCmode:
7378 switch (comp_code)
7379 {
7380 case NE: return AARCH64_NE;
7381 case EQ: return AARCH64_EQ;
7382 case GE: return AARCH64_GE;
7383 case GT: return AARCH64_GT;
7384 case LE: return AARCH64_LE;
7385 case LT: return AARCH64_LT;
7386 case GEU: return AARCH64_CS;
7387 case GTU: return AARCH64_HI;
7388 case LEU: return AARCH64_LS;
7389 case LTU: return AARCH64_CC;
7390 default: return -1;
7391 }
7392 break;
7393
7394 case E_CC_SWPmode:
7395 switch (comp_code)
7396 {
7397 case NE: return AARCH64_NE;
7398 case EQ: return AARCH64_EQ;
7399 case GE: return AARCH64_LE;
7400 case GT: return AARCH64_LT;
7401 case LE: return AARCH64_GE;
7402 case LT: return AARCH64_GT;
7403 case GEU: return AARCH64_LS;
7404 case GTU: return AARCH64_CC;
7405 case LEU: return AARCH64_CS;
7406 case LTU: return AARCH64_HI;
7407 default: return -1;
7408 }
7409 break;
7410
7411 case E_CC_NZCmode:
7412 switch (comp_code)
7413 {
7414 case NE: return AARCH64_NE; /* = any */
7415 case EQ: return AARCH64_EQ; /* = none */
7416 case GE: return AARCH64_PL; /* = nfrst */
7417 case LT: return AARCH64_MI; /* = first */
7418 case GEU: return AARCH64_CS; /* = nlast */
7419 case GTU: return AARCH64_HI; /* = pmore */
7420 case LEU: return AARCH64_LS; /* = plast */
7421 case LTU: return AARCH64_CC; /* = last */
7422 default: return -1;
7423 }
7424 break;
7425
7426 case E_CC_NZmode:
7427 switch (comp_code)
7428 {
7429 case NE: return AARCH64_NE;
7430 case EQ: return AARCH64_EQ;
7431 case GE: return AARCH64_PL;
7432 case LT: return AARCH64_MI;
7433 default: return -1;
7434 }
7435 break;
7436
7437 case E_CC_Zmode:
7438 switch (comp_code)
7439 {
7440 case NE: return AARCH64_NE;
7441 case EQ: return AARCH64_EQ;
7442 default: return -1;
7443 }
7444 break;
7445
7446 case E_CC_Cmode:
7447 switch (comp_code)
7448 {
7449 case LTU: return AARCH64_CS;
7450 case GEU: return AARCH64_CC;
7451 default: return -1;
7452 }
7453 break;
7454
7455 case E_CC_ADCmode:
7456 switch (comp_code)
7457 {
7458 case GEU: return AARCH64_CS;
7459 case LTU: return AARCH64_CC;
7460 default: return -1;
7461 }
7462 break;
7463
7464 case E_CC_Vmode:
7465 switch (comp_code)
7466 {
7467 case NE: return AARCH64_VS;
7468 case EQ: return AARCH64_VC;
7469 default: return -1;
7470 }
7471 break;
7472
7473 default:
7474 return -1;
7475 }
7476
7477 return -1;
7478 }
7479
7480 bool
7481 aarch64_const_vec_all_same_in_range_p (rtx x,
7482 HOST_WIDE_INT minval,
7483 HOST_WIDE_INT maxval)
7484 {
7485 rtx elt;
7486 return (const_vec_duplicate_p (x, &elt)
7487 && CONST_INT_P (elt)
7488 && IN_RANGE (INTVAL (elt), minval, maxval));
7489 }
7490
7491 bool
7492 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7493 {
7494 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7495 }
7496
7497 /* Return true if VEC is a constant in which every element is in the range
7498 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7499
7500 static bool
7501 aarch64_const_vec_all_in_range_p (rtx vec,
7502 HOST_WIDE_INT minval,
7503 HOST_WIDE_INT maxval)
7504 {
7505 if (GET_CODE (vec) != CONST_VECTOR
7506 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7507 return false;
7508
7509 int nunits;
7510 if (!CONST_VECTOR_STEPPED_P (vec))
7511 nunits = const_vector_encoded_nelts (vec);
7512 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7513 return false;
7514
7515 for (int i = 0; i < nunits; i++)
7516 {
7517 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7518 if (!CONST_INT_P (vec_elem)
7519 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7520 return false;
7521 }
7522 return true;
7523 }
7524
7525 /* N Z C V. */
7526 #define AARCH64_CC_V 1
7527 #define AARCH64_CC_C (1 << 1)
7528 #define AARCH64_CC_Z (1 << 2)
7529 #define AARCH64_CC_N (1 << 3)
7530
7531 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7532 static const int aarch64_nzcv_codes[] =
7533 {
7534 0, /* EQ, Z == 1. */
7535 AARCH64_CC_Z, /* NE, Z == 0. */
7536 0, /* CS, C == 1. */
7537 AARCH64_CC_C, /* CC, C == 0. */
7538 0, /* MI, N == 1. */
7539 AARCH64_CC_N, /* PL, N == 0. */
7540 0, /* VS, V == 1. */
7541 AARCH64_CC_V, /* VC, V == 0. */
7542 0, /* HI, C == 1 && Z == 0. */
7543 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7544 AARCH64_CC_V, /* GE, N == V. */
7545 0, /* LT, N != V. */
7546 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7547 0, /* LE, !(Z == 0 && N == V). */
7548 0, /* AL, Any. */
7549 0 /* NV, Any. */
7550 };
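/* A small sketch of how an NZCV immediate is composed from the bits
   defined above (the helper name is hypothetical; only the AARCH64_CC_*
   values come from the macros).  */
static unsigned int ATTRIBUTE_UNUSED
example_compose_nzcv (bool n, bool z, bool c, bool v)
{
  return ((n ? AARCH64_CC_N : 0)
          | (z ? AARCH64_CC_Z : 0)
          | (c ? AARCH64_CC_C : 0)
          | (v ? AARCH64_CC_V : 0));
}

/* example_compose_nzcv (false, true, false, false) is 4, matching the GT
   entry in the table above: with Z set, "Z == 0 && N == V" cannot hold.  */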
7551
7552 /* Print floating-point vector immediate operand X to F, negating it
7553 first if NEGATE is true. Return true on success, false if it isn't
7554 a constant we can handle. */
7555
7556 static bool
7557 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7558 {
7559 rtx elt;
7560
7561 if (!const_vec_duplicate_p (x, &elt))
7562 return false;
7563
7564 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7565 if (negate)
7566 r = real_value_negate (&r);
7567
7568 /* We only handle the SVE single-bit immediates here. */
7569 if (real_equal (&r, &dconst0))
7570 asm_fprintf (f, "0.0");
7571 else if (real_equal (&r, &dconst1))
7572 asm_fprintf (f, "1.0");
7573 else if (real_equal (&r, &dconsthalf))
7574 asm_fprintf (f, "0.5");
7575 else
7576 return false;
7577
7578 return true;
7579 }
7580
7581 /* Return the equivalent letter for size. */
7582 static char
7583 sizetochar (int size)
7584 {
7585 switch (size)
7586 {
7587 case 64: return 'd';
7588 case 32: return 's';
7589 case 16: return 'h';
7590 case 8 : return 'b';
7591 default: gcc_unreachable ();
7592 }
7593 }
7594
7595 /* Print operand X to file F in a target specific manner according to CODE.
7596 The acceptable formatting commands given by CODE are:
7597 'c': An integer or symbol address without a preceding #
7598 sign.
7599 'C': Take the duplicated element in a vector constant
7600 and print it in hex.
7601 'D': Take the duplicated element in a vector constant
7602 and print it as an unsigned integer, in decimal.
7603 'e': Print the sign/zero-extend size as a character 8->b,
7604 16->h, 32->w.
7605 'p': Prints N such that 2^N == X (X must be power of 2 and
7606 const int).
7607 'P': Print the number of non-zero bits in X (a const_int).
7608 'H': Print the higher numbered register of a pair (TImode)
7609 of regs.
7610 'm': Print a condition (eq, ne, etc).
7611 'M': Same as 'm', but invert condition.
7612 'N': Take the duplicated element in a vector constant
7613 and print the negative of it in decimal.
7614 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7615 'S/T/U/V': Print a FP/SIMD register name for a register list.
7616 The register printed is the FP/SIMD register name
7617 of X + 0/1/2/3 for S/T/U/V.
7618 'R': Print a scalar FP/SIMD register name + 1.
7619 'X': Print bottom 16 bits of integer constant in hex.
7620 'w/x': Print a general register name or the zero register
7621 (32-bit or 64-bit).
7622 '0': Print a normal operand; if it's a general register,
7623 then we assume DImode.
7624 'k': Print NZCV for conditional compare instructions.
7625 'A': Output address constant representing the first
7626 argument of X, specifying a relocation offset
7627 if appropriate.
7628 'L': Output constant address specified by X
7629 with a relocation offset if appropriate.
7630 'G': Prints address of X, specifying a PC relative
7631 relocation mode if appropriate.
7632 'y': Output address of LDP or STP - this is used for
7633 some LDP/STPs which don't use a PARALLEL in their
7634 pattern (so the mode needs to be adjusted).
7635 'z': Output address of a typical LDP or STP. */
7636
7637 static void
7638 aarch64_print_operand (FILE *f, rtx x, int code)
7639 {
7640 rtx elt;
7641 switch (code)
7642 {
7643 case 'c':
7644 switch (GET_CODE (x))
7645 {
7646 case CONST_INT:
7647 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7648 break;
7649
7650 case SYMBOL_REF:
7651 output_addr_const (f, x);
7652 break;
7653
7654 case CONST:
7655 if (GET_CODE (XEXP (x, 0)) == PLUS
7656 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7657 {
7658 output_addr_const (f, x);
7659 break;
7660 }
7661 /* Fall through. */
7662
7663 default:
7664 output_operand_lossage ("unsupported operand for code '%c'", code);
7665 }
7666 break;
7667
7668 case 'e':
7669 {
7670 int n;
7671
7672 if (!CONST_INT_P (x)
7673 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7674 {
7675 output_operand_lossage ("invalid operand for '%%%c'", code);
7676 return;
7677 }
7678
7679 switch (n)
7680 {
7681 case 3:
7682 fputc ('b', f);
7683 break;
7684 case 4:
7685 fputc ('h', f);
7686 break;
7687 case 5:
7688 fputc ('w', f);
7689 break;
7690 default:
7691 output_operand_lossage ("invalid operand for '%%%c'", code);
7692 return;
7693 }
7694 }
7695 break;
7696
7697 case 'p':
7698 {
7699 int n;
7700
7701 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7702 {
7703 output_operand_lossage ("invalid operand for '%%%c'", code);
7704 return;
7705 }
7706
7707 asm_fprintf (f, "%d", n);
7708 }
7709 break;
7710
7711 case 'P':
7712 if (!CONST_INT_P (x))
7713 {
7714 output_operand_lossage ("invalid operand for '%%%c'", code);
7715 return;
7716 }
7717
7718 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7719 break;
7720
7721 case 'H':
7722 if (x == const0_rtx)
7723 {
7724 asm_fprintf (f, "xzr");
7725 break;
7726 }
7727
7728 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7729 {
7730 output_operand_lossage ("invalid operand for '%%%c'", code);
7731 return;
7732 }
7733
7734 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7735 break;
7736
7737 case 'M':
7738 case 'm':
7739 {
7740 int cond_code;
7741 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
7742 if (x == const_true_rtx)
7743 {
7744 if (code == 'M')
7745 fputs ("nv", f);
7746 return;
7747 }
7748
7749 if (!COMPARISON_P (x))
7750 {
7751 output_operand_lossage ("invalid operand for '%%%c'", code);
7752 return;
7753 }
7754
7755 cond_code = aarch64_get_condition_code (x);
7756 gcc_assert (cond_code >= 0);
7757 if (code == 'M')
7758 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7759 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
7760 fputs (aarch64_sve_condition_codes[cond_code], f);
7761 else
7762 fputs (aarch64_condition_codes[cond_code], f);
7763 }
7764 break;
7765
7766 case 'N':
7767 if (!const_vec_duplicate_p (x, &elt))
7768 {
7769 output_operand_lossage ("invalid vector constant");
7770 return;
7771 }
7772
7773 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7774 asm_fprintf (f, "%wd", -INTVAL (elt));
7775 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7776 && aarch64_print_vector_float_operand (f, x, true))
7777 ;
7778 else
7779 {
7780 output_operand_lossage ("invalid vector constant");
7781 return;
7782 }
7783 break;
7784
7785 case 'b':
7786 case 'h':
7787 case 's':
7788 case 'd':
7789 case 'q':
7790 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7791 {
7792 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7793 return;
7794 }
7795 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7796 break;
7797
7798 case 'S':
7799 case 'T':
7800 case 'U':
7801 case 'V':
7802 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7803 {
7804 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7805 return;
7806 }
7807 asm_fprintf (f, "%c%d",
7808 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7809 REGNO (x) - V0_REGNUM + (code - 'S'));
7810 break;
7811
7812 case 'R':
7813 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7814 {
7815 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7816 return;
7817 }
7818 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7819 break;
7820
7821 case 'X':
7822 if (!CONST_INT_P (x))
7823 {
7824 output_operand_lossage ("invalid operand for '%%%c'", code);
7825 return;
7826 }
7827 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7828 break;
7829
7830 case 'C':
7831 {
7832 /* Print a replicated constant in hex. */
7833 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7834 {
7835 output_operand_lossage ("invalid operand for '%%%c'", code);
7836 return;
7837 }
7838 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7839 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7840 }
7841 break;
7842
7843 case 'D':
7844 {
7845 /* Print a replicated constant in decimal, treating it as
7846 unsigned. */
7847 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7848 {
7849 output_operand_lossage ("invalid operand for '%%%c'", code);
7850 return;
7851 }
7852 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7853 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7854 }
7855 break;
7856
7857 case 'w':
7858 case 'x':
7859 if (x == const0_rtx
7860 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7861 {
7862 asm_fprintf (f, "%czr", code);
7863 break;
7864 }
7865
7866 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7867 {
7868 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7869 break;
7870 }
7871
7872 if (REG_P (x) && REGNO (x) == SP_REGNUM)
7873 {
7874 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7875 break;
7876 }
7877
7878 /* Fall through */
7879
7880 case 0:
7881 if (x == NULL)
7882 {
7883 output_operand_lossage ("missing operand");
7884 return;
7885 }
7886
7887 switch (GET_CODE (x))
7888 {
7889 case REG:
7890 if (aarch64_sve_data_mode_p (GET_MODE (x)))
7891 {
7892 if (REG_NREGS (x) == 1)
7893 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7894 else
7895 {
7896 char suffix
7897 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7898 asm_fprintf (f, "{z%d.%c - z%d.%c}",
7899 REGNO (x) - V0_REGNUM, suffix,
7900 END_REGNO (x) - V0_REGNUM - 1, suffix);
7901 }
7902 }
7903 else
7904 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7905 break;
7906
7907 case MEM:
7908 output_address (GET_MODE (x), XEXP (x, 0));
7909 break;
7910
7911 case LABEL_REF:
7912 case SYMBOL_REF:
7913 output_addr_const (asm_out_file, x);
7914 break;
7915
7916 case CONST_INT:
7917 asm_fprintf (f, "%wd", INTVAL (x));
7918 break;
7919
7920 case CONST:
7921 if (!VECTOR_MODE_P (GET_MODE (x)))
7922 {
7923 output_addr_const (asm_out_file, x);
7924 break;
7925 }
7926 /* fall through */
7927
7928 case CONST_VECTOR:
7929 if (!const_vec_duplicate_p (x, &elt))
7930 {
7931 output_operand_lossage ("invalid vector constant");
7932 return;
7933 }
7934
7935 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7936 asm_fprintf (f, "%wd", INTVAL (elt));
7937 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7938 && aarch64_print_vector_float_operand (f, x, false))
7939 ;
7940 else
7941 {
7942 output_operand_lossage ("invalid vector constant");
7943 return;
7944 }
7945 break;
7946
7947 case CONST_DOUBLE:
7948 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7949 be getting CONST_DOUBLEs holding integers. */
7950 gcc_assert (GET_MODE (x) != VOIDmode);
7951 if (aarch64_float_const_zero_rtx_p (x))
7952 {
7953 fputc ('0', f);
7954 break;
7955 }
7956 else if (aarch64_float_const_representable_p (x))
7957 {
7958 #define buf_size 20
7959 char float_buf[buf_size] = {'\0'};
7960 real_to_decimal_for_mode (float_buf,
7961 CONST_DOUBLE_REAL_VALUE (x),
7962 buf_size, buf_size,
7963 1, GET_MODE (x));
7964 asm_fprintf (asm_out_file, "%s", float_buf);
7965 break;
7966 #undef buf_size
7967 }
7968 output_operand_lossage ("invalid constant");
7969 return;
7970 default:
7971 output_operand_lossage ("invalid operand");
7972 return;
7973 }
7974 break;
7975
7976 case 'A':
7977 if (GET_CODE (x) == HIGH)
7978 x = XEXP (x, 0);
7979
7980 switch (aarch64_classify_symbolic_expression (x))
7981 {
7982 case SYMBOL_SMALL_GOT_4G:
7983 asm_fprintf (asm_out_file, ":got:");
7984 break;
7985
7986 case SYMBOL_SMALL_TLSGD:
7987 asm_fprintf (asm_out_file, ":tlsgd:");
7988 break;
7989
7990 case SYMBOL_SMALL_TLSDESC:
7991 asm_fprintf (asm_out_file, ":tlsdesc:");
7992 break;
7993
7994 case SYMBOL_SMALL_TLSIE:
7995 asm_fprintf (asm_out_file, ":gottprel:");
7996 break;
7997
7998 case SYMBOL_TLSLE24:
7999 asm_fprintf (asm_out_file, ":tprel:");
8000 break;
8001
8002 case SYMBOL_TINY_GOT:
8003 gcc_unreachable ();
8004 break;
8005
8006 default:
8007 break;
8008 }
8009 output_addr_const (asm_out_file, x);
8010 break;
8011
8012 case 'L':
8013 switch (aarch64_classify_symbolic_expression (x))
8014 {
8015 case SYMBOL_SMALL_GOT_4G:
8016 asm_fprintf (asm_out_file, ":lo12:");
8017 break;
8018
8019 case SYMBOL_SMALL_TLSGD:
8020 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8021 break;
8022
8023 case SYMBOL_SMALL_TLSDESC:
8024 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8025 break;
8026
8027 case SYMBOL_SMALL_TLSIE:
8028 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8029 break;
8030
8031 case SYMBOL_TLSLE12:
8032 asm_fprintf (asm_out_file, ":tprel_lo12:");
8033 break;
8034
8035 case SYMBOL_TLSLE24:
8036 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8037 break;
8038
8039 case SYMBOL_TINY_GOT:
8040 asm_fprintf (asm_out_file, ":got:");
8041 break;
8042
8043 case SYMBOL_TINY_TLSIE:
8044 asm_fprintf (asm_out_file, ":gottprel:");
8045 break;
8046
8047 default:
8048 break;
8049 }
8050 output_addr_const (asm_out_file, x);
8051 break;
8052
8053 case 'G':
8054 switch (aarch64_classify_symbolic_expression (x))
8055 {
8056 case SYMBOL_TLSLE24:
8057 asm_fprintf (asm_out_file, ":tprel_hi12:");
8058 break;
8059 default:
8060 break;
8061 }
8062 output_addr_const (asm_out_file, x);
8063 break;
8064
8065 case 'k':
8066 {
8067 HOST_WIDE_INT cond_code;
8068
8069 if (!CONST_INT_P (x))
8070 {
8071 output_operand_lossage ("invalid operand for '%%%c'", code);
8072 return;
8073 }
8074
8075 cond_code = INTVAL (x);
8076 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8077 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8078 }
8079 break;
8080
8081 case 'y':
8082 case 'z':
8083 {
8084 machine_mode mode = GET_MODE (x);
8085
8086 if (GET_CODE (x) != MEM
8087 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8088 {
8089 output_operand_lossage ("invalid operand for '%%%c'", code);
8090 return;
8091 }
8092
8093 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8094 code == 'y'
8095 ? ADDR_QUERY_LDP_STP_N
8096 : ADDR_QUERY_LDP_STP))
8097 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8098 }
8099 break;
8100
8101 default:
8102 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8103 return;
8104 }
8105 }
8106
8107 /* Print address 'x' of a memory access with mode 'mode'.
8108 'type' is the query context passed to aarch64_classify_address: it is
8109 ADDR_QUERY_ANY for a normal memory access and an LDP/STP variant otherwise. */
8110 static bool
8111 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8112 aarch64_addr_query_type type)
8113 {
8114 struct aarch64_address_info addr;
8115 unsigned int size;
8116
8117 /* Check all addresses are Pmode - including ILP32. */
8118 if (GET_MODE (x) != Pmode
8119 && (!CONST_INT_P (x)
8120 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8121 {
8122 output_operand_lossage ("invalid address mode");
8123 return false;
8124 }
8125
8126 if (aarch64_classify_address (&addr, x, mode, true, type))
8127 switch (addr.type)
8128 {
8129 case ADDRESS_REG_IMM:
8130 if (known_eq (addr.const_offset, 0))
8131 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8132 else if (aarch64_sve_data_mode_p (mode))
8133 {
8134 HOST_WIDE_INT vnum
8135 = exact_div (addr.const_offset,
8136 BYTES_PER_SVE_VECTOR).to_constant ();
8137 asm_fprintf (f, "[%s, #%wd, mul vl]",
8138 reg_names[REGNO (addr.base)], vnum);
8139 }
8140 else if (aarch64_sve_pred_mode_p (mode))
8141 {
8142 HOST_WIDE_INT vnum
8143 = exact_div (addr.const_offset,
8144 BYTES_PER_SVE_PRED).to_constant ();
8145 asm_fprintf (f, "[%s, #%wd, mul vl]",
8146 reg_names[REGNO (addr.base)], vnum);
8147 }
8148 else
8149 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8150 INTVAL (addr.offset));
8151 return true;
8152
8153 case ADDRESS_REG_REG:
8154 if (addr.shift == 0)
8155 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8156 reg_names [REGNO (addr.offset)]);
8157 else
8158 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8159 reg_names [REGNO (addr.offset)], addr.shift);
8160 return true;
8161
8162 case ADDRESS_REG_UXTW:
8163 if (addr.shift == 0)
8164 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8165 REGNO (addr.offset) - R0_REGNUM);
8166 else
8167 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8168 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8169 return true;
8170
8171 case ADDRESS_REG_SXTW:
8172 if (addr.shift == 0)
8173 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8174 REGNO (addr.offset) - R0_REGNUM);
8175 else
8176 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8177 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8178 return true;
8179
8180 case ADDRESS_REG_WB:
8181 /* Writeback is only supported for fixed-width modes. */
8182 size = GET_MODE_SIZE (mode).to_constant ();
8183 switch (GET_CODE (x))
8184 {
8185 case PRE_INC:
8186 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8187 return true;
8188 case POST_INC:
8189 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8190 return true;
8191 case PRE_DEC:
8192 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8193 return true;
8194 case POST_DEC:
8195 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8196 return true;
8197 case PRE_MODIFY:
8198 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8199 INTVAL (addr.offset));
8200 return true;
8201 case POST_MODIFY:
8202 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8203 INTVAL (addr.offset));
8204 return true;
8205 default:
8206 break;
8207 }
8208 break;
8209
8210 case ADDRESS_LO_SUM:
8211 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8212 output_addr_const (f, addr.offset);
8213 asm_fprintf (f, "]");
8214 return true;
8215
8216 case ADDRESS_SYMBOLIC:
8217 output_addr_const (f, x);
8218 return true;
8219 }
8220
8221 return false;
8222 }
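/* With the formats above, a few representative renderings (the register
   numbers are arbitrary): a plain base prints as "[x0]", a scaled register
   index as "[x0, x1, lsl 3]", an SVE data-mode offset of two vectors as
   "[x0, #2, mul vl]", and a 16-byte pre-increment as "[x0, 16]!".  */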
8223
8224 /* Print address 'x' of a memory access with mode 'mode'. */
8225 static void
8226 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8227 {
8228 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8229 output_addr_const (f, x);
8230 }
8231
8232 bool
8233 aarch64_label_mentioned_p (rtx x)
8234 {
8235 const char *fmt;
8236 int i;
8237
8238 if (GET_CODE (x) == LABEL_REF)
8239 return true;
8240
8241 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8242 referencing instruction, but they are constant offsets, not
8243 symbols. */
8244 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8245 return false;
8246
8247 fmt = GET_RTX_FORMAT (GET_CODE (x));
8248 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8249 {
8250 if (fmt[i] == 'E')
8251 {
8252 int j;
8253
8254 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8255 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8256 return 1;
8257 }
8258 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8259 return 1;
8260 }
8261
8262 return 0;
8263 }
8264
8265 /* Implement REGNO_REG_CLASS. */
8266
8267 enum reg_class
8268 aarch64_regno_regclass (unsigned regno)
8269 {
8270 if (GP_REGNUM_P (regno))
8271 return GENERAL_REGS;
8272
8273 if (regno == SP_REGNUM)
8274 return STACK_REG;
8275
8276 if (regno == FRAME_POINTER_REGNUM
8277 || regno == ARG_POINTER_REGNUM)
8278 return POINTER_REGS;
8279
8280 if (FP_REGNUM_P (regno))
8281 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
8282
8283 if (PR_REGNUM_P (regno))
8284 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8285
8286 return NO_REGS;
8287 }
8288
8289 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8290 If OFFSET is out of range, return an offset of an anchor point
8291 that is in range. Return 0 otherwise. */
8292
8293 static HOST_WIDE_INT
8294 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8295 machine_mode mode)
8296 {
8297 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8298 if (size > 16)
8299 return (offset + 0x400) & ~0x7f0;
8300
8301 /* For offsets that aren't a multiple of the access size, the limit is
8302 -256...255. */
8303 if (offset & (size - 1))
8304 {
8305 /* BLKmode typically uses LDP of X-registers. */
8306 if (mode == BLKmode)
8307 return (offset + 512) & ~0x3ff;
8308 return (offset + 0x100) & ~0x1ff;
8309 }
8310
8311 /* Small negative offsets are supported. */
8312 if (IN_RANGE (offset, -256, 0))
8313 return 0;
8314
8315 if (mode == TImode || mode == TFmode)
8316 return (offset + 0x100) & ~0x1ff;
8317
8318 /* Use a 12-bit offset, scaled by the access size. */
8319 return offset & (~0xfff * size);
8320 }
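/* A minimal illustration of the anchoring above (hypothetical helper that
   nothing references): an aligned SImode offset of 0x13000 is outside the
   12-bit scaled range, so the anchor 0x10000 is peeled off and the
   remaining 0x3000 stays within the LDR/STR immediate range, whereas a
   small offset such as 0x100 needs no anchor at all.  */
static void ATTRIBUTE_UNUSED
example_anchor_offsets (void)
{
  gcc_assert (aarch64_anchor_offset (0x13000, 4, SImode) == 0x10000);
  gcc_assert (aarch64_anchor_offset (0x100, 4, SImode) == 0);
}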
8321
8322 static rtx
8323 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8324 {
8325 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8326 where mask is selected by alignment and size of the offset.
8327 We try to pick as large a range for the offset as possible to
8328 maximize the chance of a CSE. However, for aligned addresses
8329 we limit the range to 4k so that structures with different sized
8330 elements are likely to use the same base. We need to be careful
8331 not to split a CONST for some forms of address expression, otherwise
8332 it will generate sub-optimal code. */
8333
8334 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8335 {
8336 rtx base = XEXP (x, 0);
8337 rtx offset_rtx = XEXP (x, 1);
8338 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8339
8340 if (GET_CODE (base) == PLUS)
8341 {
8342 rtx op0 = XEXP (base, 0);
8343 rtx op1 = XEXP (base, 1);
8344
8345 /* Force any scaling into a temp for CSE. */
8346 op0 = force_reg (Pmode, op0);
8347 op1 = force_reg (Pmode, op1);
8348
8349 /* Let the pointer register be in op0. */
8350 if (REG_POINTER (op1))
8351 std::swap (op0, op1);
8352
8353 /* If the pointer is virtual or frame related, then we know that
8354 virtual register instantiation or register elimination is going
8355 to apply a second constant. We want the two constants folded
8356 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8357 if (virt_or_elim_regno_p (REGNO (op0)))
8358 {
8359 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8360 NULL_RTX, true, OPTAB_DIRECT);
8361 return gen_rtx_PLUS (Pmode, base, op1);
8362 }
8363
8364 /* Otherwise, in order to encourage CSE (and thence loop strength
8365 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8366 base = expand_binop (Pmode, add_optab, op0, op1,
8367 NULL_RTX, true, OPTAB_DIRECT);
8368 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8369 }
8370
8371 HOST_WIDE_INT size;
8372 if (GET_MODE_SIZE (mode).is_constant (&size))
8373 {
8374 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8375 mode);
8376 if (base_offset != 0)
8377 {
8378 base = plus_constant (Pmode, base, base_offset);
8379 base = force_operand (base, NULL_RTX);
8380 return plus_constant (Pmode, base, offset - base_offset);
8381 }
8382 }
8383 }
8384
8385 return x;
8386 }
8387
8388 static reg_class_t
8389 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8390 reg_class_t rclass,
8391 machine_mode mode,
8392 secondary_reload_info *sri)
8393 {
8394 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8395 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8396 comment at the head of aarch64-sve.md for more details about the
8397 big-endian handling. */
8398 if (BYTES_BIG_ENDIAN
8399 && reg_class_subset_p (rclass, FP_REGS)
8400 && !((REG_P (x) && HARD_REGISTER_P (x))
8401 || aarch64_simd_valid_immediate (x, NULL))
8402 && aarch64_sve_data_mode_p (mode))
8403 {
8404 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8405 return NO_REGS;
8406 }
8407
8408 /* If we have to disable direct literal pool loads and stores because the
8409 function is too big, then we need a scratch register. */
8410 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8411 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8412 || targetm.vector_mode_supported_p (GET_MODE (x)))
8413 && !aarch64_pcrelative_literal_loads)
8414 {
8415 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8416 return NO_REGS;
8417 }
8418
8419 /* Without the TARGET_SIMD instructions we cannot move a Q register
8420 to a Q register directly. We need a scratch. */
8421 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8422 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8423 && reg_class_subset_p (rclass, FP_REGS))
8424 {
8425 sri->icode = code_for_aarch64_reload_mov (mode);
8426 return NO_REGS;
8427 }
8428
8429 /* A TFmode or TImode memory access should be handled via an FP register
8430 because AArch64 has richer addressing modes for LDR/STR instructions
8431 than for LDP/STP instructions. */
8432 if (TARGET_FLOAT && rclass == GENERAL_REGS
8433 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8434 return FP_REGS;
8435
8436 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8437 return GENERAL_REGS;
8438
8439 return NO_REGS;
8440 }
8441
8442 static bool
8443 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8444 {
8445 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8446
8447 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8448 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8449 if (frame_pointer_needed)
8450 return to == HARD_FRAME_POINTER_REGNUM;
8451 return true;
8452 }
8453
8454 poly_int64
8455 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8456 {
8457 if (to == HARD_FRAME_POINTER_REGNUM)
8458 {
8459 if (from == ARG_POINTER_REGNUM)
8460 return cfun->machine->frame.hard_fp_offset;
8461
8462 if (from == FRAME_POINTER_REGNUM)
8463 return cfun->machine->frame.hard_fp_offset
8464 - cfun->machine->frame.locals_offset;
8465 }
8466
8467 if (to == STACK_POINTER_REGNUM)
8468 {
8469 if (from == FRAME_POINTER_REGNUM)
8470 return cfun->machine->frame.frame_size
8471 - cfun->machine->frame.locals_offset;
8472 }
8473
8474 return cfun->machine->frame.frame_size;
8475 }
8476
8477 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8478 previous frame. */
8479
8480 rtx
8481 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8482 {
8483 if (count != 0)
8484 return const0_rtx;
8485 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8486 }
8487
8488
8489 static void
8490 aarch64_asm_trampoline_template (FILE *f)
8491 {
8492 int offset1 = 16;
8493 int offset2 = 20;
8494
8495 if (aarch64_bti_enabled ())
8496 {
8497 asm_fprintf (f, "\thint\t34 // bti c\n");
8498 offset1 -= 4;
8499 offset2 -= 4;
8500 }
8501
8502 if (TARGET_ILP32)
8503 {
8504 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8505 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8506 offset1);
8507 }
8508 else
8509 {
8510 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8511 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8512 offset2);
8513 }
8514 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8515
8516 /* The trampoline needs an extra padding instruction. If BTI is
8517 enabled, the padding instruction is replaced by the BTI instruction at
8518 the beginning. */
8519 if (!aarch64_bti_enabled ())
8520 assemble_aligned_integer (4, const0_rtx);
8521
8522 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8523 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8524 }
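/* For the LP64, non-BTI case the template above therefore lays out a
   32-byte trampoline along these lines (IP1 is x17 and the static chain
   register is x18 under the usual register assignments):

      0:  ldr     x17, .+16       // load the target address from byte 16
      4:  ldr     x18, .+20       // load the static chain from byte 24
      8:  br      x17
     12:  .word   0               // padding
     16:  .dword  <function address, filled in by the init hook below>
     24:  .dword  <static chain value>

   With BTI enabled, a leading "hint 34" (bti c) replaces the padding word
   and both literal offsets shrink by 4, so the loads still hit the same
   data words.  */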
8525
8526 static void
8527 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8528 {
8529 rtx fnaddr, mem, a_tramp;
8530 const int tramp_code_sz = 16;
8531
8532 /* Don't need to copy the trailing D-words; we fill those in below. */
8533 emit_block_move (m_tramp, assemble_trampoline_template (),
8534 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8535 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8536 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8537 if (GET_MODE (fnaddr) != ptr_mode)
8538 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8539 emit_move_insn (mem, fnaddr);
8540
8541 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8542 emit_move_insn (mem, chain_value);
8543
8544 /* XXX We should really define a "clear_cache" pattern and use
8545 gen_clear_cache(). */
8546 a_tramp = XEXP (m_tramp, 0);
8547 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8548 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8549 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8550 ptr_mode);
8551 }
8552
8553 static unsigned char
8554 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8555 {
8556 /* ??? Logically we should only need to provide a value when
8557 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8558 can hold MODE, but at the moment we need to handle all modes.
8559 Just ignore any runtime parts for registers that can't store them. */
8560 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8561 unsigned int nregs;
8562 switch (regclass)
8563 {
8564 case TAILCALL_ADDR_REGS:
8565 case POINTER_REGS:
8566 case GENERAL_REGS:
8567 case ALL_REGS:
8568 case POINTER_AND_FP_REGS:
8569 case FP_REGS:
8570 case FP_LO_REGS:
8571 if (aarch64_sve_data_mode_p (mode)
8572 && constant_multiple_p (GET_MODE_SIZE (mode),
8573 BYTES_PER_SVE_VECTOR, &nregs))
8574 return nregs;
8575 return (aarch64_vector_data_mode_p (mode)
8576 ? CEIL (lowest_size, UNITS_PER_VREG)
8577 : CEIL (lowest_size, UNITS_PER_WORD));
8578 case STACK_REG:
8579 case PR_REGS:
8580 case PR_LO_REGS:
8581 case PR_HI_REGS:
8582 return 1;
8583
8584 case NO_REGS:
8585 return 0;
8586
8587 default:
8588 break;
8589 }
8590 gcc_unreachable ();
8591 }
8592
8593 static reg_class_t
8594 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8595 {
8596 if (regclass == POINTER_REGS)
8597 return GENERAL_REGS;
8598
8599 if (regclass == STACK_REG)
8600 {
8601 if (REG_P(x)
8602 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8603 return regclass;
8604
8605 return NO_REGS;
8606 }
8607
8608 /* Register elimination can result in a request for
8609 SP+constant->FP_REGS. We cannot support such operations, which
8610 use SP as the source and an FP_REG as the destination, so reject
8611 them outright. */
8612 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8613 {
8614 rtx lhs = XEXP (x, 0);
8615
8616 /* Look through a possible SUBREG introduced by ILP32. */
8617 if (GET_CODE (lhs) == SUBREG)
8618 lhs = SUBREG_REG (lhs);
8619
8620 gcc_assert (REG_P (lhs));
8621 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8622 POINTER_REGS));
8623 return NO_REGS;
8624 }
8625
8626 return regclass;
8627 }
8628
8629 void
8630 aarch64_asm_output_labelref (FILE* f, const char *name)
8631 {
8632 asm_fprintf (f, "%U%s", name);
8633 }
8634
8635 static void
8636 aarch64_elf_asm_constructor (rtx symbol, int priority)
8637 {
8638 if (priority == DEFAULT_INIT_PRIORITY)
8639 default_ctor_section_asm_out_constructor (symbol, priority);
8640 else
8641 {
8642 section *s;
8643 /* Priority is known to be in the range [0, 65535], so 18 bytes
8644 would be enough, but the compiler might not know that. To avoid
8645 a -Wformat-truncation false positive, use a larger size. */
8646 char buf[23];
8647 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8648 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8649 switch_to_section (s);
8650 assemble_align (POINTER_SIZE);
8651 assemble_aligned_integer (POINTER_BYTES, symbol);
8652 }
8653 }
8654
8655 static void
8656 aarch64_elf_asm_destructor (rtx symbol, int priority)
8657 {
8658 if (priority == DEFAULT_INIT_PRIORITY)
8659 default_dtor_section_asm_out_destructor (symbol, priority);
8660 else
8661 {
8662 section *s;
8663 /* Priority is known to be in the range [0, 65535], so 18 bytes
8664 would be enough, but the compiler might not know that. To avoid
8665 a -Wformat-truncation false positive, use a larger size. */
8666 char buf[23];
8667 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8668 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8669 switch_to_section (s);
8670 assemble_align (POINTER_SIZE);
8671 assemble_aligned_integer (POINTER_BYTES, symbol);
8672 }
8673 }
8674
8675 const char*
8676 aarch64_output_casesi (rtx *operands)
8677 {
8678 char buf[100];
8679 char label[100];
8680 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8681 int index;
8682 static const char *const patterns[4][2] =
8683 {
8684 {
8685 "ldrb\t%w3, [%0,%w1,uxtw]",
8686 "add\t%3, %4, %w3, sxtb #2"
8687 },
8688 {
8689 "ldrh\t%w3, [%0,%w1,uxtw #1]",
8690 "add\t%3, %4, %w3, sxth #2"
8691 },
8692 {
8693 "ldr\t%w3, [%0,%w1,uxtw #2]",
8694 "add\t%3, %4, %w3, sxtw #2"
8695 },
8696 /* We assume that DImode is only generated when not optimizing and
8697 that we don't really need 64-bit address offsets. That would
8698 imply an object file with 8GB of code in a single function! */
8699 {
8700 "ldr\t%w3, [%0,%w1,uxtw #2]",
8701 "add\t%3, %4, %w3, sxtw #2"
8702 }
8703 };
8704
8705 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8706
8707 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8708 index = exact_log2 (GET_MODE_SIZE (mode));
8709
8710 gcc_assert (index >= 0 && index <= 3);
8711
8712 /* Need to implement table size reduction, by changing the code below. */
8713 output_asm_insn (patterns[index][0], operands);
8714 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8715 snprintf (buf, sizeof (buf),
8716 "adr\t%%4, %s", targetm.strip_name_encoding (label));
8717 output_asm_insn (buf, operands);
8718 output_asm_insn (patterns[index][1], operands);
8719 output_asm_insn ("br\t%3", operands);
8720 assemble_label (asm_out_file, label);
8721 return "";
8722 }
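/* For an HImode dispatch table, for example, the sequence printed above
   comes out along these lines (the register numbers and label number
   depend on the operands):

      ldrh    w3, [x0,w1,uxtw #1]
      adr     x4, .Lrtx<N>
      add     x3, x4, w3, sxth #2
      br      x3
   .Lrtx<N>:

   i.e. a halfword table entry is sign-extended, scaled by 4 and added to
   the address of the label emitted just after the branch.  */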
8723
8724
8725 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8726 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8727 operator. */
8728
8729 int
8730 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8731 {
8732 if (shift >= 0 && shift <= 3)
8733 {
8734 int size;
8735 for (size = 8; size <= 32; size *= 2)
8736 {
8737 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8738 if (mask == bits << shift)
8739 return size;
8740 }
8741 }
8742 return 0;
8743 }
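/* A small sketch of the masks accepted above (hypothetical helper that
   nothing references): a contiguous 8-, 16- or 32-bit mask starting at
   bit SHIFT is recognised, anything else is rejected.  */
static void ATTRIBUTE_UNUSED
example_uxt_size_checks (void)
{
  gcc_assert (aarch64_uxt_size (2, (HOST_WIDE_INT) 0xff << 2) == 8);
  gcc_assert (aarch64_uxt_size (3, (HOST_WIDE_INT) 0xffff << 3) == 16);
  gcc_assert (aarch64_uxt_size (1, (HOST_WIDE_INT) 0xff << 2) == 0);
}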
8744
8745 /* Constant pools are per-function only when PC-relative
8746 literal loads are enabled or we are using the large memory
8747 model. */
8748
8749 static inline bool
8750 aarch64_can_use_per_function_literal_pools_p (void)
8751 {
8752 return (aarch64_pcrelative_literal_loads
8753 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8754 }
8755
8756 static bool
8757 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8758 {
8759 /* We can't use blocks for constants when we're using a per-function
8760 constant pool. */
8761 return !aarch64_can_use_per_function_literal_pools_p ();
8762 }
8763
8764 /* Select appropriate section for constants depending
8765 on where we place literal pools. */
8766
8767 static section *
8768 aarch64_select_rtx_section (machine_mode mode,
8769 rtx x,
8770 unsigned HOST_WIDE_INT align)
8771 {
8772 if (aarch64_can_use_per_function_literal_pools_p ())
8773 return function_section (current_function_decl);
8774
8775 return default_elf_select_rtx_section (mode, x, align);
8776 }
8777
8778 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
8779 void
8780 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8781 HOST_WIDE_INT offset)
8782 {
8783 /* When using per-function literal pools, we must ensure that any code
8784 section is aligned to the minimal instruction length, lest we get
8785 errors from the assembler re "unaligned instructions". */
8786 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8787 ASM_OUTPUT_ALIGN (f, 2);
8788 }
8789
8790 /* Costs. */
8791
8792 /* Helper function for rtx cost calculation. Strip a shift expression
8793 from X. Returns the inner operand if successful, or the original
8794 expression on failure. */
8795 static rtx
8796 aarch64_strip_shift (rtx x)
8797 {
8798 rtx op = x;
8799
8800 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8801 we can convert both to ROR during final output. */
8802 if ((GET_CODE (op) == ASHIFT
8803 || GET_CODE (op) == ASHIFTRT
8804 || GET_CODE (op) == LSHIFTRT
8805 || GET_CODE (op) == ROTATERT
8806 || GET_CODE (op) == ROTATE)
8807 && CONST_INT_P (XEXP (op, 1)))
8808 return XEXP (op, 0);
8809
8810 if (GET_CODE (op) == MULT
8811 && CONST_INT_P (XEXP (op, 1))
8812 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8813 return XEXP (op, 0);
8814
8815 return x;
8816 }
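/* For example, both (ashift:DI (reg:DI x1) (const_int 3)) and the
   equivalent (mult:DI (reg:DI x1) (const_int 8)) strip down to
   (reg:DI x1), since 8 is an exact power of two; a MULT by a constant
   that is not a power of two is returned unchanged.  */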
8817
8818 /* Helper function for rtx cost calculation. Strip an extend
8819 expression from X. Returns the inner operand if successful, or the
8820 original expression on failure. We deal with a number of possible
8821 canonicalization variations here. If STRIP_SHIFT is true, then
8822 we can strip off a shift also. */
8823 static rtx
8824 aarch64_strip_extend (rtx x, bool strip_shift)
8825 {
8826 scalar_int_mode mode;
8827 rtx op = x;
8828
8829 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8830 return op;
8831
8832 /* Zero and sign extraction of a widened value. */
8833 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8834 && XEXP (op, 2) == const0_rtx
8835 && GET_CODE (XEXP (op, 0)) == MULT
8836 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8837 XEXP (op, 1)))
8838 return XEXP (XEXP (op, 0), 0);
8839
8840 /* It can also be represented (for zero-extend) as an AND with an
8841 immediate. */
8842 if (GET_CODE (op) == AND
8843 && GET_CODE (XEXP (op, 0)) == MULT
8844 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8845 && CONST_INT_P (XEXP (op, 1))
8846 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8847 INTVAL (XEXP (op, 1))) != 0)
8848 return XEXP (XEXP (op, 0), 0);
8849
8850 /* Now handle extended register, as this may also have an optional
8851 left shift by 1..4. */
8852 if (strip_shift
8853 && GET_CODE (op) == ASHIFT
8854 && CONST_INT_P (XEXP (op, 1))
8855 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8856 op = XEXP (op, 0);
8857
8858 if (GET_CODE (op) == ZERO_EXTEND
8859 || GET_CODE (op) == SIGN_EXTEND)
8860 op = XEXP (op, 0);
8861
8862 if (op != x)
8863 return op;
8864
8865 return x;
8866 }
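/* For example, (zero_extend:DI (reg:SI w1)) strips down to (reg:SI w1),
   and with STRIP_SHIFT true so does
   (ashift:DI (sign_extend:DI (reg:SI w1)) (const_int 2)), matching the
   extended-register forms with their optional left shift of up to 4.  */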
8867
8868 /* Return true iff CODE is a shift supported in combination
8869 with arithmetic instructions. */
8870
8871 static bool
8872 aarch64_shift_p (enum rtx_code code)
8873 {
8874 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8875 }
8876
8877
8878 /* Return true iff X is a cheap shift without a sign extend. */
8879
8880 static bool
8881 aarch64_cheap_mult_shift_p (rtx x)
8882 {
8883 rtx op0, op1;
8884
8885 op0 = XEXP (x, 0);
8886 op1 = XEXP (x, 1);
8887
8888 if (!(aarch64_tune_params.extra_tuning_flags
8889 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8890 return false;
8891
8892 if (GET_CODE (op0) == SIGN_EXTEND)
8893 return false;
8894
8895 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8896 && UINTVAL (op1) <= 4)
8897 return true;
8898
8899 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8900 return false;
8901
8902 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8903
8904 if (l2 > 0 && l2 <= 4)
8905 return true;
8906
8907 return false;
8908 }
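/* For instance, with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND enabled,
   (mult:DI (reg:DI x1) (const_int 8)) is considered cheap (log2 is 3,
   within the 1..4 range), whereas a multiply by 32 (log2 of 5) or any
   form whose first operand is a SIGN_EXTEND is not.  */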
8909
8910 /* Helper function for rtx cost calculation. Calculate the cost of
8911 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8912 Return the calculated cost of the expression, recursing manually into
8913 operands where needed. */
8914
8915 static int
8916 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8917 {
8918 rtx op0, op1;
8919 const struct cpu_cost_table *extra_cost
8920 = aarch64_tune_params.insn_extra_cost;
8921 int cost = 0;
8922 bool compound_p = (outer == PLUS || outer == MINUS);
8923 machine_mode mode = GET_MODE (x);
8924
8925 gcc_checking_assert (code == MULT);
8926
8927 op0 = XEXP (x, 0);
8928 op1 = XEXP (x, 1);
8929
8930 if (VECTOR_MODE_P (mode))
8931 mode = GET_MODE_INNER (mode);
8932
8933 /* Integer multiply/fma. */
8934 if (GET_MODE_CLASS (mode) == MODE_INT)
8935 {
8936 /* The multiply will be canonicalized as a shift, cost it as such. */
8937 if (aarch64_shift_p (GET_CODE (x))
8938 || (CONST_INT_P (op1)
8939 && exact_log2 (INTVAL (op1)) > 0))
8940 {
8941 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8942 || GET_CODE (op0) == SIGN_EXTEND;
8943 if (speed)
8944 {
8945 if (compound_p)
8946 {
8947 /* If the shift is considered cheap,
8948 then don't add any cost. */
8949 if (aarch64_cheap_mult_shift_p (x))
8950 ;
8951 else if (REG_P (op1))
8952 /* ARITH + shift-by-register. */
8953 cost += extra_cost->alu.arith_shift_reg;
8954 else if (is_extend)
8955 /* ARITH + extended register. We don't have a cost field
8956 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8957 cost += extra_cost->alu.extend_arith;
8958 else
8959 /* ARITH + shift-by-immediate. */
8960 cost += extra_cost->alu.arith_shift;
8961 }
8962 else
8963 /* LSL (immediate). */
8964 cost += extra_cost->alu.shift;
8965
8966 }
8967 /* Strip extends as we will have costed them in the case above. */
8968 if (is_extend)
8969 op0 = aarch64_strip_extend (op0, true);
8970
8971 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8972
8973 return cost;
8974 }
8975
8976 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8977 compound and let the below cases handle it. After all, MNEG is a
8978 special-case alias of MSUB. */
8979 if (GET_CODE (op0) == NEG)
8980 {
8981 op0 = XEXP (op0, 0);
8982 compound_p = true;
8983 }
8984
8985 /* Integer multiplies or FMAs have zero/sign extending variants. */
8986 if ((GET_CODE (op0) == ZERO_EXTEND
8987 && GET_CODE (op1) == ZERO_EXTEND)
8988 || (GET_CODE (op0) == SIGN_EXTEND
8989 && GET_CODE (op1) == SIGN_EXTEND))
8990 {
8991 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8992 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8993
8994 if (speed)
8995 {
8996 if (compound_p)
8997 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8998 cost += extra_cost->mult[0].extend_add;
8999 else
9000 /* MUL/SMULL/UMULL. */
9001 cost += extra_cost->mult[0].extend;
9002 }
9003
9004 return cost;
9005 }
9006
9007 /* This is either an integer multiply or a MADD. In both cases
9008 we want to recurse and cost the operands. */
9009 cost += rtx_cost (op0, mode, MULT, 0, speed);
9010 cost += rtx_cost (op1, mode, MULT, 1, speed);
9011
9012 if (speed)
9013 {
9014 if (compound_p)
9015 /* MADD/MSUB. */
9016 cost += extra_cost->mult[mode == DImode].add;
9017 else
9018 /* MUL. */
9019 cost += extra_cost->mult[mode == DImode].simple;
9020 }
9021
9022 return cost;
9023 }
9024 else
9025 {
9026 if (speed)
9027 {
9028 /* Floating-point FMA/FMUL can also support negations of the
9029 operands, unless the rounding mode is upward or downward in
9030 which case FNMUL is different from FMUL with operand negation. */
9031 bool neg0 = GET_CODE (op0) == NEG;
9032 bool neg1 = GET_CODE (op1) == NEG;
9033 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9034 {
9035 if (neg0)
9036 op0 = XEXP (op0, 0);
9037 if (neg1)
9038 op1 = XEXP (op1, 0);
9039 }
9040
9041 if (compound_p)
9042 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9043 cost += extra_cost->fp[mode == DFmode].fma;
9044 else
9045 /* FMUL/FNMUL. */
9046 cost += extra_cost->fp[mode == DFmode].mult;
9047 }
9048
9049 cost += rtx_cost (op0, mode, MULT, 0, speed);
9050 cost += rtx_cost (op1, mode, MULT, 1, speed);
9051 return cost;
9052 }
9053 }
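/* As a rough example: when the PLUS case of aarch64_rtx_costs passes
   (mult:SI (reg) (const_int 4)) here with OUTER == PLUS, the multiply is
   treated as LSL #2 and the combination is costed as an ADD (shifted
   register), i.e. extra_cost->alu.arith_shift, unless the tuning marks
   such shifts as cheap.  */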
9054
9055 static int
9056 aarch64_address_cost (rtx x,
9057 machine_mode mode,
9058 addr_space_t as ATTRIBUTE_UNUSED,
9059 bool speed)
9060 {
9061 enum rtx_code c = GET_CODE (x);
9062 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9063 struct aarch64_address_info info;
9064 int cost = 0;
9065 info.shift = 0;
9066
9067 if (!aarch64_classify_address (&info, x, mode, false))
9068 {
9069 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9070 {
9071 /* This is a CONST or SYMBOL ref which will be split
9072 in a different way depending on the code model in use.
9073 Cost it through the generic infrastructure. */
9074 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9075 /* Divide through by the cost of one instruction to
9076 bring it to the same units as the address costs. */
9077 cost_symbol_ref /= COSTS_N_INSNS (1);
9078 /* The cost is then the cost of preparing the address,
9079 followed by an immediate (possibly 0) offset. */
9080 return cost_symbol_ref + addr_cost->imm_offset;
9081 }
9082 else
9083 {
9084 /* This is most likely a jump table from a case
9085 statement. */
9086 return addr_cost->register_offset;
9087 }
9088 }
9089
9090 switch (info.type)
9091 {
9092 case ADDRESS_LO_SUM:
9093 case ADDRESS_SYMBOLIC:
9094 case ADDRESS_REG_IMM:
9095 cost += addr_cost->imm_offset;
9096 break;
9097
9098 case ADDRESS_REG_WB:
9099 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9100 cost += addr_cost->pre_modify;
9101 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9102 cost += addr_cost->post_modify;
9103 else
9104 gcc_unreachable ();
9105
9106 break;
9107
9108 case ADDRESS_REG_REG:
9109 cost += addr_cost->register_offset;
9110 break;
9111
9112 case ADDRESS_REG_SXTW:
9113 cost += addr_cost->register_sextend;
9114 break;
9115
9116 case ADDRESS_REG_UXTW:
9117 cost += addr_cost->register_zextend;
9118 break;
9119
9120 default:
9121 gcc_unreachable ();
9122 }
9123
9124
9125 if (info.shift > 0)
9126 {
9127 /* For the sake of calculating the cost of the shifted register
9128 component, we can treat same sized modes in the same way. */
9129 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9130 cost += addr_cost->addr_scale_costs.hi;
9131 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9132 cost += addr_cost->addr_scale_costs.si;
9133 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9134 cost += addr_cost->addr_scale_costs.di;
9135 else
9136 /* We can't tell, or this is a 128-bit vector. */
9137 cost += addr_cost->addr_scale_costs.ti;
9138 }
9139
9140 return cost;
9141 }
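/* For instance, an SImode access through [x0, w1, sxtw #2] is expected
   to classify as ADDRESS_REG_SXTW with a shift of 2, giving roughly
   register_sextend plus addr_scale_costs.si, whereas the plain
   register-offset form [x0, x1] costs register_offset with no scaling
   component.  */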
9142
9143 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9144 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9145 to be taken. */
9146
9147 int
9148 aarch64_branch_cost (bool speed_p, bool predictable_p)
9149 {
9150 /* When optimizing for speed, use the cost of unpredictable branches. */
9151 const struct cpu_branch_cost *branch_costs =
9152 aarch64_tune_params.branch_costs;
9153
9154 if (!speed_p || predictable_p)
9155 return branch_costs->predictable;
9156 else
9157 return branch_costs->unpredictable;
9158 }
9159
9160 /* Return true if the RTX X in mode MODE is a zero or sign extract
9161 usable in an ADD or SUB (extended register) instruction. */
9162 static bool
9163 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9164 {
9165 /* Catch add with a sign extract.
9166 This is add_<optab><mode>_multp2. */
9167 if (GET_CODE (x) == SIGN_EXTRACT
9168 || GET_CODE (x) == ZERO_EXTRACT)
9169 {
9170 rtx op0 = XEXP (x, 0);
9171 rtx op1 = XEXP (x, 1);
9172 rtx op2 = XEXP (x, 2);
9173
9174 if (GET_CODE (op0) == MULT
9175 && CONST_INT_P (op1)
9176 && op2 == const0_rtx
9177 && CONST_INT_P (XEXP (op0, 1))
9178 && aarch64_is_extend_from_extract (mode,
9179 XEXP (op0, 1),
9180 op1))
9181 {
9182 return true;
9183 }
9184 }
9185 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9186 No shift. */
9187 else if (GET_CODE (x) == SIGN_EXTEND
9188 || GET_CODE (x) == ZERO_EXTEND)
9189 return REG_P (XEXP (x, 0));
9190
9191 return false;
9192 }
9193
9194 static bool
9195 aarch64_frint_unspec_p (unsigned int u)
9196 {
9197 switch (u)
9198 {
9199 case UNSPEC_FRINTZ:
9200 case UNSPEC_FRINTP:
9201 case UNSPEC_FRINTM:
9202 case UNSPEC_FRINTA:
9203 case UNSPEC_FRINTN:
9204 case UNSPEC_FRINTX:
9205 case UNSPEC_FRINTI:
9206 return true;
9207
9208 default:
9209 return false;
9210 }
9211 }
9212
9213 /* Return true iff X is an rtx that will match an extr instruction
9214 i.e. as described in the *extr<mode>5_insn family of patterns.
9215 OP0 and OP1 will be set to the operands of the shifts involved
9216 on success and will be NULL_RTX otherwise. */
9217
9218 static bool
9219 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9220 {
9221 rtx op0, op1;
9222 scalar_int_mode mode;
9223 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9224 return false;
9225
9226 *res_op0 = NULL_RTX;
9227 *res_op1 = NULL_RTX;
9228
9229 if (GET_CODE (x) != IOR)
9230 return false;
9231
9232 op0 = XEXP (x, 0);
9233 op1 = XEXP (x, 1);
9234
9235 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9236 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9237 {
9238 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9239 if (GET_CODE (op1) == ASHIFT)
9240 std::swap (op0, op1);
9241
9242 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9243 return false;
9244
9245 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9246 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9247
9248 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9249 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9250 {
9251 *res_op0 = XEXP (op0, 0);
9252 *res_op1 = XEXP (op1, 0);
9253 return true;
9254 }
9255 }
9256
9257 return false;
9258 }
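/* For example, (ior:DI (ashift:DI (reg x1) (const_int 48))
   (lshiftrt:DI (reg x2) (const_int 16))) matches, since the two shift
   amounts sum to 64: *RES_OP0 is set to x1, *RES_OP1 to x2, and the
   expression corresponds to an EXTR with an immediate of 16.  */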
9259
9260 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9261 storing it in *COST. Result is true if the total cost of the operation
9262 has now been calculated. */
9263 static bool
9264 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9265 {
9266 rtx inner;
9267 rtx comparator;
9268 enum rtx_code cmpcode;
9269
9270 if (COMPARISON_P (op0))
9271 {
9272 inner = XEXP (op0, 0);
9273 comparator = XEXP (op0, 1);
9274 cmpcode = GET_CODE (op0);
9275 }
9276 else
9277 {
9278 inner = op0;
9279 comparator = const0_rtx;
9280 cmpcode = NE;
9281 }
9282
9283 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9284 {
9285 /* Conditional branch. */
9286 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9287 return true;
9288 else
9289 {
9290 if (cmpcode == NE || cmpcode == EQ)
9291 {
9292 if (comparator == const0_rtx)
9293 {
9294 /* TBZ/TBNZ/CBZ/CBNZ. */
9295 if (GET_CODE (inner) == ZERO_EXTRACT)
9296 /* TBZ/TBNZ. */
9297 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9298 ZERO_EXTRACT, 0, speed);
9299 else
9300 /* CBZ/CBNZ. */
9301 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9302
9303 return true;
9304 }
9305 }
9306 else if (cmpcode == LT || cmpcode == GE)
9307 {
9308 /* TBZ/TBNZ. */
9309 if (comparator == const0_rtx)
9310 return true;
9311 }
9312 }
9313 }
9314 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9315 {
9316 /* CCMP. */
9317 if (GET_CODE (op1) == COMPARE)
9318 {
9319 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9320 if (XEXP (op1, 1) == const0_rtx)
9321 *cost += 1;
9322 if (speed)
9323 {
9324 machine_mode mode = GET_MODE (XEXP (op1, 0));
9325 const struct cpu_cost_table *extra_cost
9326 = aarch64_tune_params.insn_extra_cost;
9327
9328 if (GET_MODE_CLASS (mode) == MODE_INT)
9329 *cost += extra_cost->alu.arith;
9330 else
9331 *cost += extra_cost->fp[mode == DFmode].compare;
9332 }
9333 return true;
9334 }
9335
9336 /* It's a conditional operation based on the status flags,
9337 so it must be some flavor of CSEL. */
9338
9339 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9340 if (GET_CODE (op1) == NEG
9341 || GET_CODE (op1) == NOT
9342 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9343 op1 = XEXP (op1, 0);
9344 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9345 {
9346 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9347 op1 = XEXP (op1, 0);
9348 op2 = XEXP (op2, 0);
9349 }
9350
9351 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9352 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9353 return true;
9354 }
9355
9356 /* We don't know what this is; cost all operands. */
9357 return false;
9358 }
9359
9360 /* Check whether X is a bitfield operation of the form shift + extend that
9361 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9362 operand to which the bitfield operation is applied. Otherwise return
9363 NULL_RTX. */
9364
9365 static rtx
9366 aarch64_extend_bitfield_pattern_p (rtx x)
9367 {
9368 rtx_code outer_code = GET_CODE (x);
9369 machine_mode outer_mode = GET_MODE (x);
9370
9371 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9372 && outer_mode != SImode && outer_mode != DImode)
9373 return NULL_RTX;
9374
9375 rtx inner = XEXP (x, 0);
9376 rtx_code inner_code = GET_CODE (inner);
9377 machine_mode inner_mode = GET_MODE (inner);
9378 rtx op = NULL_RTX;
9379
9380 switch (inner_code)
9381 {
9382 case ASHIFT:
9383 if (CONST_INT_P (XEXP (inner, 1))
9384 && (inner_mode == QImode || inner_mode == HImode))
9385 op = XEXP (inner, 0);
9386 break;
9387 case LSHIFTRT:
9388 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9389 && (inner_mode == QImode || inner_mode == HImode))
9390 op = XEXP (inner, 0);
9391 break;
9392 case ASHIFTRT:
9393 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9394 && (inner_mode == QImode || inner_mode == HImode))
9395 op = XEXP (inner, 0);
9396 break;
9397 default:
9398 break;
9399 }
9400
9401 return op;
9402 }
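/* For example, (zero_extend:SI (ashift:HI (reg:HI x) (const_int 3)))
   returns (reg:HI x): the shift-then-extend maps onto a UBFIZ.
   Similarly, LSHIFTRT under a ZERO_EXTEND maps onto a UBFX, and the
   SIGN_EXTEND forms onto the signed SBFIZ/SBFX counterparts.  */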
9403
9404 /* Return true if the mask and a shift amount from an RTX of the form
9405 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9406 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9407
9408 bool
9409 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9410 rtx shft_amnt)
9411 {
9412 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9413 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9414 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9415 && (INTVAL (mask)
9416 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9417 }
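/* Worked example for SImode: MASK = 0xff0 with SHFT_AMNT = 4 passes all
   three checks (4 < 32; (0xff0 >> 4) + 1 == 0x100 is a power of two;
   the low four bits of the mask are clear), matching a UBFIZ with lsb 4
   and width 8.  A mask of 0xff4 would fail the final check.  */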
9418
9419 /* Return true if the masks and a shift amount from an RTX of the form
9420 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9421 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9422
9423 bool
9424 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9425 unsigned HOST_WIDE_INT mask1,
9426 unsigned HOST_WIDE_INT shft_amnt,
9427 unsigned HOST_WIDE_INT mask2)
9428 {
9429 unsigned HOST_WIDE_INT t;
9430
9431 /* Verify that there is no overlap in what bits are set in the two masks. */
9432 if (mask1 != ~mask2)
9433 return false;
9434
9435 /* Verify that mask2 is not all zeros or ones. */
9436 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9437 return false;
9438
9439 /* The shift amount should always be less than the mode size. */
9440 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9441
9442 /* Verify that the mask being shifted is contiguous and would be in the
9443 least significant bits after shifting by shft_amnt. */
9444 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9445 return (t == (t & -t));
9446 }
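/* Worked example of the contiguity check: with MASK2 = 0xff00,
   SHFT_AMNT = 8 and MASK1 the complement of MASK2, T becomes
   0xff00 + 0x100 = 0x10000, a power of two, so the test passes.  A
   non-contiguous MASK2 such as 0xf0f0 would leave extra bits set in T
   and fail.  */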
9447
9448 /* Calculate the cost of calculating X, storing it in *COST. Result
9449 is true if the total cost of the operation has now been calculated. */
9450 static bool
9451 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9452 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9453 {
9454 rtx op0, op1, op2;
9455 const struct cpu_cost_table *extra_cost
9456 = aarch64_tune_params.insn_extra_cost;
9457 int code = GET_CODE (x);
9458 scalar_int_mode int_mode;
9459
9460 /* By default, assume that everything has equivalent cost to the
9461 cheapest instruction. Any additional costs are applied as a delta
9462 above this default. */
9463 *cost = COSTS_N_INSNS (1);
9464
9465 switch (code)
9466 {
9467 case SET:
9468 /* The cost depends entirely on the operands to SET. */
9469 *cost = 0;
9470 op0 = SET_DEST (x);
9471 op1 = SET_SRC (x);
9472
9473 switch (GET_CODE (op0))
9474 {
9475 case MEM:
9476 if (speed)
9477 {
9478 rtx address = XEXP (op0, 0);
9479 if (VECTOR_MODE_P (mode))
9480 *cost += extra_cost->ldst.storev;
9481 else if (GET_MODE_CLASS (mode) == MODE_INT)
9482 *cost += extra_cost->ldst.store;
9483 else if (mode == SFmode)
9484 *cost += extra_cost->ldst.storef;
9485 else if (mode == DFmode)
9486 *cost += extra_cost->ldst.stored;
9487
9488 *cost +=
9489 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9490 0, speed));
9491 }
9492
9493 *cost += rtx_cost (op1, mode, SET, 1, speed);
9494 return true;
9495
9496 case SUBREG:
9497 if (! REG_P (SUBREG_REG (op0)))
9498 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9499
9500 /* Fall through. */
9501 case REG:
9502 /* The cost is one per vector-register copied. */
9503 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9504 {
9505 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9506 *cost = COSTS_N_INSNS (nregs);
9507 }
9508 /* const0_rtx is in general free, but we will use an
9509 instruction to set a register to 0. */
9510 else if (REG_P (op1) || op1 == const0_rtx)
9511 {
9512 /* The cost is 1 per register copied. */
9513 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9514 *cost = COSTS_N_INSNS (nregs);
9515 }
9516 else
9517 /* Cost is just the cost of the RHS of the set. */
9518 *cost += rtx_cost (op1, mode, SET, 1, speed);
9519 return true;
9520
9521 case ZERO_EXTRACT:
9522 case SIGN_EXTRACT:
9523 /* Bit-field insertion. Strip any redundant widening of
9524 the RHS to meet the width of the target. */
9525 if (GET_CODE (op1) == SUBREG)
9526 op1 = SUBREG_REG (op1);
9527 if ((GET_CODE (op1) == ZERO_EXTEND
9528 || GET_CODE (op1) == SIGN_EXTEND)
9529 && CONST_INT_P (XEXP (op0, 1))
9530 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9531 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9532 op1 = XEXP (op1, 0);
9533
9534 if (CONST_INT_P (op1))
9535 {
9536 /* MOV immediate is assumed to always be cheap. */
9537 *cost = COSTS_N_INSNS (1);
9538 }
9539 else
9540 {
9541 /* BFM. */
9542 if (speed)
9543 *cost += extra_cost->alu.bfi;
9544 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9545 }
9546
9547 return true;
9548
9549 default:
9550 /* We can't make sense of this; assume default cost. */
9551 *cost = COSTS_N_INSNS (1);
9552 return false;
9553 }
9554 return false;
9555
9556 case CONST_INT:
9557 /* If an instruction can incorporate a constant within the
9558 instruction, the instruction's expression avoids calling
9559 rtx_cost() on the constant. If rtx_cost() is called on a
9560 constant, then it is usually because the constant must be
9561 moved into a register by one or more instructions.
9562
9563 The exception is constant 0, which can be expressed
9564 as XZR/WZR and is therefore free. The exception to this is
9565 if we have (set (reg) (const0_rtx)) in which case we must cost
9566 the move. However, we can catch that when we cost the SET, so
9567 we don't need to consider that here. */
9568 if (x == const0_rtx)
9569 *cost = 0;
9570 else
9571 {
9572 /* To an approximation, the cost of building any other constant
9573 is proportional to the number of instructions required to
9574 build that constant. This is true whether we
9575 are compiling for SPEED or otherwise. */
9576 if (!is_a <scalar_int_mode> (mode, &int_mode))
9577 int_mode = word_mode;
9578 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9579 (NULL_RTX, x, false, int_mode));
9580 }
9581 return true;
9582
9583 case CONST_DOUBLE:
9584
9585 /* First determine number of instructions to do the move
9586 as an integer constant. */
9587 if (!aarch64_float_const_representable_p (x)
9588 && !aarch64_can_const_movi_rtx_p (x, mode)
9589 && aarch64_float_const_rtx_p (x))
9590 {
9591 unsigned HOST_WIDE_INT ival;
9592 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9593 gcc_assert (succeed);
9594
9595 scalar_int_mode imode = (mode == HFmode
9596 ? SImode
9597 : int_mode_for_mode (mode).require ());
9598 int ncost = aarch64_internal_mov_immediate
9599 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9600 *cost += COSTS_N_INSNS (ncost);
9601 return true;
9602 }
9603
9604 if (speed)
9605 {
9606 /* mov[df,sf]_aarch64. */
9607 if (aarch64_float_const_representable_p (x))
9608 /* FMOV (scalar immediate). */
9609 *cost += extra_cost->fp[mode == DFmode].fpconst;
9610 else if (!aarch64_float_const_zero_rtx_p (x))
9611 {
9612 /* This will be a load from memory. */
9613 if (mode == DFmode)
9614 *cost += extra_cost->ldst.loadd;
9615 else
9616 *cost += extra_cost->ldst.loadf;
9617 }
9618 else
9619 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9620 or MOV v0.s[0], wzr - neither of which are modeled by the
9621 cost tables. Just use the default cost. */
9622 {
9623 }
9624 }
9625
9626 return true;
9627
9628 case MEM:
9629 if (speed)
9630 {
9631 /* For loads we want the base cost of a load, plus an
9632 approximation for the additional cost of the addressing
9633 mode. */
9634 rtx address = XEXP (x, 0);
9635 if (VECTOR_MODE_P (mode))
9636 *cost += extra_cost->ldst.loadv;
9637 else if (GET_MODE_CLASS (mode) == MODE_INT)
9638 *cost += extra_cost->ldst.load;
9639 else if (mode == SFmode)
9640 *cost += extra_cost->ldst.loadf;
9641 else if (mode == DFmode)
9642 *cost += extra_cost->ldst.loadd;
9643
9644 *cost +=
9645 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9646 0, speed));
9647 }
9648
9649 return true;
9650
9651 case NEG:
9652 op0 = XEXP (x, 0);
9653
9654 if (VECTOR_MODE_P (mode))
9655 {
9656 if (speed)
9657 {
9658 /* FNEG. */
9659 *cost += extra_cost->vect.alu;
9660 }
9661 return false;
9662 }
9663
9664 if (GET_MODE_CLASS (mode) == MODE_INT)
9665 {
9666 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9667 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9668 {
9669 /* CSETM. */
9670 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9671 return true;
9672 }
9673
9674 /* Cost this as SUB wzr, X. */
9675 op0 = CONST0_RTX (mode);
9676 op1 = XEXP (x, 0);
9677 goto cost_minus;
9678 }
9679
9680 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9681 {
9682 /* Support (neg(fma...)) as a single instruction only if
9683 sign of zeros is unimportant. This matches the decision
9684 making in aarch64.md. */
9685 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9686 {
9687 /* FNMADD. */
9688 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9689 return true;
9690 }
9691 if (GET_CODE (op0) == MULT)
9692 {
9693 /* FNMUL. */
9694 *cost = rtx_cost (op0, mode, NEG, 0, speed);
9695 return true;
9696 }
9697 if (speed)
9698 /* FNEG. */
9699 *cost += extra_cost->fp[mode == DFmode].neg;
9700 return false;
9701 }
9702
9703 return false;
9704
9705 case CLRSB:
9706 case CLZ:
9707 if (speed)
9708 {
9709 if (VECTOR_MODE_P (mode))
9710 *cost += extra_cost->vect.alu;
9711 else
9712 *cost += extra_cost->alu.clz;
9713 }
9714
9715 return false;
9716
9717 case COMPARE:
9718 op0 = XEXP (x, 0);
9719 op1 = XEXP (x, 1);
9720
9721 if (op1 == const0_rtx
9722 && GET_CODE (op0) == AND)
9723 {
9724 x = op0;
9725 mode = GET_MODE (op0);
9726 goto cost_logic;
9727 }
9728
9729 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9730 {
9731 /* TODO: A write to the CC flags possibly costs extra; this
9732 needs encoding in the cost tables. */
9733
9734 mode = GET_MODE (op0);
9735 /* ANDS. */
9736 if (GET_CODE (op0) == AND)
9737 {
9738 x = op0;
9739 goto cost_logic;
9740 }
9741
9742 if (GET_CODE (op0) == PLUS)
9743 {
9744 /* ADDS (and CMN alias). */
9745 x = op0;
9746 goto cost_plus;
9747 }
9748
9749 if (GET_CODE (op0) == MINUS)
9750 {
9751 /* SUBS. */
9752 x = op0;
9753 goto cost_minus;
9754 }
9755
9756 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9757 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9758 && CONST_INT_P (XEXP (op0, 2)))
9759 {
9760 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9761 Handle it here directly rather than going to cost_logic
9762 since we know the immediate generated for the TST is valid
9763 so we can avoid creating an intermediate rtx for it only
9764 for costing purposes. */
9765 if (speed)
9766 *cost += extra_cost->alu.logical;
9767
9768 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9769 ZERO_EXTRACT, 0, speed);
9770 return true;
9771 }
9772
9773 if (GET_CODE (op1) == NEG)
9774 {
9775 /* CMN. */
9776 if (speed)
9777 *cost += extra_cost->alu.arith;
9778
9779 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9780 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9781 return true;
9782 }
9783
9784 /* CMP.
9785
9786 Compare can freely swap the order of operands, and
9787 canonicalization puts the more complex operation first.
9788 But the integer MINUS logic expects the shift/extend
9789 operation in op1. */
9790 if (! (REG_P (op0)
9791 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9792 {
9793 op0 = XEXP (x, 1);
9794 op1 = XEXP (x, 0);
9795 }
9796 goto cost_minus;
9797 }
9798
9799 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9800 {
9801 /* FCMP. */
9802 if (speed)
9803 *cost += extra_cost->fp[mode == DFmode].compare;
9804
9805 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9806 {
9807 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9808 /* FCMP supports constant 0.0 for no extra cost. */
9809 return true;
9810 }
9811 return false;
9812 }
9813
9814 if (VECTOR_MODE_P (mode))
9815 {
9816 /* Vector compare. */
9817 if (speed)
9818 *cost += extra_cost->vect.alu;
9819
9820 if (aarch64_float_const_zero_rtx_p (op1))
9821 {
9822 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9823 cost. */
9824 return true;
9825 }
9826 return false;
9827 }
9828 return false;
9829
9830 case MINUS:
9831 {
9832 op0 = XEXP (x, 0);
9833 op1 = XEXP (x, 1);
9834
9835 cost_minus:
9836 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
9837
9838 /* Detect valid immediates. */
9839 if ((GET_MODE_CLASS (mode) == MODE_INT
9840 || (GET_MODE_CLASS (mode) == MODE_CC
9841 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9842 && CONST_INT_P (op1)
9843 && aarch64_uimm12_shift (INTVAL (op1)))
9844 {
9845 if (speed)
9846 /* SUB(S) (immediate). */
9847 *cost += extra_cost->alu.arith;
9848 return true;
9849 }
9850
9851 /* Look for SUB (extended register). */
9852 if (is_a <scalar_int_mode> (mode, &int_mode)
9853 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9854 {
9855 if (speed)
9856 *cost += extra_cost->alu.extend_arith;
9857
9858 op1 = aarch64_strip_extend (op1, true);
9859 *cost += rtx_cost (op1, VOIDmode,
9860 (enum rtx_code) GET_CODE (op1), 0, speed);
9861 return true;
9862 }
9863
9864 rtx new_op1 = aarch64_strip_extend (op1, false);
9865
9866 /* Cost this as an FMA-alike operation. */
9867 if ((GET_CODE (new_op1) == MULT
9868 || aarch64_shift_p (GET_CODE (new_op1)))
9869 && code != COMPARE)
9870 {
9871 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9872 (enum rtx_code) code,
9873 speed);
9874 return true;
9875 }
9876
9877 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9878
9879 if (speed)
9880 {
9881 if (VECTOR_MODE_P (mode))
9882 {
9883 /* Vector SUB. */
9884 *cost += extra_cost->vect.alu;
9885 }
9886 else if (GET_MODE_CLASS (mode) == MODE_INT)
9887 {
9888 /* SUB(S). */
9889 *cost += extra_cost->alu.arith;
9890 }
9891 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9892 {
9893 /* FSUB. */
9894 *cost += extra_cost->fp[mode == DFmode].addsub;
9895 }
9896 }
9897 return true;
9898 }
9899
9900 case PLUS:
9901 {
9902 rtx new_op0;
9903
9904 op0 = XEXP (x, 0);
9905 op1 = XEXP (x, 1);
9906
9907 cost_plus:
9908 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9909 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9910 {
9911 /* CSINC. */
9912 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9913 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9914 return true;
9915 }
9916
9917 if (GET_MODE_CLASS (mode) == MODE_INT
9918 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9919 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
9920 {
9921 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9922
9923 if (speed)
9924 /* ADD (immediate). */
9925 *cost += extra_cost->alu.arith;
9926 return true;
9927 }
9928
9929 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9930
9931 /* Look for ADD (extended register). */
9932 if (is_a <scalar_int_mode> (mode, &int_mode)
9933 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9934 {
9935 if (speed)
9936 *cost += extra_cost->alu.extend_arith;
9937
9938 op0 = aarch64_strip_extend (op0, true);
9939 *cost += rtx_cost (op0, VOIDmode,
9940 (enum rtx_code) GET_CODE (op0), 0, speed);
9941 return true;
9942 }
9943
9944 /* Strip any extend, leave shifts behind as we will
9945 cost them through mult_cost. */
9946 new_op0 = aarch64_strip_extend (op0, false);
9947
9948 if (GET_CODE (new_op0) == MULT
9949 || aarch64_shift_p (GET_CODE (new_op0)))
9950 {
9951 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9952 speed);
9953 return true;
9954 }
9955
9956 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9957
9958 if (speed)
9959 {
9960 if (VECTOR_MODE_P (mode))
9961 {
9962 /* Vector ADD. */
9963 *cost += extra_cost->vect.alu;
9964 }
9965 else if (GET_MODE_CLASS (mode) == MODE_INT)
9966 {
9967 /* ADD. */
9968 *cost += extra_cost->alu.arith;
9969 }
9970 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9971 {
9972 /* FADD. */
9973 *cost += extra_cost->fp[mode == DFmode].addsub;
9974 }
9975 }
9976 return true;
9977 }
9978
9979 case BSWAP:
9980 *cost = COSTS_N_INSNS (1);
9981
9982 if (speed)
9983 {
9984 if (VECTOR_MODE_P (mode))
9985 *cost += extra_cost->vect.alu;
9986 else
9987 *cost += extra_cost->alu.rev;
9988 }
9989 return false;
9990
9991 case IOR:
9992 if (aarch_rev16_p (x))
9993 {
9994 *cost = COSTS_N_INSNS (1);
9995
9996 if (speed)
9997 {
9998 if (VECTOR_MODE_P (mode))
9999 *cost += extra_cost->vect.alu;
10000 else
10001 *cost += extra_cost->alu.rev;
10002 }
10003 return true;
10004 }
10005
10006 if (aarch64_extr_rtx_p (x, &op0, &op1))
10007 {
10008 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10009 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10010 if (speed)
10011 *cost += extra_cost->alu.shift;
10012
10013 return true;
10014 }
10015 /* Fall through. */
10016 case XOR:
10017 case AND:
10018 cost_logic:
10019 op0 = XEXP (x, 0);
10020 op1 = XEXP (x, 1);
10021
10022 if (VECTOR_MODE_P (mode))
10023 {
10024 if (speed)
10025 *cost += extra_cost->vect.alu;
10026 return true;
10027 }
10028
10029 if (code == AND
10030 && GET_CODE (op0) == MULT
10031 && CONST_INT_P (XEXP (op0, 1))
10032 && CONST_INT_P (op1)
10033 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10034 INTVAL (op1)) != 0)
10035 {
10036 /* This is a UBFM/SBFM. */
10037 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10038 if (speed)
10039 *cost += extra_cost->alu.bfx;
10040 return true;
10041 }
10042
10043 if (is_int_mode (mode, &int_mode))
10044 {
10045 if (CONST_INT_P (op1))
10046 {
10047 /* We have a mask + shift version of a UBFIZ
10048 i.e. the *andim_ashift<mode>_bfiz pattern. */
10049 if (GET_CODE (op0) == ASHIFT
10050 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10051 XEXP (op0, 1)))
10052 {
10053 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10054 (enum rtx_code) code, 0, speed);
10055 if (speed)
10056 *cost += extra_cost->alu.bfx;
10057
10058 return true;
10059 }
10060 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10061 {
10062 /* We possibly get the immediate for free; this is not
10063 modelled. */
10064 *cost += rtx_cost (op0, int_mode,
10065 (enum rtx_code) code, 0, speed);
10066 if (speed)
10067 *cost += extra_cost->alu.logical;
10068
10069 return true;
10070 }
10071 }
10072 else
10073 {
10074 rtx new_op0 = op0;
10075
10076 /* Handle ORN, EON, or BIC. */
10077 if (GET_CODE (op0) == NOT)
10078 op0 = XEXP (op0, 0);
10079
10080 new_op0 = aarch64_strip_shift (op0);
10081
10082 /* If we had a shift on op0 then this is a logical-shift-
10083 by-register/immediate operation. Otherwise, this is just
10084 a logical operation. */
10085 if (speed)
10086 {
10087 if (new_op0 != op0)
10088 {
10089 /* Shift by immediate. */
10090 if (CONST_INT_P (XEXP (op0, 1)))
10091 *cost += extra_cost->alu.log_shift;
10092 else
10093 *cost += extra_cost->alu.log_shift_reg;
10094 }
10095 else
10096 *cost += extra_cost->alu.logical;
10097 }
10098
10099 /* In both cases we want to cost both operands. */
10100 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10101 0, speed);
10102 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10103 1, speed);
10104
10105 return true;
10106 }
10107 }
10108 return false;
10109
10110 case NOT:
10111 x = XEXP (x, 0);
10112 op0 = aarch64_strip_shift (x);
10113
10114 if (VECTOR_MODE_P (mode))
10115 {
10116 /* Vector NOT. */
10117 *cost += extra_cost->vect.alu;
10118 return false;
10119 }
10120
10121 /* MVN-shifted-reg. */
10122 if (op0 != x)
10123 {
10124 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10125
10126 if (speed)
10127 *cost += extra_cost->alu.log_shift;
10128
10129 return true;
10130 }
10131 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10132 Handle the second form here taking care that 'a' in the above can
10133 be a shift. */
10134 else if (GET_CODE (op0) == XOR)
10135 {
10136 rtx newop0 = XEXP (op0, 0);
10137 rtx newop1 = XEXP (op0, 1);
10138 rtx op0_stripped = aarch64_strip_shift (newop0);
10139
10140 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10141 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10142
10143 if (speed)
10144 {
10145 if (op0_stripped != newop0)
10146 *cost += extra_cost->alu.log_shift;
10147 else
10148 *cost += extra_cost->alu.logical;
10149 }
10150
10151 return true;
10152 }
10153 /* MVN. */
10154 if (speed)
10155 *cost += extra_cost->alu.logical;
10156
10157 return false;
10158
10159 case ZERO_EXTEND:
10160
10161 op0 = XEXP (x, 0);
10162 /* If a value is written in SI mode, then zero extended to DI
10163 mode, the operation will in general be free as a write to
10164 a 'w' register implicitly zeroes the upper bits of an 'x'
10165 register. However, if this is
10166
10167 (set (reg) (zero_extend (reg)))
10168
10169 we must cost the explicit register move. */
10170 if (mode == DImode
10171 && GET_MODE (op0) == SImode
10172 && outer == SET)
10173 {
10174 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10175
10176 /* If OP_COST is non-zero, then the cost of the zero extend
10177 is effectively the cost of the inner operation. Otherwise
10178 we have a MOV instruction and we take the cost from the MOV
10179 itself. This is true independently of whether we are
10180 optimizing for space or time. */
10181 if (op_cost)
10182 *cost = op_cost;
10183
10184 return true;
10185 }
10186 else if (MEM_P (op0))
10187 {
10188 /* All loads can zero extend to any size for free. */
10189 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10190 return true;
10191 }
10192
10193 op0 = aarch64_extend_bitfield_pattern_p (x);
10194 if (op0)
10195 {
10196 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10197 if (speed)
10198 *cost += extra_cost->alu.bfx;
10199 return true;
10200 }
10201
10202 if (speed)
10203 {
10204 if (VECTOR_MODE_P (mode))
10205 {
10206 /* UMOV. */
10207 *cost += extra_cost->vect.alu;
10208 }
10209 else
10210 {
10211 /* We generate an AND instead of UXTB/UXTH. */
10212 *cost += extra_cost->alu.logical;
10213 }
10214 }
10215 return false;
10216
10217 case SIGN_EXTEND:
10218 if (MEM_P (XEXP (x, 0)))
10219 {
10220 /* LDRSH. */
10221 if (speed)
10222 {
10223 rtx address = XEXP (XEXP (x, 0), 0);
10224 *cost += extra_cost->ldst.load_sign_extend;
10225
10226 *cost +=
10227 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10228 0, speed));
10229 }
10230 return true;
10231 }
10232
10233 op0 = aarch64_extend_bitfield_pattern_p (x);
10234 if (op0)
10235 {
10236 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10237 if (speed)
10238 *cost += extra_cost->alu.bfx;
10239 return true;
10240 }
10241
10242 if (speed)
10243 {
10244 if (VECTOR_MODE_P (mode))
10245 *cost += extra_cost->vect.alu;
10246 else
10247 *cost += extra_cost->alu.extend;
10248 }
10249 return false;
10250
10251 case ASHIFT:
10252 op0 = XEXP (x, 0);
10253 op1 = XEXP (x, 1);
10254
10255 if (CONST_INT_P (op1))
10256 {
10257 if (speed)
10258 {
10259 if (VECTOR_MODE_P (mode))
10260 {
10261 /* Vector shift (immediate). */
10262 *cost += extra_cost->vect.alu;
10263 }
10264 else
10265 {
10266 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10267 aliases. */
10268 *cost += extra_cost->alu.shift;
10269 }
10270 }
10271
10272 /* We can incorporate zero/sign extend for free. */
10273 if (GET_CODE (op0) == ZERO_EXTEND
10274 || GET_CODE (op0) == SIGN_EXTEND)
10275 op0 = XEXP (op0, 0);
10276
10277 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10278 return true;
10279 }
10280 else
10281 {
10282 if (VECTOR_MODE_P (mode))
10283 {
10284 if (speed)
10285 /* Vector shift (register). */
10286 *cost += extra_cost->vect.alu;
10287 }
10288 else
10289 {
10290 if (speed)
10291 /* LSLV. */
10292 *cost += extra_cost->alu.shift_reg;
10293
10294 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10295 && CONST_INT_P (XEXP (op1, 1))
10296 && known_eq (INTVAL (XEXP (op1, 1)),
10297 GET_MODE_BITSIZE (mode) - 1))
10298 {
10299 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10300 /* We already demanded XEXP (op1, 0) to be REG_P, so
10301 don't recurse into it. */
10302 return true;
10303 }
10304 }
10305 return false; /* All arguments need to be in registers. */
10306 }
10307
10308 case ROTATE:
10309 case ROTATERT:
10310 case LSHIFTRT:
10311 case ASHIFTRT:
10312 op0 = XEXP (x, 0);
10313 op1 = XEXP (x, 1);
10314
10315 if (CONST_INT_P (op1))
10316 {
10317 /* ASR (immediate) and friends. */
10318 if (speed)
10319 {
10320 if (VECTOR_MODE_P (mode))
10321 *cost += extra_cost->vect.alu;
10322 else
10323 *cost += extra_cost->alu.shift;
10324 }
10325
10326 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10327 return true;
10328 }
10329 else
10330 {
10331 if (VECTOR_MODE_P (mode))
10332 {
10333 if (speed)
10334 /* Vector shift (register). */
10335 *cost += extra_cost->vect.alu;
10336 }
10337 else
10338 {
10339 if (speed)
10340 /* ASR (register) and friends. */
10341 *cost += extra_cost->alu.shift_reg;
10342
10343 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10344 && CONST_INT_P (XEXP (op1, 1))
10345 && known_eq (INTVAL (XEXP (op1, 1)),
10346 GET_MODE_BITSIZE (mode) - 1))
10347 {
10348 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10349 /* We already demanded XEXP (op1, 0) to be REG_P, so
10350 don't recurse into it. */
10351 return true;
10352 }
10353 }
10354 return false; /* All arguments need to be in registers. */
10355 }
10356
10357 case SYMBOL_REF:
10358
10359 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10360 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10361 {
10362 /* LDR. */
10363 if (speed)
10364 *cost += extra_cost->ldst.load;
10365 }
10366 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10367 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10368 {
10369 /* ADRP, followed by ADD. */
10370 *cost += COSTS_N_INSNS (1);
10371 if (speed)
10372 *cost += 2 * extra_cost->alu.arith;
10373 }
10374 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10375 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10376 {
10377 /* ADR. */
10378 if (speed)
10379 *cost += extra_cost->alu.arith;
10380 }
10381
10382 if (flag_pic)
10383 {
10384 /* One extra load instruction, after accessing the GOT. */
10385 *cost += COSTS_N_INSNS (1);
10386 if (speed)
10387 *cost += extra_cost->ldst.load;
10388 }
10389 return true;
10390
10391 case HIGH:
10392 case LO_SUM:
10393 /* ADRP/ADD (immediate). */
10394 if (speed)
10395 *cost += extra_cost->alu.arith;
10396 return true;
10397
10398 case ZERO_EXTRACT:
10399 case SIGN_EXTRACT:
10400 /* UBFX/SBFX. */
10401 if (speed)
10402 {
10403 if (VECTOR_MODE_P (mode))
10404 *cost += extra_cost->vect.alu;
10405 else
10406 *cost += extra_cost->alu.bfx;
10407 }
10408
10409 /* We can trust that the immediates used will be correct (there
10410 are no by-register forms), so we need only cost op0. */
10411 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10412 return true;
10413
10414 case MULT:
10415 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10416 /* aarch64_rtx_mult_cost always handles recursion to its
10417 operands. */
10418 return true;
10419
10420 case MOD:
10421 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10422 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
10423 that of an unconditional negate. This case should only ever be reached through
10424 the set_smod_pow2_cheap check in expmed.c. */
10425 if (CONST_INT_P (XEXP (x, 1))
10426 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10427 && (mode == SImode || mode == DImode))
10428 {
10429 /* We expand to 4 instructions. Reset the baseline. */
10430 *cost = COSTS_N_INSNS (4);
10431
10432 if (speed)
10433 *cost += 2 * extra_cost->alu.logical
10434 + 2 * extra_cost->alu.arith;
10435
10436 return true;
10437 }
10438
10439 /* Fall-through. */
10440 case UMOD:
10441 if (speed)
10442 {
10443 /* Slightly prefer UMOD over SMOD. */
10444 if (VECTOR_MODE_P (mode))
10445 *cost += extra_cost->vect.alu;
10446 else if (GET_MODE_CLASS (mode) == MODE_INT)
10447 *cost += (extra_cost->mult[mode == DImode].add
10448 + extra_cost->mult[mode == DImode].idiv
10449 + (code == MOD ? 1 : 0));
10450 }
10451 return false; /* All arguments need to be in registers. */
10452
10453 case DIV:
10454 case UDIV:
10455 case SQRT:
10456 if (speed)
10457 {
10458 if (VECTOR_MODE_P (mode))
10459 *cost += extra_cost->vect.alu;
10460 else if (GET_MODE_CLASS (mode) == MODE_INT)
10461 /* There is no integer SQRT, so only DIV and UDIV can get
10462 here. */
10463 *cost += (extra_cost->mult[mode == DImode].idiv
10464 /* Slightly prefer UDIV over SDIV. */
10465 + (code == DIV ? 1 : 0));
10466 else
10467 *cost += extra_cost->fp[mode == DFmode].div;
10468 }
10469 return false; /* All arguments need to be in registers. */
10470
10471 case IF_THEN_ELSE:
10472 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10473 XEXP (x, 2), cost, speed);
10474
10475 case EQ:
10476 case NE:
10477 case GT:
10478 case GTU:
10479 case LT:
10480 case LTU:
10481 case GE:
10482 case GEU:
10483 case LE:
10484 case LEU:
10485
10486 return false; /* All arguments must be in registers. */
10487
10488 case FMA:
10489 op0 = XEXP (x, 0);
10490 op1 = XEXP (x, 1);
10491 op2 = XEXP (x, 2);
10492
10493 if (speed)
10494 {
10495 if (VECTOR_MODE_P (mode))
10496 *cost += extra_cost->vect.alu;
10497 else
10498 *cost += extra_cost->fp[mode == DFmode].fma;
10499 }
10500
10501 /* FMSUB, FNMADD, and FNMSUB are free. */
10502 if (GET_CODE (op0) == NEG)
10503 op0 = XEXP (op0, 0);
10504
10505 if (GET_CODE (op2) == NEG)
10506 op2 = XEXP (op2, 0);
10507
10508 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10509 and the by-element operand as operand 0. */
10510 if (GET_CODE (op1) == NEG)
10511 op1 = XEXP (op1, 0);
10512
10513 /* Catch vector-by-element operations. The by-element operand can
10514 either be (vec_duplicate (vec_select (x))) or just
10515 (vec_select (x)), depending on whether we are multiplying by
10516 a vector or a scalar.
10517
10518 Canonicalization is not very good in these cases: FMA4 will put the
10519 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
10520 if (GET_CODE (op0) == VEC_DUPLICATE)
10521 op0 = XEXP (op0, 0);
10522 else if (GET_CODE (op1) == VEC_DUPLICATE)
10523 op1 = XEXP (op1, 0);
10524
10525 if (GET_CODE (op0) == VEC_SELECT)
10526 op0 = XEXP (op0, 0);
10527 else if (GET_CODE (op1) == VEC_SELECT)
10528 op1 = XEXP (op1, 0);
10529
10530 /* If the remaining parameters are not registers,
10531 get the cost to put them into registers. */
10532 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10533 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10534 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10535 return true;
10536
10537 case FLOAT:
10538 case UNSIGNED_FLOAT:
10539 if (speed)
10540 *cost += extra_cost->fp[mode == DFmode].fromint;
10541 return false;
10542
10543 case FLOAT_EXTEND:
10544 if (speed)
10545 {
10546 if (VECTOR_MODE_P (mode))
10547 {
10548 /* Vector widening conversion. */
10549 *cost += extra_cost->vect.alu;
10550 }
10551 else
10552 *cost += extra_cost->fp[mode == DFmode].widen;
10553 }
10554 return false;
10555
10556 case FLOAT_TRUNCATE:
10557 if (speed)
10558 {
10559 if (VECTOR_MODE_P (mode))
10560 {
10561 /* Vector narrowing conversion. */
10562 *cost += extra_cost->vect.alu;
10563 }
10564 else
10565 *cost += extra_cost->fp[mode == DFmode].narrow;
10566 }
10567 return false;
10568
10569 case FIX:
10570 case UNSIGNED_FIX:
10571 x = XEXP (x, 0);
10572 /* Strip the rounding part. They will all be implemented
10573 by the fcvt* family of instructions anyway. */
10574 if (GET_CODE (x) == UNSPEC)
10575 {
10576 unsigned int uns_code = XINT (x, 1);
10577
10578 if (uns_code == UNSPEC_FRINTA
10579 || uns_code == UNSPEC_FRINTM
10580 || uns_code == UNSPEC_FRINTN
10581 || uns_code == UNSPEC_FRINTP
10582 || uns_code == UNSPEC_FRINTZ)
10583 x = XVECEXP (x, 0, 0);
10584 }
10585
10586 if (speed)
10587 {
10588 if (VECTOR_MODE_P (mode))
10589 *cost += extra_cost->vect.alu;
10590 else
10591 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10592 }
10593
10594 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10595 fixed-point fcvt. */
10596 if (GET_CODE (x) == MULT
10597 && ((VECTOR_MODE_P (mode)
10598 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10599 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10600 {
10601 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10602 0, speed);
10603 return true;
10604 }
10605
10606 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10607 return true;
10608
10609 case ABS:
10610 if (VECTOR_MODE_P (mode))
10611 {
10612 /* ABS (vector). */
10613 if (speed)
10614 *cost += extra_cost->vect.alu;
10615 }
10616 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10617 {
10618 op0 = XEXP (x, 0);
10619
10620 /* FABD, which is analogous to FADD. */
10621 if (GET_CODE (op0) == MINUS)
10622 {
10623 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10624 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10625 if (speed)
10626 *cost += extra_cost->fp[mode == DFmode].addsub;
10627
10628 return true;
10629 }
10630 /* Simple FABS is analogous to FNEG. */
10631 if (speed)
10632 *cost += extra_cost->fp[mode == DFmode].neg;
10633 }
10634 else
10635 {
10636 /* Integer ABS will either be split into
10637 two arithmetic instructions, or will be an ABS
10638 (scalar), which we don't model. */
10639 *cost = COSTS_N_INSNS (2);
10640 if (speed)
10641 *cost += 2 * extra_cost->alu.arith;
10642 }
10643 return false;
10644
10645 case SMAX:
10646 case SMIN:
10647 if (speed)
10648 {
10649 if (VECTOR_MODE_P (mode))
10650 *cost += extra_cost->vect.alu;
10651 else
10652 {
10653 /* FMAXNM/FMINNM/FMAX/FMIN.
10654 TODO: This may not be accurate for all implementations, but
10655 we do not model this in the cost tables. */
10656 *cost += extra_cost->fp[mode == DFmode].addsub;
10657 }
10658 }
10659 return false;
10660
10661 case UNSPEC:
10662 /* The floating point round to integer frint* instructions. */
10663 if (aarch64_frint_unspec_p (XINT (x, 1)))
10664 {
10665 if (speed)
10666 *cost += extra_cost->fp[mode == DFmode].roundint;
10667
10668 return false;
10669 }
10670
10671 if (XINT (x, 1) == UNSPEC_RBIT)
10672 {
10673 if (speed)
10674 *cost += extra_cost->alu.rev;
10675
10676 return false;
10677 }
10678 break;
10679
10680 case TRUNCATE:
10681
10682 /* Decompose <su>muldi3_highpart. */
10683 if (/* (truncate:DI */
10684 mode == DImode
10685 /* (lshiftrt:TI */
10686 && GET_MODE (XEXP (x, 0)) == TImode
10687 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10688 /* (mult:TI */
10689 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10690 /* (ANY_EXTEND:TI (reg:DI))
10691 (ANY_EXTEND:TI (reg:DI))) */
10692 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10693 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10694 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10695 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10696 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10697 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10698 /* (const_int 64) */
10699 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10700 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10701 {
10702 /* UMULH/SMULH. */
10703 if (speed)
10704 *cost += extra_cost->mult[mode == DImode].extend;
10705 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10706 mode, MULT, 0, speed);
10707 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10708 mode, MULT, 1, speed);
10709 return true;
10710 }
10711
10712 /* Fall through. */
10713 default:
10714 break;
10715 }
10716
10717 if (dump_file
10718 && flag_aarch64_verbose_cost)
10719 fprintf (dump_file,
10720 "\nFailed to cost RTX. Assuming default cost.\n");
10721
10722 return true;
10723 }
10724
10725 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10726 calculated for X. This cost is stored in *COST. Returns true
10727 if the total cost of X was calculated. */
10728 static bool
10729 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10730 int param, int *cost, bool speed)
10731 {
10732 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10733
10734 if (dump_file
10735 && flag_aarch64_verbose_cost)
10736 {
10737 print_rtl_single (dump_file, x);
10738 fprintf (dump_file, "\n%s cost: %d (%s)\n",
10739 speed ? "Hot" : "Cold",
10740 *cost, result ? "final" : "partial");
10741 }
10742
10743 return result;
10744 }
10745
10746 static int
10747 aarch64_register_move_cost (machine_mode mode,
10748 reg_class_t from_i, reg_class_t to_i)
10749 {
10750 enum reg_class from = (enum reg_class) from_i;
10751 enum reg_class to = (enum reg_class) to_i;
10752 const struct cpu_regmove_cost *regmove_cost
10753 = aarch64_tune_params.regmove_cost;
10754
10755 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
10756 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10757 to = GENERAL_REGS;
10758
10759 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10760 from = GENERAL_REGS;
10761
10762 /* Moving between GPR and stack cost is the same as GP2GP. */
10763 if ((from == GENERAL_REGS && to == STACK_REG)
10764 || (to == GENERAL_REGS && from == STACK_REG))
10765 return regmove_cost->GP2GP;
10766
10767 /* To/From the stack register, we move via the gprs. */
10768 if (to == STACK_REG || from == STACK_REG)
10769 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10770 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10771
10772 if (known_eq (GET_MODE_SIZE (mode), 16))
10773 {
10774 /* 128-bit operations on general registers require 2 instructions. */
10775 if (from == GENERAL_REGS && to == GENERAL_REGS)
10776 return regmove_cost->GP2GP * 2;
10777 else if (from == GENERAL_REGS)
10778 return regmove_cost->GP2FP * 2;
10779 else if (to == GENERAL_REGS)
10780 return regmove_cost->FP2GP * 2;
10781
10782 /* When AdvSIMD instructions are disabled it is not possible to move
10783 a 128-bit value directly between Q registers. This is handled in
10784 secondary reload. A general register is used as a scratch to move
10785 the upper DI value and the lower DI value is moved directly,
10786 hence the cost is the sum of three moves. */
10787 if (! TARGET_SIMD)
10788 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10789
10790 return regmove_cost->FP2FP;
10791 }
10792
10793 if (from == GENERAL_REGS && to == GENERAL_REGS)
10794 return regmove_cost->GP2GP;
10795 else if (from == GENERAL_REGS)
10796 return regmove_cost->GP2FP;
10797 else if (to == GENERAL_REGS)
10798 return regmove_cost->FP2GP;
10799
10800 return regmove_cost->FP2FP;
10801 }
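/* For example, copying a 16-byte (e.g. TImode) value between general
   registers needs two X-register moves and so costs 2 * GP2GP, while
   without TARGET_SIMD a 128-bit FP-to-FP copy is costed as
   GP2FP + FP2GP + FP2FP to account for the general-register scratch
   used by the secondary reload.  */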
10802
10803 static int
10804 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10805 reg_class_t rclass ATTRIBUTE_UNUSED,
10806 bool in ATTRIBUTE_UNUSED)
10807 {
10808 return aarch64_tune_params.memmov_cost;
10809 }
10810
10811 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10812 to optimize 1.0/sqrt. */
10813
10814 static bool
10815 use_rsqrt_p (machine_mode mode)
10816 {
10817 return (!flag_trapping_math
10818 && flag_unsafe_math_optimizations
10819 && ((aarch64_tune_params.approx_modes->recip_sqrt
10820 & AARCH64_APPROX_MODE (mode))
10821 || flag_mrecip_low_precision_sqrt));
10822 }
10823
10824 /* Function to decide when to use the approximate reciprocal square root
10825 builtin. */
10826
10827 static tree
10828 aarch64_builtin_reciprocal (tree fndecl)
10829 {
10830 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10831
10832 if (!use_rsqrt_p (mode))
10833 return NULL_TREE;
10834 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10835 }
10836
10837 /* Emit instruction sequence to compute either the approximate square root
10838 or its approximate reciprocal, depending on the flag RECP, and return
10839 whether the sequence was emitted or not. */
10840
10841 bool
10842 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10843 {
10844 machine_mode mode = GET_MODE (dst);
10845
10846 if (GET_MODE_INNER (mode) == HFmode)
10847 {
10848 gcc_assert (!recp);
10849 return false;
10850 }
10851
10852 if (!recp)
10853 {
10854 if (!(flag_mlow_precision_sqrt
10855 || (aarch64_tune_params.approx_modes->sqrt
10856 & AARCH64_APPROX_MODE (mode))))
10857 return false;
10858
10859 if (flag_finite_math_only
10860 || flag_trapping_math
10861 || !flag_unsafe_math_optimizations
10862 || optimize_function_for_size_p (cfun))
10863 return false;
10864 }
10865 else
10866 /* Caller assumes we cannot fail. */
10867 gcc_assert (use_rsqrt_p (mode));
10868
10869 machine_mode mmsk = mode_for_int_vector (mode).require ();
10870 rtx xmsk = gen_reg_rtx (mmsk);
10871 if (!recp)
10872 /* When calculating the approximate square root, compare the
10873 argument with 0.0 and create a mask. */
10874 emit_insn (gen_rtx_SET (xmsk,
10875 gen_rtx_NEG (mmsk,
10876 gen_rtx_EQ (mmsk, src,
10877 CONST0_RTX (mode)))));
10878
10879 /* Estimate the approximate reciprocal square root. */
10880 rtx xdst = gen_reg_rtx (mode);
10881 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10882
10883 /* Iterate over the series twice for SF and thrice for DF. */
10884 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10885
10886 /* Optionally iterate over the series once less for faster performance
10887 while sacrificing the accuracy. */
10888 if ((recp && flag_mrecip_low_precision_sqrt)
10889 || (!recp && flag_mlow_precision_sqrt))
10890 iterations--;
10891
10892 /* Iterate over the series to calculate the approximate reciprocal square
10893 root. */
10894 rtx x1 = gen_reg_rtx (mode);
10895 while (iterations--)
10896 {
10897 rtx x2 = gen_reg_rtx (mode);
10898 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10899
10900 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10901
10902 if (iterations > 0)
10903 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10904 }
10905
10906 if (!recp)
10907 {
10908 /* Qualify the approximate reciprocal square root when the argument is
10909 0.0 by squashing the intermediary result to 0.0. */
10910 rtx xtmp = gen_reg_rtx (mmsk);
10911 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10912 gen_rtx_SUBREG (mmsk, xdst, 0)));
10913 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10914
10915 /* Calculate the approximate square root. */
10916 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10917 }
10918
10919 /* Finalize the approximation. */
10920 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10921
10922 return true;
10923 }
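
#if 0
/* Illustrative sketch only, not part of the original sources: the loop
   above emits the FRSQRTE/FRSQRTS Newton-Raphson recurrence

       x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

   which converges towards 1/sqrt (d).  A minimal scalar C version of the
   same refinement, assuming a rough initial estimate X0, would be:  */
static double
example_rsqrt_refine (double d, double x0, int iterations)
{
  double x = x0;
  while (iterations--)
    x = x * (3.0 - d * x * x) * 0.5;	/* One FRSQRTS-style step.  */
  return x;				/* ~= 1/sqrt (d).  */
}
#endif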
10924
10925 /* Emit the instruction sequence to compute the approximation for the division
10926 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10927
10928 bool
10929 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10930 {
10931 machine_mode mode = GET_MODE (quo);
10932
10933 if (GET_MODE_INNER (mode) == HFmode)
10934 return false;
10935
10936 bool use_approx_division_p = (flag_mlow_precision_div
10937 || (aarch64_tune_params.approx_modes->division
10938 & AARCH64_APPROX_MODE (mode)));
10939
10940 if (!flag_finite_math_only
10941 || flag_trapping_math
10942 || !flag_unsafe_math_optimizations
10943 || optimize_function_for_size_p (cfun)
10944 || !use_approx_division_p)
10945 return false;
10946
10947 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10948 return false;
10949
10950 /* Estimate the approximate reciprocal. */
10951 rtx xrcp = gen_reg_rtx (mode);
10952 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10953
10954 /* Iterate over the series twice for SF and thrice for DF. */
10955 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10956
10957 /* Optionally iterate over the series once less for faster performance,
10958 while sacrificing the accuracy. */
10959 if (flag_mlow_precision_div)
10960 iterations--;
10961
10962 /* Iterate over the series to calculate the approximate reciprocal. */
10963 rtx xtmp = gen_reg_rtx (mode);
10964 while (iterations--)
10965 {
10966 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10967
10968 if (iterations > 0)
10969 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10970 }
10971
10972 if (num != CONST1_RTX (mode))
10973 {
10974 /* As the approximate reciprocal of DEN is already calculated, only
10975 calculate the approximate division when NUM is not 1.0. */
10976 rtx xnum = force_reg (mode, num);
10977 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10978 }
10979
10980 /* Finalize the approximation. */
10981 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10982 return true;
10983 }
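
#if 0
/* Illustrative sketch only, not part of the original sources: the sequence
   above is the FRECPE/FRECPS Newton-Raphson refinement of the reciprocal

       x_{n+1} = x_n * (2 - den * x_n)

   followed by a final multiply by NUM.  A minimal scalar C version,
   assuming a rough initial estimate X0, would be:  */
static double
example_approx_div (double num, double den, double x0, int iterations)
{
  double x = x0;
  while (iterations--)
    x = x * (2.0 - den * x);	/* One FRECPS-style step.  */
  return num * x;		/* ~= num / den.  */
}
#endif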
10984
10985 /* Return the number of instructions that can be issued per cycle. */
10986 static int
10987 aarch64_sched_issue_rate (void)
10988 {
10989 return aarch64_tune_params.issue_rate;
10990 }
10991
10992 static int
10993 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10994 {
10995 int issue_rate = aarch64_sched_issue_rate ();
10996
10997 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10998 }
10999
11000
11001 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11002 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11003 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11004
11005 static int
11006 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11007 int ready_index)
11008 {
11009 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11010 }
11011
11012
11013 /* Vectorizer cost model target hooks. */
11014
11015 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11016 static int
11017 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11018 tree vectype,
11019 int misalign ATTRIBUTE_UNUSED)
11020 {
11021 unsigned elements;
11022 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11023 bool fp = false;
11024
11025 if (vectype != NULL)
11026 fp = FLOAT_TYPE_P (vectype);
11027
11028 switch (type_of_cost)
11029 {
11030 case scalar_stmt:
11031 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11032
11033 case scalar_load:
11034 return costs->scalar_load_cost;
11035
11036 case scalar_store:
11037 return costs->scalar_store_cost;
11038
11039 case vector_stmt:
11040 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11041
11042 case vector_load:
11043 return costs->vec_align_load_cost;
11044
11045 case vector_store:
11046 return costs->vec_store_cost;
11047
11048 case vec_to_scalar:
11049 return costs->vec_to_scalar_cost;
11050
11051 case scalar_to_vec:
11052 return costs->scalar_to_vec_cost;
11053
11054 case unaligned_load:
11055 case vector_gather_load:
11056 return costs->vec_unalign_load_cost;
11057
11058 case unaligned_store:
11059 case vector_scatter_store:
11060 return costs->vec_unalign_store_cost;
11061
11062 case cond_branch_taken:
11063 return costs->cond_taken_branch_cost;
11064
11065 case cond_branch_not_taken:
11066 return costs->cond_not_taken_branch_cost;
11067
11068 case vec_perm:
11069 return costs->vec_permute_cost;
11070
11071 case vec_promote_demote:
11072 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11073
11074 case vec_construct:
11075 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11076 return elements / 2 + 1;
11077
11078 default:
11079 gcc_unreachable ();
11080 }
11081 }
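
/* For illustration (not part of the original sources): with the
   vec_construct formula above, building a 4-lane vector from scalars is
   costed at 4/2 + 1 = 3 units and an 8-lane vector at 8/2 + 1 = 5 units,
   independent of the per-CPU cost tables used by the other cases.  */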
11082
11083 /* Implement targetm.vectorize.add_stmt_cost. */
11084 static unsigned
11085 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11086 struct _stmt_vec_info *stmt_info, int misalign,
11087 enum vect_cost_model_location where)
11088 {
11089 unsigned *cost = (unsigned *) data;
11090 unsigned retval = 0;
11091
11092 if (flag_vect_cost_model)
11093 {
11094 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11095 int stmt_cost =
11096 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11097
11098 /* Statements in an inner loop relative to the loop being
11099 vectorized are weighted more heavily. The value here is
11100 arbitrary and could potentially be improved with analysis. */
11101 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11102 count *= 50; /* FIXME */
11103
11104 retval = (unsigned) (count * stmt_cost);
11105 cost[where] += retval;
11106 }
11107
11108 return retval;
11109 }
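
/* For illustration (not part of the original sources): with the weighting
   above, COUNT = 3 statements of cost 2 inside an inner loop contribute
   3 * 50 * 2 = 300 units to cost[vect_body], whereas the same statements
   in the outermost vectorized loop contribute only 6.  */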
11110
11111 static void initialize_aarch64_code_model (struct gcc_options *);
11112
11113 /* Parse the TO_PARSE string and put the architecture struct that it
11114 selects into RES and the architectural features into ISA_FLAGS.
11115 Return an aarch64_parse_opt_result describing the parse result.
11116 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11117 When the TO_PARSE string contains an invalid extension,
11118 a copy of the string is created and stored to INVALID_EXTENSION. */
11119
11120 static enum aarch64_parse_opt_result
11121 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11122 uint64_t *isa_flags, std::string *invalid_extension)
11123 {
11124 const char *ext;
11125 const struct processor *arch;
11126 size_t len;
11127
11128 ext = strchr (to_parse, '+');
11129
11130 if (ext != NULL)
11131 len = ext - to_parse;
11132 else
11133 len = strlen (to_parse);
11134
11135 if (len == 0)
11136 return AARCH64_PARSE_MISSING_ARG;
11137
11138
11139 /* Loop through the list of supported ARCHes to find a match. */
11140 for (arch = all_architectures; arch->name != NULL; arch++)
11141 {
11142 if (strlen (arch->name) == len
11143 && strncmp (arch->name, to_parse, len) == 0)
11144 {
11145 uint64_t isa_temp = arch->flags;
11146
11147 if (ext != NULL)
11148 {
11149 /* TO_PARSE string contains at least one extension. */
11150 enum aarch64_parse_opt_result ext_res
11151 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11152
11153 if (ext_res != AARCH64_PARSE_OK)
11154 return ext_res;
11155 }
11156 /* Extension parsing was successful. Confirm the result
11157 arch and ISA flags. */
11158 *res = arch;
11159 *isa_flags = isa_temp;
11160 return AARCH64_PARSE_OK;
11161 }
11162 }
11163
11164 /* ARCH name not found in list. */
11165 return AARCH64_PARSE_INVALID_ARG;
11166 }
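
/* For illustration (not part of the original sources):
   "-march=armv8.2-a+crypto" is split at the first '+'; "armv8.2-a" is
   looked up in all_architectures and the remainder "+crypto" is handed to
   aarch64_parse_extension to adjust the ISA flag set.  */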
11167
11168 /* Parse the TO_PARSE string and put the result tuning in RES and the
11169 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11170 describing the parse result. If there is an error parsing, RES and
11171 ISA_FLAGS are left unchanged.
11172 When the TO_PARSE string contains an invalid extension,
11173 a copy of the string is created and stored to INVALID_EXTENSION. */
11174
11175 static enum aarch64_parse_opt_result
11176 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11177 uint64_t *isa_flags, std::string *invalid_extension)
11178 {
11179 const char *ext;
11180 const struct processor *cpu;
11181 size_t len;
11182
11183 ext = strchr (to_parse, '+');
11184
11185 if (ext != NULL)
11186 len = ext - to_parse;
11187 else
11188 len = strlen (to_parse);
11189
11190 if (len == 0)
11191 return AARCH64_PARSE_MISSING_ARG;
11192
11193
11194 /* Loop through the list of supported CPUs to find a match. */
11195 for (cpu = all_cores; cpu->name != NULL; cpu++)
11196 {
11197 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11198 {
11199 uint64_t isa_temp = cpu->flags;
11200
11201
11202 if (ext != NULL)
11203 {
11204 /* TO_PARSE string contains at least one extension. */
11205 enum aarch64_parse_opt_result ext_res
11206 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11207
11208 if (ext_res != AARCH64_PARSE_OK)
11209 return ext_res;
11210 }
11211 /* Extension parsing was successful. Confirm the result
11212 cpu and ISA flags. */
11213 *res = cpu;
11214 *isa_flags = isa_temp;
11215 return AARCH64_PARSE_OK;
11216 }
11217 }
11218
11219 /* CPU name not found in list. */
11220 return AARCH64_PARSE_INVALID_ARG;
11221 }
11222
11223 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11224 Return an aarch64_parse_opt_result describing the parse result.
11225 If the parsing fails, RES does not change. */
11226
11227 static enum aarch64_parse_opt_result
11228 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11229 {
11230 const struct processor *cpu;
11231
11232 /* Loop through the list of supported CPUs to find a match. */
11233 for (cpu = all_cores; cpu->name != NULL; cpu++)
11234 {
11235 if (strcmp (cpu->name, to_parse) == 0)
11236 {
11237 *res = cpu;
11238 return AARCH64_PARSE_OK;
11239 }
11240 }
11241
11242 /* CPU name not found in list. */
11243 return AARCH64_PARSE_INVALID_ARG;
11244 }
11245
11246 /* Parse TOKEN, which has length LENGTH to see if it is an option
11247 described in FLAG. If it is, return the index bit for that fusion type.
11248 If not, error (printing OPTION_NAME) and return zero. */
11249
11250 static unsigned int
11251 aarch64_parse_one_option_token (const char *token,
11252 size_t length,
11253 const struct aarch64_flag_desc *flag,
11254 const char *option_name)
11255 {
11256 for (; flag->name != NULL; flag++)
11257 {
11258 if (length == strlen (flag->name)
11259 && !strncmp (flag->name, token, length))
11260 return flag->flag;
11261 }
11262
11263 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11264 return 0;
11265 }
11266
11267 /* Parse OPTION which is a comma-separated list of flags to enable.
11268 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11269 default state we inherit from the CPU tuning structures. OPTION_NAME
11270 gives the top-level option we are parsing in the -moverride string,
11271 for use in error messages. */
11272
11273 static unsigned int
11274 aarch64_parse_boolean_options (const char *option,
11275 const struct aarch64_flag_desc *flags,
11276 unsigned int initial_state,
11277 const char *option_name)
11278 {
11279 const char separator = '.';
11280 const char* specs = option;
11281 const char* ntoken = option;
11282 unsigned int found_flags = initial_state;
11283
11284 while ((ntoken = strchr (specs, separator)))
11285 {
11286 size_t token_length = ntoken - specs;
11287 unsigned token_ops = aarch64_parse_one_option_token (specs,
11288 token_length,
11289 flags,
11290 option_name);
11291 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11292 in the token stream, reset the supported operations. So:
11293
11294 adrp+add.cmp+branch.none.adrp+add
11295
11296 would have the result of turning on only adrp+add fusion. */
11297 if (!token_ops)
11298 found_flags = 0;
11299
11300 found_flags |= token_ops;
11301 specs = ++ntoken;
11302 }
11303
11304 /* The string ended with a trailing separator; report it as ill-formed. */
11305 if (!(*specs))
11306 {
11307 error ("%s string ill-formed\n", option_name);
11308 return 0;
11309 }
11310
11311 /* We still have one more token to parse. */
11312 size_t token_length = strlen (specs);
11313 unsigned token_ops = aarch64_parse_one_option_token (specs,
11314 token_length,
11315 flags,
11316 option_name);
11317 if (!token_ops)
11318 found_flags = 0;
11319
11320 found_flags |= token_ops;
11321 return found_flags;
11322 }
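
/* For illustration (not part of the original sources): the list is reduced
   left to right on '.' separators, with "none" (or any unrecognised token)
   clearing everything accumulated so far.  Hence, as noted in the loop
   above, "adrp+add.cmp+branch.none.adrp+add" first accumulates the
   adrp+add and cmp+branch bits, resets to zero at "none", and finishes
   with only the adrp+add bit set.  */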
11323
11324 /* Support for overriding instruction fusion. */
11325
11326 static void
11327 aarch64_parse_fuse_string (const char *fuse_string,
11328 struct tune_params *tune)
11329 {
11330 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11331 aarch64_fusible_pairs,
11332 tune->fusible_ops,
11333 "fuse=");
11334 }
11335
11336 /* Support for overriding other tuning flags. */
11337
11338 static void
11339 aarch64_parse_tune_string (const char *tune_string,
11340 struct tune_params *tune)
11341 {
11342 tune->extra_tuning_flags
11343 = aarch64_parse_boolean_options (tune_string,
11344 aarch64_tuning_flags,
11345 tune->extra_tuning_flags,
11346 "tune=");
11347 }
11348
11349 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
11350 Accept the valid SVE vector widths allowed by
11351 aarch64_sve_vector_bits_enum and use it to override sve_width
11352 in TUNE. */
11353
11354 static void
11355 aarch64_parse_sve_width_string (const char *tune_string,
11356 struct tune_params *tune)
11357 {
11358 int width = -1;
11359
11360 int n = sscanf (tune_string, "%d", &width);
11361 if (n == EOF)
11362 {
11363 error ("invalid format for sve_width");
11364 return;
11365 }
11366 switch (width)
11367 {
11368 case SVE_128:
11369 case SVE_256:
11370 case SVE_512:
11371 case SVE_1024:
11372 case SVE_2048:
11373 break;
11374 default:
11375 error ("invalid sve_width value: %d", width);
11376 }
11377 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11378 }
11379
11380 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11381 we understand. If it is, extract the option string and handoff to
11382 the appropriate function. */
11383
11384 void
11385 aarch64_parse_one_override_token (const char* token,
11386 size_t length,
11387 struct tune_params *tune)
11388 {
11389 const struct aarch64_tuning_override_function *fn
11390 = aarch64_tuning_override_functions;
11391
11392 const char *option_part = strchr (token, '=');
11393 if (!option_part)
11394 {
11395 error ("tuning string missing in option (%s)", token);
11396 return;
11397 }
11398
11399 /* Get the length of the option name. */
11400 length = option_part - token;
11401 /* Skip the '=' to get to the option string. */
11402 option_part++;
11403
11404 for (; fn->name != NULL; fn++)
11405 {
11406 if (!strncmp (fn->name, token, length))
11407 {
11408 fn->parse_override (option_part, tune);
11409 return;
11410 }
11411 }
11412
11413 error ("unknown tuning option (%s)", token);
11414 return;
11415 }
11416
11417 /* Validate and clamp the -mtls-size value for the selected code model. */
11418
11419 static void
11420 initialize_aarch64_tls_size (struct gcc_options *opts)
11421 {
11422 if (aarch64_tls_size == 0)
11423 aarch64_tls_size = 24;
11424
11425 switch (opts->x_aarch64_cmodel_var)
11426 {
11427 case AARCH64_CMODEL_TINY:
11428 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11429 needs two instructions to address, so we clamp the size to 24. */
11430 if (aarch64_tls_size > 24)
11431 aarch64_tls_size = 24;
11432 break;
11433 case AARCH64_CMODEL_SMALL:
11434 /* The maximum TLS size allowed under small is 4G. */
11435 if (aarch64_tls_size > 32)
11436 aarch64_tls_size = 32;
11437 break;
11438 case AARCH64_CMODEL_LARGE:
11439 /* The maximum TLS size allowed under large is 16E.
11440 FIXME: 16E should be 64bit, we only support 48bit offset now. */
11441 if (aarch64_tls_size > 48)
11442 aarch64_tls_size = 48;
11443 break;
11444 default:
11445 gcc_unreachable ();
11446 }
11447
11448 return;
11449 }
11450
11451 /* Parse STRING looking for options in the format:
11452 string :: option:string
11453 option :: name=substring
11454 name :: {a-z}
11455 substring :: defined by option. */
11456
11457 static void
11458 aarch64_parse_override_string (const char* input_string,
11459 struct tune_params* tune)
11460 {
11461 const char separator = ':';
11462 size_t string_length = strlen (input_string) + 1;
11463 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11464 char *string = string_root;
11465 strncpy (string, input_string, string_length);
11466 string[string_length - 1] = '\0';
11467
11468 char* ntoken = string;
11469
11470 while ((ntoken = strchr (string, separator)))
11471 {
11472 size_t token_length = ntoken - string;
11473 /* Make this substring look like a string. */
11474 *ntoken = '\0';
11475 aarch64_parse_one_override_token (string, token_length, tune);
11476 string = ++ntoken;
11477 }
11478
11479 /* One last option to parse. */
11480 aarch64_parse_one_override_token (string, strlen (string), tune);
11481 free (string_root);
11482 }
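
/* For illustration (not part of the original sources): an override string
   such as "-moverride=fuse=adrp+add.cmp+branch:sve_width=256" is split at
   ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256", which are then
   dispatched to aarch64_parse_fuse_string and aarch64_parse_sve_width_string
   respectively via aarch64_parse_one_override_token.  */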
11483
11484
11485 static void
11486 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11487 {
11488 if (accepted_branch_protection_string)
11489 {
11490 opts->x_aarch64_branch_protection_string
11491 = xstrdup (accepted_branch_protection_string);
11492 }
11493
11494 /* PR 70044: We have to be careful about being called multiple times for the
11495 same function. This means all changes should be repeatable. */
11496
11497 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11498 Disable the frame pointer flag so the mid-end will not use a frame
11499 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11500 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11501 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11502 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11503 if (opts->x_flag_omit_frame_pointer == 0)
11504 opts->x_flag_omit_frame_pointer = 2;
11505
11506 /* If not optimizing for size, set the default
11507 alignment to what the target wants. */
11508 if (!opts->x_optimize_size)
11509 {
11510 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11511 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11512 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11513 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11514 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11515 opts->x_str_align_functions = aarch64_tune_params.function_align;
11516 }
11517
11518 /* We default to no pc-relative literal loads. */
11519
11520 aarch64_pcrelative_literal_loads = false;
11521
11522 /* If -mpc-relative-literal-loads is set on the command line, this
11523 implies that the user asked for PC relative literal loads. */
11524 if (opts->x_pcrelative_literal_loads == 1)
11525 aarch64_pcrelative_literal_loads = true;
11526
11527 /* In the tiny memory model it makes no sense to disallow PC relative
11528 literal pool loads. */
11529 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11530 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11531 aarch64_pcrelative_literal_loads = true;
11532
11533 /* When enabling the lower precision Newton series for the square root, also
11534 enable it for the reciprocal square root, since the latter is an
11535 intermediary step for the former. */
11536 if (flag_mlow_precision_sqrt)
11537 flag_mrecip_low_precision_sqrt = true;
11538 }
11539
11540 /* 'Unpack' the internal tuning structs and update the options
11541 in OPTS. The caller must have set up selected_tune and selected_arch
11542 as all the other target-specific codegen decisions are
11543 derived from them. */
11544
11545 void
11546 aarch64_override_options_internal (struct gcc_options *opts)
11547 {
11548 aarch64_tune_flags = selected_tune->flags;
11549 aarch64_tune = selected_tune->sched_core;
11550 /* Make a copy of the tuning parameters attached to the core, which
11551 we may later overwrite. */
11552 aarch64_tune_params = *(selected_tune->tune);
11553 aarch64_architecture_version = selected_arch->architecture_version;
11554
11555 if (opts->x_aarch64_override_tune_string)
11556 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11557 &aarch64_tune_params);
11558
11559 /* This target defaults to strict volatile bitfields. */
11560 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11561 opts->x_flag_strict_volatile_bitfields = 1;
11562
11563 if (aarch64_stack_protector_guard == SSP_GLOBAL
11564 && opts->x_aarch64_stack_protector_guard_offset_str)
11565 {
11566 error ("incompatible options %<-mstack-protector-guard=global%> and "
11567 "%<-mstack-protector-guard-offset=%s%>",
11568 aarch64_stack_protector_guard_offset_str);
11569 }
11570
11571 if (aarch64_stack_protector_guard == SSP_SYSREG
11572 && !(opts->x_aarch64_stack_protector_guard_offset_str
11573 && opts->x_aarch64_stack_protector_guard_reg_str))
11574 {
11575 error ("both %<-mstack-protector-guard-offset%> and "
11576 "%<-mstack-protector-guard-reg%> must be used "
11577 "with %<-mstack-protector-guard=sysreg%>");
11578 }
11579
11580 if (opts->x_aarch64_stack_protector_guard_reg_str)
11581 {
11582 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11583 error ("specify a system register with a small string length");
11584 }
11585
11586 if (opts->x_aarch64_stack_protector_guard_offset_str)
11587 {
11588 char *end;
11589 const char *str = aarch64_stack_protector_guard_offset_str;
11590 errno = 0;
11591 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11592 if (!*str || *end || errno)
11593 error ("%qs is not a valid offset in %qs", str,
11594 "-mstack-protector-guard-offset=");
11595 aarch64_stack_protector_guard_offset = offs;
11596 }
11597
11598 initialize_aarch64_code_model (opts);
11599 initialize_aarch64_tls_size (opts);
11600
11601 int queue_depth = 0;
11602 switch (aarch64_tune_params.autoprefetcher_model)
11603 {
11604 case tune_params::AUTOPREFETCHER_OFF:
11605 queue_depth = -1;
11606 break;
11607 case tune_params::AUTOPREFETCHER_WEAK:
11608 queue_depth = 0;
11609 break;
11610 case tune_params::AUTOPREFETCHER_STRONG:
11611 queue_depth = max_insn_queue_index + 1;
11612 break;
11613 default:
11614 gcc_unreachable ();
11615 }
11616
11617 /* We don't mind passing in global_options_set here as we don't use
11618 the *options_set structs anyway. */
11619 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11620 queue_depth,
11621 opts->x_param_values,
11622 global_options_set.x_param_values);
11623
11624 /* Set up parameters to be used in prefetching algorithm. Do not
11625 override the defaults unless we are tuning for a core we have
11626 researched values for. */
11627 if (aarch64_tune_params.prefetch->num_slots > 0)
11628 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11629 aarch64_tune_params.prefetch->num_slots,
11630 opts->x_param_values,
11631 global_options_set.x_param_values);
11632 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11633 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11634 aarch64_tune_params.prefetch->l1_cache_size,
11635 opts->x_param_values,
11636 global_options_set.x_param_values);
11637 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11638 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11639 aarch64_tune_params.prefetch->l1_cache_line_size,
11640 opts->x_param_values,
11641 global_options_set.x_param_values);
11642 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11643 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11644 aarch64_tune_params.prefetch->l2_cache_size,
11645 opts->x_param_values,
11646 global_options_set.x_param_values);
11647 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11648 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11649 0,
11650 opts->x_param_values,
11651 global_options_set.x_param_values);
11652 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11653 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11654 aarch64_tune_params.prefetch->minimum_stride,
11655 opts->x_param_values,
11656 global_options_set.x_param_values);
11657
11658 /* Use the alternative scheduling-pressure algorithm by default. */
11659 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11660 opts->x_param_values,
11661 global_options_set.x_param_values);
11662
11663 /* If the user hasn't changed it via configure then set the default to 64 KB
11664 for the backend. */
11665 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11666 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11667 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11668 opts->x_param_values,
11669 global_options_set.x_param_values);
11670
11671 /* Validate the guard size. */
11672 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11673
11674 /* Enforce that interval is the same size as size so the mid-end does the
11675 right thing. */
11676 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11677 guard_size,
11678 opts->x_param_values,
11679 global_options_set.x_param_values);
11680
11681 /* The maybe_set calls won't update the value if the user has explicitly set
11682 one. Which means we need to validate that probing interval and guard size
11683 are equal. */
11684 int probe_interval
11685 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11686 if (guard_size != probe_interval)
11687 error ("stack clash guard size %<%d%> must be equal to probing interval "
11688 "%<%d%>", guard_size, probe_interval);
11689
11690 /* Enable sw prefetching at specified optimization level for
11691 CPUS that have prefetch. Lower optimization level threshold by 1
11692 when profiling is enabled. */
11693 if (opts->x_flag_prefetch_loop_arrays < 0
11694 && !opts->x_optimize_size
11695 && aarch64_tune_params.prefetch->default_opt_level >= 0
11696 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11697 opts->x_flag_prefetch_loop_arrays = 1;
11698
11699 if (opts->x_aarch64_arch_string == NULL)
11700 opts->x_aarch64_arch_string = selected_arch->name;
11701 if (opts->x_aarch64_cpu_string == NULL)
11702 opts->x_aarch64_cpu_string = selected_cpu->name;
11703 if (opts->x_aarch64_tune_string == NULL)
11704 opts->x_aarch64_tune_string = selected_tune->name;
11705
11706 aarch64_override_options_after_change_1 (opts);
11707 }
11708
11709 /* Print a hint with a suggestion for a core or architecture name that
11710 most closely resembles what the user passed in STR. ARCH is true if
11711 the user is asking for an architecture name. ARCH is false if the user
11712 is asking for a core name. */
11713
11714 static void
11715 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11716 {
11717 auto_vec<const char *> candidates;
11718 const struct processor *entry = arch ? all_architectures : all_cores;
11719 for (; entry->name != NULL; entry++)
11720 candidates.safe_push (entry->name);
11721
11722 #ifdef HAVE_LOCAL_CPU_DETECT
11723 /* Add also "native" as possible value. */
11724 if (arch)
11725 candidates.safe_push ("native");
11726 #endif
11727
11728 char *s;
11729 const char *hint = candidates_list_and_hint (str, s, candidates);
11730 if (hint)
11731 inform (input_location, "valid arguments are: %s;"
11732 " did you mean %qs?", s, hint);
11733 else
11734 inform (input_location, "valid arguments are: %s", s);
11735
11736 XDELETEVEC (s);
11737 }
11738
11739 /* Print a hint with a suggestion for a core name that most closely resembles
11740 what the user passed in STR. */
11741
11742 inline static void
11743 aarch64_print_hint_for_core (const char *str)
11744 {
11745 aarch64_print_hint_for_core_or_arch (str, false);
11746 }
11747
11748 /* Print a hint with a suggestion for an architecture name that most closely
11749 resembles what the user passed in STR. */
11750
11751 inline static void
11752 aarch64_print_hint_for_arch (const char *str)
11753 {
11754 aarch64_print_hint_for_core_or_arch (str, true);
11755 }
11756
11757
11758 /* Print a hint with a suggestion for an extension name
11759 that most closely resembles what the user passed in STR. */
11760
11761 void
11762 aarch64_print_hint_for_extensions (const std::string &str)
11763 {
11764 auto_vec<const char *> candidates;
11765 aarch64_get_all_extension_candidates (&candidates);
11766 char *s;
11767 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11768 if (hint)
11769 inform (input_location, "valid arguments are: %s;"
11770 " did you mean %qs?", s, hint);
11771 else
11772 inform (input_location, "valid arguments are: %s", s);
11773
11774 XDELETEVEC (s);
11775 }
11776
11777 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
11778 specified in STR and throw errors if appropriate. Put the results if
11779 they are valid in RES and ISA_FLAGS. Return whether the option is
11780 valid. */
11781
11782 static bool
11783 aarch64_validate_mcpu (const char *str, const struct processor **res,
11784 uint64_t *isa_flags)
11785 {
11786 std::string invalid_extension;
11787 enum aarch64_parse_opt_result parse_res
11788 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11789
11790 if (parse_res == AARCH64_PARSE_OK)
11791 return true;
11792
11793 switch (parse_res)
11794 {
11795 case AARCH64_PARSE_MISSING_ARG:
11796 error ("missing cpu name in %<-mcpu=%s%>", str);
11797 break;
11798 case AARCH64_PARSE_INVALID_ARG:
11799 error ("unknown value %qs for %<-mcpu%>", str);
11800 aarch64_print_hint_for_core (str);
11801 break;
11802 case AARCH64_PARSE_INVALID_FEATURE:
11803 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11804 invalid_extension.c_str (), str);
11805 aarch64_print_hint_for_extensions (invalid_extension);
11806 break;
11807 default:
11808 gcc_unreachable ();
11809 }
11810
11811 return false;
11812 }
11813
11814 /* Parses CONST_STR for branch protection features specified in
11815 aarch64_branch_protect_types, and set any global variables required. Returns
11816 the parsing result and assigns LAST_STR to the last processed token from
11817 CONST_STR so that it can be used for error reporting. */
11818
11819 static enum
11820 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11821 char** last_str)
11822 {
11823 char *str_root = xstrdup (const_str);
11824 char* token_save = NULL;
11825 char *str = strtok_r (str_root, "+", &token_save);
11826 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11827 if (!str)
11828 res = AARCH64_PARSE_MISSING_ARG;
11829 else
11830 {
11831 char *next_str = strtok_r (NULL, "+", &token_save);
11832 /* Reset the branch protection features to their defaults. */
11833 aarch64_handle_no_branch_protection (NULL, NULL);
11834
11835 while (str && res == AARCH64_PARSE_OK)
11836 {
11837 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11838 bool found = false;
11839 /* Search for this type. */
11840 while (type && type->name && !found && res == AARCH64_PARSE_OK)
11841 {
11842 if (strcmp (str, type->name) == 0)
11843 {
11844 found = true;
11845 res = type->handler (str, next_str);
11846 str = next_str;
11847 next_str = strtok_r (NULL, "+", &token_save);
11848 }
11849 else
11850 type++;
11851 }
11852 if (found && res == AARCH64_PARSE_OK)
11853 {
11854 bool found_subtype = true;
11855 /* Loop through each token until we find one that isn't a
11856 subtype. */
11857 while (found_subtype)
11858 {
11859 found_subtype = false;
11860 const aarch64_branch_protect_type *subtype = type->subtypes;
11861 /* Search for the subtype. */
11862 while (str && subtype && subtype->name && !found_subtype
11863 && res == AARCH64_PARSE_OK)
11864 {
11865 if (strcmp (str, subtype->name) == 0)
11866 {
11867 found_subtype = true;
11868 res = subtype->handler (str, next_str);
11869 str = next_str;
11870 next_str = strtok_r (NULL, "+", &token_save);
11871 }
11872 else
11873 subtype++;
11874 }
11875 }
11876 }
11877 else if (!found)
11878 res = AARCH64_PARSE_INVALID_ARG;
11879 }
11880 }
11881 /* Copy the last processed token into the argument to pass it back.
11882 Used by option and attribute validation to print the offending token. */
11883 if (last_str)
11884 {
11885 if (str) strcpy (*last_str, str);
11886 else *last_str = NULL;
11887 }
11888 if (res == AARCH64_PARSE_OK)
11889 {
11890 /* If needed, alloc the accepted string then copy in const_str.
11891 Used by override_option_after_change_1. */
11892 if (!accepted_branch_protection_string)
11893 accepted_branch_protection_string = (char *) xmalloc (
11894 BRANCH_PROTECT_STR_MAX
11895 + 1);
11896 strncpy (accepted_branch_protection_string, const_str,
11897 BRANCH_PROTECT_STR_MAX + 1);
11898 /* Forcibly null-terminate. */
11899 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11900 }
11901 return res;
11902 }
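
/* For illustration (not part of the original sources): given
   "pac-ret+leaf+bti", the walk above matches the "pac-ret" type, consumes
   its "leaf" subtype, then matches the "bti" type; an unrecognised token
   such as "pac-ret+foo" stops the walk with AARCH64_PARSE_INVALID_ARG and
   "foo" is passed back through LAST_STR for the error message.  */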
11903
11904 static bool
11905 aarch64_validate_mbranch_protection (const char *const_str)
11906 {
11907 char *str = (char *) xmalloc (strlen (const_str) + 1);
11908 enum aarch64_parse_opt_result res =
11909 aarch64_parse_branch_protection (const_str, &str);
11910 if (res == AARCH64_PARSE_INVALID_ARG)
11911 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
11912 else if (res == AARCH64_PARSE_MISSING_ARG)
11913 error ("missing argument for %<-mbranch-protection=%>");
11914 free (str);
11915 return res == AARCH64_PARSE_OK;
11916 }
11917
11918 /* Validate a command-line -march option. Parse the arch and extensions
11919 (if any) specified in STR and throw errors if appropriate. Put the
11920 results, if they are valid, in RES and ISA_FLAGS. Return whether the
11921 option is valid. */
11922
11923 static bool
11924 aarch64_validate_march (const char *str, const struct processor **res,
11925 uint64_t *isa_flags)
11926 {
11927 std::string invalid_extension;
11928 enum aarch64_parse_opt_result parse_res
11929 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11930
11931 if (parse_res == AARCH64_PARSE_OK)
11932 return true;
11933
11934 switch (parse_res)
11935 {
11936 case AARCH64_PARSE_MISSING_ARG:
11937 error ("missing arch name in %<-march=%s%>", str);
11938 break;
11939 case AARCH64_PARSE_INVALID_ARG:
11940 error ("unknown value %qs for %<-march%>", str);
11941 aarch64_print_hint_for_arch (str);
11942 break;
11943 case AARCH64_PARSE_INVALID_FEATURE:
11944 error ("invalid feature modifier %qs in %<-march=%s%>",
11945 invalid_extension.c_str (), str);
11946 aarch64_print_hint_for_extensions (invalid_extension);
11947 break;
11948 default:
11949 gcc_unreachable ();
11950 }
11951
11952 return false;
11953 }
11954
11955 /* Validate a command-line -mtune option. Parse the cpu
11956 specified in STR and throw errors if appropriate. Put the
11957 result, if it is valid, in RES. Return whether the option is
11958 valid. */
11959
11960 static bool
11961 aarch64_validate_mtune (const char *str, const struct processor **res)
11962 {
11963 enum aarch64_parse_opt_result parse_res
11964 = aarch64_parse_tune (str, res);
11965
11966 if (parse_res == AARCH64_PARSE_OK)
11967 return true;
11968
11969 switch (parse_res)
11970 {
11971 case AARCH64_PARSE_MISSING_ARG:
11972 error ("missing cpu name in %<-mtune=%s%>", str);
11973 break;
11974 case AARCH64_PARSE_INVALID_ARG:
11975 error ("unknown value %qs for %<-mtune%>", str);
11976 aarch64_print_hint_for_core (str);
11977 break;
11978 default:
11979 gcc_unreachable ();
11980 }
11981 return false;
11982 }
11983
11984 /* Return the CPU corresponding to the enum CPU.
11985 If it doesn't specify a cpu, return the default. */
11986
11987 static const struct processor *
11988 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11989 {
11990 if (cpu != aarch64_none)
11991 return &all_cores[cpu];
11992
11993 /* The & 0x3f is to extract the bottom 6 bits that encode the
11994 default cpu as selected by the --with-cpu GCC configure option
11995 in config.gcc.
11996 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11997 flags mechanism should be reworked to make it more sane. */
11998 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11999 }
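
/* For illustration (not part of the original sources): TARGET_CPU_DEFAULT
   packs the configure-time default as (isa_flags << 6) | cpu_index, so the
   "& 0x3f" above recovers the core index while aarch64_override_options
   recovers the flags with "TARGET_CPU_DEFAULT >> 6".  */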
12000
12001 /* Return the architecture corresponding to the enum ARCH.
12002 If it doesn't specify a valid architecture, return the default. */
12003
12004 static const struct processor *
12005 aarch64_get_arch (enum aarch64_arch arch)
12006 {
12007 if (arch != aarch64_no_arch)
12008 return &all_architectures[arch];
12009
12010 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12011
12012 return &all_architectures[cpu->arch];
12013 }
12014
12015 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12016
12017 static poly_uint16
12018 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12019 {
12020 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12021 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12022 deciding which .md file patterns to use and when deciding whether
12023 something is a legitimate address or constant. */
12024 if (value == SVE_SCALABLE || value == SVE_128)
12025 return poly_uint16 (2, 2);
12026 else
12027 return (int) value / 64;
12028 }
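
/* For illustration (not part of the original sources):
   -msve-vector-bits=512 gives a fixed VG of 512/64 = 8 granules, while
   both "scalable" and 128 give the runtime-variable poly_uint16 (2, 2),
   i.e. 2 + 2*x 64-bit granules for some non-negative runtime x.  */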
12029
12030 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12031 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12032 tuning structs. In particular it must set selected_tune and
12033 aarch64_isa_flags that define the available ISA features and tuning
12034 decisions. It must also set selected_arch as this will be used to
12035 output the .arch asm tags for each function. */
12036
12037 static void
12038 aarch64_override_options (void)
12039 {
12040 uint64_t cpu_isa = 0;
12041 uint64_t arch_isa = 0;
12042 aarch64_isa_flags = 0;
12043
12044 bool valid_cpu = true;
12045 bool valid_tune = true;
12046 bool valid_arch = true;
12047
12048 selected_cpu = NULL;
12049 selected_arch = NULL;
12050 selected_tune = NULL;
12051
12052 if (aarch64_branch_protection_string)
12053 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12054
12055 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12056 If either of -march or -mtune is given, they override their
12057 respective component of -mcpu. */
12058 if (aarch64_cpu_string)
12059 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12060 &cpu_isa);
12061
12062 if (aarch64_arch_string)
12063 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12064 &arch_isa);
12065
12066 if (aarch64_tune_string)
12067 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12068
12069 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12070 SUBTARGET_OVERRIDE_OPTIONS;
12071 #endif
12072
12073 /* If the user did not specify a processor, choose the default
12074 one for them. This will be the CPU set during configuration using
12075 --with-cpu, otherwise it is "generic". */
12076 if (!selected_cpu)
12077 {
12078 if (selected_arch)
12079 {
12080 selected_cpu = &all_cores[selected_arch->ident];
12081 aarch64_isa_flags = arch_isa;
12082 explicit_arch = selected_arch->arch;
12083 }
12084 else
12085 {
12086 /* Get default configure-time CPU. */
12087 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12088 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12089 }
12090
12091 if (selected_tune)
12092 explicit_tune_core = selected_tune->ident;
12093 }
12094 /* If both -mcpu and -march are specified check that they are architecturally
12095 compatible, warn if they're not and prefer the -march ISA flags. */
12096 else if (selected_arch)
12097 {
12098 if (selected_arch->arch != selected_cpu->arch)
12099 {
12100 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12101 all_architectures[selected_cpu->arch].name,
12102 selected_arch->name);
12103 }
12104 aarch64_isa_flags = arch_isa;
12105 explicit_arch = selected_arch->arch;
12106 explicit_tune_core = selected_tune ? selected_tune->ident
12107 : selected_cpu->ident;
12108 }
12109 else
12110 {
12111 /* -mcpu but no -march. */
12112 aarch64_isa_flags = cpu_isa;
12113 explicit_tune_core = selected_tune ? selected_tune->ident
12114 : selected_cpu->ident;
12115 gcc_assert (selected_cpu);
12116 selected_arch = &all_architectures[selected_cpu->arch];
12117 explicit_arch = selected_arch->arch;
12118 }
12119
12120 /* Set the arch as well, as we will need it when outputting
12121 the .arch directive in assembly. */
12122 if (!selected_arch)
12123 {
12124 gcc_assert (selected_cpu);
12125 selected_arch = &all_architectures[selected_cpu->arch];
12126 }
12127
12128 if (!selected_tune)
12129 selected_tune = selected_cpu;
12130
12131 if (aarch64_enable_bti == 2)
12132 {
12133 #ifdef TARGET_ENABLE_BTI
12134 aarch64_enable_bti = 1;
12135 #else
12136 aarch64_enable_bti = 0;
12137 #endif
12138 }
12139
12140 /* Return address signing is currently not supported for ILP32 targets. For
12141 LP64 targets use the configured option in the absence of a command-line
12142 option for -mbranch-protection. */
12143 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12144 {
12145 #ifdef TARGET_ENABLE_PAC_RET
12146 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12147 #else
12148 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12149 #endif
12150 }
12151
12152 #ifndef HAVE_AS_MABI_OPTION
12153 /* The compiler may have been configured with 2.23.* binutils, which does
12154 not have support for ILP32. */
12155 if (TARGET_ILP32)
12156 error ("assembler does not support %<-mabi=ilp32%>");
12157 #endif
12158
12159 /* Convert -msve-vector-bits to a VG count. */
12160 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12161
12162 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12163 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12164
12165 /* Make sure we properly set up the explicit options. */
12166 if ((aarch64_cpu_string && valid_cpu)
12167 || (aarch64_tune_string && valid_tune))
12168 gcc_assert (explicit_tune_core != aarch64_none);
12169
12170 if ((aarch64_cpu_string && valid_cpu)
12171 || (aarch64_arch_string && valid_arch))
12172 gcc_assert (explicit_arch != aarch64_no_arch);
12173
12174 /* The pass to insert speculation tracking runs before
12175 shrink-wrapping and the latter does not know how to update the
12176 tracking status. So disable it in this case. */
12177 if (aarch64_track_speculation)
12178 flag_shrink_wrap = 0;
12179
12180 aarch64_override_options_internal (&global_options);
12181
12182 /* Save these options as the default ones in case we push and pop them later
12183 while processing functions with potential target attributes. */
12184 target_option_default_node = target_option_current_node
12185 = build_target_option_node (&global_options);
12186 }
12187
12188 /* Implement targetm.override_options_after_change. */
12189
12190 static void
12191 aarch64_override_options_after_change (void)
12192 {
12193 aarch64_override_options_after_change_1 (&global_options);
12194 }
12195
12196 static struct machine_function *
12197 aarch64_init_machine_status (void)
12198 {
12199 struct machine_function *machine;
12200 machine = ggc_cleared_alloc<machine_function> ();
12201 return machine;
12202 }
12203
12204 void
12205 aarch64_init_expanders (void)
12206 {
12207 init_machine_status = aarch64_init_machine_status;
12208 }
12209
12210 /* Select the code model to use, taking the PIC flags into account. */
12211 static void
12212 initialize_aarch64_code_model (struct gcc_options *opts)
12213 {
12214 if (opts->x_flag_pic)
12215 {
12216 switch (opts->x_aarch64_cmodel_var)
12217 {
12218 case AARCH64_CMODEL_TINY:
12219 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12220 break;
12221 case AARCH64_CMODEL_SMALL:
12222 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12223 aarch64_cmodel = (flag_pic == 2
12224 ? AARCH64_CMODEL_SMALL_PIC
12225 : AARCH64_CMODEL_SMALL_SPIC);
12226 #else
12227 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12228 #endif
12229 break;
12230 case AARCH64_CMODEL_LARGE:
12231 sorry ("code model %qs with %<-f%s%>", "large",
12232 opts->x_flag_pic > 1 ? "PIC" : "pic");
12233 break;
12234 default:
12235 gcc_unreachable ();
12236 }
12237 }
12238 else
12239 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12240 }
12241
12242 /* Implement TARGET_OPTION_SAVE. */
12243
12244 static void
12245 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12246 {
12247 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12248 ptr->x_aarch64_branch_protection_string
12249 = opts->x_aarch64_branch_protection_string;
12250 }
12251
12252 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12253 using the information saved in PTR. */
12254
12255 static void
12256 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12257 {
12258 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12259 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12260 opts->x_explicit_arch = ptr->x_explicit_arch;
12261 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12262 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12263 opts->x_aarch64_branch_protection_string
12264 = ptr->x_aarch64_branch_protection_string;
12265 if (opts->x_aarch64_branch_protection_string)
12266 {
12267 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12268 NULL);
12269 }
12270
12271 aarch64_override_options_internal (opts);
12272 }
12273
12274 /* Implement TARGET_OPTION_PRINT. */
12275
12276 static void
12277 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12278 {
12279 const struct processor *cpu
12280 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12281 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12282 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12283 std::string extension
12284 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12285
12286 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12287 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12288 arch->name, extension.c_str ());
12289 }
12290
12291 static GTY(()) tree aarch64_previous_fndecl;
12292
12293 void
12294 aarch64_reset_previous_fndecl (void)
12295 {
12296 aarch64_previous_fndecl = NULL;
12297 }
12298
12299 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12300 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12301 make sure optab availability predicates are recomputed when necessary. */
12302
12303 void
12304 aarch64_save_restore_target_globals (tree new_tree)
12305 {
12306 if (TREE_TARGET_GLOBALS (new_tree))
12307 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12308 else if (new_tree == target_option_default_node)
12309 restore_target_globals (&default_target_globals);
12310 else
12311 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12312 }
12313
12314 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12315 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12316 of the function, if such exists. This function may be called multiple
12317 times on a single function so use aarch64_previous_fndecl to avoid
12318 setting up identical state. */
12319
12320 static void
12321 aarch64_set_current_function (tree fndecl)
12322 {
12323 if (!fndecl || fndecl == aarch64_previous_fndecl)
12324 return;
12325
12326 tree old_tree = (aarch64_previous_fndecl
12327 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12328 : NULL_TREE);
12329
12330 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12331
12332 /* If current function has no attributes but the previous one did,
12333 use the default node. */
12334 if (!new_tree && old_tree)
12335 new_tree = target_option_default_node;
12336
12337 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12338 the default have been handled by aarch64_save_restore_target_globals from
12339 aarch64_pragma_target_parse. */
12340 if (old_tree == new_tree)
12341 return;
12342
12343 aarch64_previous_fndecl = fndecl;
12344
12345 /* First set the target options. */
12346 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12347
12348 aarch64_save_restore_target_globals (new_tree);
12349 }
12350
12351 /* Enum describing the various ways we can handle attributes.
12352 In many cases we can reuse the generic option handling machinery. */
12353
12354 enum aarch64_attr_opt_type
12355 {
12356 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12357 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12358 aarch64_attr_enum, /* Attribute sets an enum variable. */
12359 aarch64_attr_custom /* Attribute requires a custom handling function. */
12360 };
12361
12362 /* All the information needed to handle a target attribute.
12363 NAME is the name of the attribute.
12364 ATTR_TYPE specifies the type of behavior of the attribute as described
12365 in the definition of enum aarch64_attr_opt_type.
12366 ALLOW_NEG is true if the attribute supports a "no-" form.
12367 HANDLER is the function that takes the attribute string as an argument.
12368 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12369 OPT_NUM is the enum specifying the option that the attribute modifies.
12370 This is needed for attributes that mirror the behavior of a command-line
12371 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12372 aarch64_attr_enum. */
12373
12374 struct aarch64_attribute_info
12375 {
12376 const char *name;
12377 enum aarch64_attr_opt_type attr_type;
12378 bool allow_neg;
12379 bool (*handler) (const char *);
12380 enum opt_code opt_num;
12381 };
12382
12383 /* Handle the ARCH_STR argument to the arch= target attribute. */
12384
12385 static bool
12386 aarch64_handle_attr_arch (const char *str)
12387 {
12388 const struct processor *tmp_arch = NULL;
12389 std::string invalid_extension;
12390 enum aarch64_parse_opt_result parse_res
12391 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12392
12393 if (parse_res == AARCH64_PARSE_OK)
12394 {
12395 gcc_assert (tmp_arch);
12396 selected_arch = tmp_arch;
12397 explicit_arch = selected_arch->arch;
12398 return true;
12399 }
12400
12401 switch (parse_res)
12402 {
12403 case AARCH64_PARSE_MISSING_ARG:
12404 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12405 break;
12406 case AARCH64_PARSE_INVALID_ARG:
12407 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12408 aarch64_print_hint_for_arch (str);
12409 break;
12410 case AARCH64_PARSE_INVALID_FEATURE:
12411 error ("invalid feature modifier %s of value (\"%s\") in "
12412 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12413 aarch64_print_hint_for_extensions (invalid_extension);
12414 break;
12415 default:
12416 gcc_unreachable ();
12417 }
12418
12419 return false;
12420 }
12421
12422 /* Handle the argument CPU_STR to the cpu= target attribute. */
12423
12424 static bool
12425 aarch64_handle_attr_cpu (const char *str)
12426 {
12427 const struct processor *tmp_cpu = NULL;
12428 std::string invalid_extension;
12429 enum aarch64_parse_opt_result parse_res
12430 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12431
12432 if (parse_res == AARCH64_PARSE_OK)
12433 {
12434 gcc_assert (tmp_cpu);
12435 selected_tune = tmp_cpu;
12436 explicit_tune_core = selected_tune->ident;
12437
12438 selected_arch = &all_architectures[tmp_cpu->arch];
12439 explicit_arch = selected_arch->arch;
12440 return true;
12441 }
12442
12443 switch (parse_res)
12444 {
12445 case AARCH64_PARSE_MISSING_ARG:
12446 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12447 break;
12448 case AARCH64_PARSE_INVALID_ARG:
12449 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12450 aarch64_print_hint_for_core (str);
12451 break;
12452 case AARCH64_PARSE_INVALID_FEATURE:
12453 error ("invalid feature modifier %s of value (\"%s\") in "
12454 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12455 aarch64_print_hint_for_extensions (invalid_extension);
12456 break;
12457 default:
12458 gcc_unreachable ();
12459 }
12460
12461 return false;
12462 }
12463
12464 /* Handle the argument STR to the branch-protection= attribute. */
12465
12466 static bool
12467 aarch64_handle_attr_branch_protection (const char* str)
12468 {
12469 char *err_str = (char *) xmalloc (strlen (str) + 1);
12470 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12471 &err_str);
12472 bool success = false;
12473 switch (res)
12474 {
12475 case AARCH64_PARSE_MISSING_ARG:
12476 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12477 " attribute");
12478 break;
12479 case AARCH64_PARSE_INVALID_ARG:
12480 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12481 "=\")%> pragma or attribute", err_str);
12482 break;
12483 case AARCH64_PARSE_OK:
12484 success = true;
12485 /* Fall through. */
12486 case AARCH64_PARSE_INVALID_FEATURE:
12487 break;
12488 default:
12489 gcc_unreachable ();
12490 }
12491 free (err_str);
12492 return success;
12493 }
12494
12495 /* Handle the argument STR to the tune= target attribute. */
12496
12497 static bool
12498 aarch64_handle_attr_tune (const char *str)
12499 {
12500 const struct processor *tmp_tune = NULL;
12501 enum aarch64_parse_opt_result parse_res
12502 = aarch64_parse_tune (str, &tmp_tune);
12503
12504 if (parse_res == AARCH64_PARSE_OK)
12505 {
12506 gcc_assert (tmp_tune);
12507 selected_tune = tmp_tune;
12508 explicit_tune_core = selected_tune->ident;
12509 return true;
12510 }
12511
12512 switch (parse_res)
12513 {
12514 case AARCH64_PARSE_INVALID_ARG:
12515 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12516 aarch64_print_hint_for_core (str);
12517 break;
12518 default:
12519 gcc_unreachable ();
12520 }
12521
12522 return false;
12523 }
12524
12525 /* Parse an architecture extensions target attribute string specified in STR.
12526 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12527 if successful. Update aarch64_isa_flags to reflect the ISA features
12528 modified. */
12529
12530 static bool
12531 aarch64_handle_attr_isa_flags (char *str)
12532 {
12533 enum aarch64_parse_opt_result parse_res;
12534 uint64_t isa_flags = aarch64_isa_flags;
12535
12536 /* We allow "+nothing" at the beginning to clear out all architectural
12537 features if the user wants to handpick specific features. */
12538 if (strncmp ("+nothing", str, 8) == 0)
12539 {
12540 isa_flags = 0;
12541 str += 8;
12542 }
12543
12544 std::string invalid_extension;
12545 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12546
12547 if (parse_res == AARCH64_PARSE_OK)
12548 {
12549 aarch64_isa_flags = isa_flags;
12550 return true;
12551 }
12552
12553 switch (parse_res)
12554 {
12555 case AARCH64_PARSE_MISSING_ARG:
12556 error ("missing value in %<target()%> pragma or attribute");
12557 break;
12558
12559 case AARCH64_PARSE_INVALID_FEATURE:
12560 error ("invalid feature modifier %s of value (\"%s\") in "
12561 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12562 break;
12563
12564 default:
12565 gcc_unreachable ();
12566 }
12567
12568 return false;
12569 }
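/* Illustrative usage (a sketch, function names invented): a bare extension
   string is routed here by aarch64_process_one_target_attr, e.g.

     __attribute__ ((target ("+crc+nosimd")))   // adjust the current ISA flags
     unsigned g (unsigned x);

     __attribute__ ((target ("+nothing+fp")))   // start from no features,
     double h (double x);                       // then enable FP only  */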
12570
12571 /* The target attributes that we support. On top of these we also support just
12572 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12573 handled explicitly in aarch64_process_one_target_attr. */
12574
12575 static const struct aarch64_attribute_info aarch64_attributes[] =
12576 {
12577 { "general-regs-only", aarch64_attr_mask, false, NULL,
12578 OPT_mgeneral_regs_only },
12579 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12580 OPT_mfix_cortex_a53_835769 },
12581 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12582 OPT_mfix_cortex_a53_843419 },
12583 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12584 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12585 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12586 OPT_momit_leaf_frame_pointer },
12587 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12588 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12589 OPT_march_ },
12590 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12591 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12592 OPT_mtune_ },
12593 { "branch-protection", aarch64_attr_custom, false,
12594 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12595 { "sign-return-address", aarch64_attr_enum, false, NULL,
12596 OPT_msign_return_address_ },
12597 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12598 };
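/* Illustrative mapping (a sketch) from source-level strings to the table
   rows above:

     target ("general-regs-only")         -> aarch64_attr_mask, no argument
     target ("no-fix-cortex-a53-835769")  -> aarch64_attr_bool, negated form
     target ("cmodel=small")              -> aarch64_attr_enum, needs an argument
     target ("arch=armv8.1-a")            -> aarch64_attr_custom, calls
                                             aarch64_handle_attr_arch  */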
12599
12600 /* Parse ARG_STR which contains the definition of one target attribute.
12601 Show appropriate errors if any or return true if the attribute is valid. */
12602
12603 static bool
12604 aarch64_process_one_target_attr (char *arg_str)
12605 {
12606 bool invert = false;
12607
12608 size_t len = strlen (arg_str);
12609
12610 if (len == 0)
12611 {
12612 error ("malformed %<target()%> pragma or attribute");
12613 return false;
12614 }
12615
12616 char *str_to_check = (char *) alloca (len + 1);
12617 strcpy (str_to_check, arg_str);
12618
12619 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12620 It is easier to detect and handle it explicitly here rather than going
12621 through the machinery for the rest of the target attributes in this
12622 function. */
12623 if (*str_to_check == '+')
12624 return aarch64_handle_attr_isa_flags (str_to_check);
12625
12626 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12627 {
12628 invert = true;
12629 str_to_check += 3;
12630 }
12631 char *arg = strchr (str_to_check, '=');
12632
12633 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12634 and point ARG to "foo". */
12635 if (arg)
12636 {
12637 *arg = '\0';
12638 arg++;
12639 }
12640 const struct aarch64_attribute_info *p_attr;
12641 bool found = false;
12642 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12643 {
12644 /* If the names don't match up, or the user has given an argument
12645 to an attribute that doesn't accept one, or didn't give an argument
12646 to an attribute that expects one, fail to match. */
12647 if (strcmp (str_to_check, p_attr->name) != 0)
12648 continue;
12649
12650 found = true;
12651 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12652 || p_attr->attr_type == aarch64_attr_enum;
12653
12654 if (attr_need_arg_p ^ (arg != NULL))
12655 {
12656 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12657 return false;
12658 }
12659
12660 /* If the name matches but the attribute does not allow "no-" versions
12661 then we can't match. */
12662 if (invert && !p_attr->allow_neg)
12663 {
12664 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12665 return false;
12666 }
12667
12668 switch (p_attr->attr_type)
12669 {
12670 /* Has a custom handler registered.
12671 For example, cpu=, arch=, tune=. */
12672 case aarch64_attr_custom:
12673 gcc_assert (p_attr->handler);
12674 if (!p_attr->handler (arg))
12675 return false;
12676 break;
12677
12678 /* Either set or unset a boolean option. */
12679 case aarch64_attr_bool:
12680 {
12681 struct cl_decoded_option decoded;
12682
12683 generate_option (p_attr->opt_num, NULL, !invert,
12684 CL_TARGET, &decoded);
12685 aarch64_handle_option (&global_options, &global_options_set,
12686 &decoded, input_location);
12687 break;
12688 }
12689 /* Set or unset a bit in the target_flags. aarch64_handle_option
12690 should know what mask to apply given the option number. */
12691 case aarch64_attr_mask:
12692 {
12693 struct cl_decoded_option decoded;
12694 /* We only need to specify the option number.
12695 aarch64_handle_option will know which mask to apply. */
12696 decoded.opt_index = p_attr->opt_num;
12697 decoded.value = !invert;
12698 aarch64_handle_option (&global_options, &global_options_set,
12699 &decoded, input_location);
12700 break;
12701 }
12702 /* Use the option setting machinery to set an option to an enum. */
12703 case aarch64_attr_enum:
12704 {
12705 gcc_assert (arg);
12706 bool valid;
12707 int value;
12708 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12709 &value, CL_TARGET);
12710 if (valid)
12711 {
12712 set_option (&global_options, NULL, p_attr->opt_num, value,
12713 NULL, DK_UNSPECIFIED, input_location,
12714 global_dc);
12715 }
12716 else
12717 {
12718 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12719 }
12720 break;
12721 }
12722 default:
12723 gcc_unreachable ();
12724 }
12725 }
12726
12727 /* If we reached this point, we either found an attribute and validated
12728 it, or didn't match any. If we matched an attribute but its arguments
12729 were malformed, we will have already returned false. */
12730 return found;
12731 }
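/* Worked example (a sketch) of the parsing above:

     "no-omit-leaf-frame-pointer"  -> invert = true, matches the
                                      "omit-leaf-frame-pointer" row
                                      (aarch64_attr_bool, allow_neg)
     "tune=cortex-a72"             -> str_to_check = "tune", arg = "cortex-a72",
                                      dispatched to aarch64_handle_attr_tune
     "+crc"                        -> handled early by
                                      aarch64_handle_attr_isa_flags  */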
12732
12733 /* Count how many times the character C appears in
12734 NULL-terminated string STR. */
12735
12736 static unsigned int
12737 num_occurences_in_str (char c, char *str)
12738 {
12739 unsigned int res = 0;
12740 while (*str != '\0')
12741 {
12742 if (*str == c)
12743 res++;
12744
12745 str++;
12746 }
12747
12748 return res;
12749 }
12750
12751 /* Parse the tree in ARGS that contains the target attribute information
12752 and update the global target options space. */
12753
12754 bool
12755 aarch64_process_target_attr (tree args)
12756 {
12757 if (TREE_CODE (args) == TREE_LIST)
12758 {
12759 do
12760 {
12761 tree head = TREE_VALUE (args);
12762 if (head)
12763 {
12764 if (!aarch64_process_target_attr (head))
12765 return false;
12766 }
12767 args = TREE_CHAIN (args);
12768 } while (args);
12769
12770 return true;
12771 }
12772
12773 if (TREE_CODE (args) != STRING_CST)
12774 {
12775 error ("attribute %<target%> argument not a string");
12776 return false;
12777 }
12778
12779 size_t len = strlen (TREE_STRING_POINTER (args));
12780 char *str_to_check = (char *) alloca (len + 1);
12781 strcpy (str_to_check, TREE_STRING_POINTER (args));
12782
12783 if (len == 0)
12784 {
12785 error ("malformed %<target()%> pragma or attribute");
12786 return false;
12787 }
12788
12789 /* Used to catch empty tokens between commas, e.g.
12790 attribute ((target ("attr1,,attr2"))). */
12791 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12792
12793 /* Handle multiple target attributes separated by ','. */
12794 char *token = strtok_r (str_to_check, ",", &str_to_check);
12795
12796 unsigned int num_attrs = 0;
12797 while (token)
12798 {
12799 num_attrs++;
12800 if (!aarch64_process_one_target_attr (token))
12801 {
12802 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12803 return false;
12804 }
12805
12806 token = strtok_r (NULL, ",", &str_to_check);
12807 }
12808
12809 if (num_attrs != num_commas + 1)
12810 {
12811 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12812 return false;
12813 }
12814
12815 return true;
12816 }
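/* Illustrative input (a sketch): a comma-separated attribute such as

     __attribute__ ((target ("arch=armv8.2-a+crypto,strict-align,tune=cortex-a72")))

   is split into three tokens, each passed to
   aarch64_process_one_target_attr.  A string with an empty token, e.g.
   "attr1,,attr2", yields two tokens but two commas, so the
   num_attrs != num_commas + 1 check above rejects it as malformed.  */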
12817
12818 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
12819 process attribute ((target ("..."))). */
12820
12821 static bool
12822 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12823 {
12824 struct cl_target_option cur_target;
12825 bool ret;
12826 tree old_optimize;
12827 tree new_target, new_optimize;
12828 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12829
12830 /* If what we're processing is the current pragma string then the
12831 target option node is already stored in target_option_current_node
12832 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
12833 having to re-parse the string. This is especially useful to keep
12834 arm_neon.h compile times down since that header contains a lot
12835 of intrinsics enclosed in pragmas. */
12836 if (!existing_target && args == current_target_pragma)
12837 {
12838 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12839 return true;
12840 }
12841 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12842
12843 old_optimize = build_optimization_node (&global_options);
12844 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12845
12846 /* If the function changed the optimization levels as well as setting
12847 target options, start with the optimizations specified. */
12848 if (func_optimize && func_optimize != old_optimize)
12849 cl_optimization_restore (&global_options,
12850 TREE_OPTIMIZATION (func_optimize));
12851
12852 /* Save the current target options to restore at the end. */
12853 cl_target_option_save (&cur_target, &global_options);
12854
12855 /* If fndecl already has some target attributes applied to it, unpack
12856 them so that we add this attribute on top of them, rather than
12857 overwriting them. */
12858 if (existing_target)
12859 {
12860 struct cl_target_option *existing_options
12861 = TREE_TARGET_OPTION (existing_target);
12862
12863 if (existing_options)
12864 cl_target_option_restore (&global_options, existing_options);
12865 }
12866 else
12867 cl_target_option_restore (&global_options,
12868 TREE_TARGET_OPTION (target_option_current_node));
12869
12870 ret = aarch64_process_target_attr (args);
12871
12872 /* Set up any additional state. */
12873 if (ret)
12874 {
12875 aarch64_override_options_internal (&global_options);
12876 /* Initialize SIMD builtins if we haven't already.
12877 Set current_target_pragma to NULL for the duration so that
12878 the builtin initialization code doesn't try to tag the functions
12879 being built with the attributes specified by any current pragma, thus
12880 going into an infinite recursion. */
12881 if (TARGET_SIMD)
12882 {
12883 tree saved_current_target_pragma = current_target_pragma;
12884 current_target_pragma = NULL;
12885 aarch64_init_simd_builtins ();
12886 current_target_pragma = saved_current_target_pragma;
12887 }
12888 new_target = build_target_option_node (&global_options);
12889 }
12890 else
12891 new_target = NULL;
12892
12893 new_optimize = build_optimization_node (&global_options);
12894
12895 if (fndecl && ret)
12896 {
12897 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12898
12899 if (old_optimize != new_optimize)
12900 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12901 }
12902
12903 cl_target_option_restore (&global_options, &cur_target);
12904
12905 if (old_optimize != new_optimize)
12906 cl_optimization_restore (&global_options,
12907 TREE_OPTIMIZATION (old_optimize));
12908 return ret;
12909 }
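/* Illustrative fast path (a sketch): a header that wraps many intrinsic
   declarations in a pragma, roughly the way arm_neon.h does, e.g.

     #pragma GCC push_options
     #pragma GCC target ("+nothing+simd")
     ... intrinsic declarations ...
     #pragma GCC pop_options

   lets every declaration reuse target_option_current_node instead of
   re-parsing the string.  The exact pragma string used by arm_neon.h may
   differ between releases.  */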
12910
12911 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
12912 tri-bool options (yes, no, don't care) and the default value is
12913 DEF, determine whether to reject inlining. */
12914
12915 static bool
12916 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12917 int dont_care, int def)
12918 {
12919 /* If the callee doesn't care, always allow inlining. */
12920 if (callee == dont_care)
12921 return true;
12922
12923 /* If the caller doesn't care, always allow inlining. */
12924 if (caller == dont_care)
12925 return true;
12926
12927 /* Otherwise, allow inlining if either the callee and caller values
12928 agree, or if the callee is using the default value. */
12929 return (callee == caller || callee == def);
12930 }
12931
12932 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
12933 to inline CALLEE into CALLER based on target-specific info.
12934 Make sure that the caller and callee have compatible architectural
12935 features. Then go through the other possible target attributes
12936 and see if they can block inlining. Try not to reject always_inline
12937 callees unless they are incompatible architecturally. */
12938
12939 static bool
12940 aarch64_can_inline_p (tree caller, tree callee)
12941 {
12942 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12943 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12944
12945 struct cl_target_option *caller_opts
12946 = TREE_TARGET_OPTION (caller_tree ? caller_tree
12947 : target_option_default_node);
12948
12949 struct cl_target_option *callee_opts
12950 = TREE_TARGET_OPTION (callee_tree ? callee_tree
12951 : target_option_default_node);
12952
12953 /* Callee's ISA flags should be a subset of the caller's. */
12954 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12955 != callee_opts->x_aarch64_isa_flags)
12956 return false;
12957
12958 /* Allow non-strict aligned functions inlining into strict
12959 aligned ones. */
12960 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12961 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12962 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12963 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12964 return false;
12965
12966 bool always_inline = lookup_attribute ("always_inline",
12967 DECL_ATTRIBUTES (callee));
12968
12969 /* If the architectural features match up and the callee is always_inline
12970 then the other attributes don't matter. */
12971 if (always_inline)
12972 return true;
12973
12974 if (caller_opts->x_aarch64_cmodel_var
12975 != callee_opts->x_aarch64_cmodel_var)
12976 return false;
12977
12978 if (caller_opts->x_aarch64_tls_dialect
12979 != callee_opts->x_aarch64_tls_dialect)
12980 return false;
12981
12982 /* Honour explicit requests to workaround errata. */
12983 if (!aarch64_tribools_ok_for_inlining_p (
12984 caller_opts->x_aarch64_fix_a53_err835769,
12985 callee_opts->x_aarch64_fix_a53_err835769,
12986 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12987 return false;
12988
12989 if (!aarch64_tribools_ok_for_inlining_p (
12990 caller_opts->x_aarch64_fix_a53_err843419,
12991 callee_opts->x_aarch64_fix_a53_err843419,
12992 2, TARGET_FIX_ERR_A53_843419))
12993 return false;
12994
12995 /* If the user explicitly specified -momit-leaf-frame-pointer for the
12996 caller and callee and they don't match up, reject inlining. */
12997 if (!aarch64_tribools_ok_for_inlining_p (
12998 caller_opts->x_flag_omit_leaf_frame_pointer,
12999 callee_opts->x_flag_omit_leaf_frame_pointer,
13000 2, 1))
13001 return false;
13002
13003 /* If the callee has specific tuning overrides, respect them. */
13004 if (callee_opts->x_aarch64_override_tune_string != NULL
13005 && caller_opts->x_aarch64_override_tune_string == NULL)
13006 return false;
13007
13008 /* If the user specified tuning override strings for the
13009 caller and callee and they don't match up, reject inlining.
13010 We just do a string compare here, we don't analyze the meaning
13011 of the string, as it would be too costly for little gain. */
13012 if (callee_opts->x_aarch64_override_tune_string
13013 && caller_opts->x_aarch64_override_tune_string
13014 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13015 caller_opts->x_aarch64_override_tune_string) != 0))
13016 return false;
13017
13018 return true;
13019 }
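/* Illustrative outcomes (a sketch, hypothetical functions):

     static inline int __attribute__ ((target ("+crc"), always_inline))
     use_crc (int x) { ... }

     int plain (int x) { return use_crc (x); }   // plain -march=armv8-a

   is rejected: the callee's ISA flags (CRC) are not a subset of the
   caller's, and that check is made even for always_inline.  Conversely,
   a callee without strict-align may be inlined into a caller built with
   -mstrict-align, but not the other way around.  */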
13020
13021 /* Return true if SYMBOL_REF X binds locally. */
13022
13023 static bool
13024 aarch64_symbol_binds_local_p (const_rtx x)
13025 {
13026 return (SYMBOL_REF_DECL (x)
13027 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13028 : SYMBOL_REF_LOCAL_P (x));
13029 }
13030
13031 /* Return true if SYMBOL_REF X is thread local */
13032 static bool
13033 aarch64_tls_symbol_p (rtx x)
13034 {
13035 if (! TARGET_HAVE_TLS)
13036 return false;
13037
13038 if (GET_CODE (x) != SYMBOL_REF)
13039 return false;
13040
13041 return SYMBOL_REF_TLS_MODEL (x) != 0;
13042 }
13043
13044 /* Classify a TLS symbol into one of the TLS kinds. */
13045 enum aarch64_symbol_type
13046 aarch64_classify_tls_symbol (rtx x)
13047 {
13048 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13049
13050 switch (tls_kind)
13051 {
13052 case TLS_MODEL_GLOBAL_DYNAMIC:
13053 case TLS_MODEL_LOCAL_DYNAMIC:
13054 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13055
13056 case TLS_MODEL_INITIAL_EXEC:
13057 switch (aarch64_cmodel)
13058 {
13059 case AARCH64_CMODEL_TINY:
13060 case AARCH64_CMODEL_TINY_PIC:
13061 return SYMBOL_TINY_TLSIE;
13062 default:
13063 return SYMBOL_SMALL_TLSIE;
13064 }
13065
13066 case TLS_MODEL_LOCAL_EXEC:
13067 if (aarch64_tls_size == 12)
13068 return SYMBOL_TLSLE12;
13069 else if (aarch64_tls_size == 24)
13070 return SYMBOL_TLSLE24;
13071 else if (aarch64_tls_size == 32)
13072 return SYMBOL_TLSLE32;
13073 else if (aarch64_tls_size == 48)
13074 return SYMBOL_TLSLE48;
13075 else
13076 gcc_unreachable ();
13077
13078 case TLS_MODEL_EMULATED:
13079 case TLS_MODEL_NONE:
13080 return SYMBOL_FORCE_TO_MEM;
13081
13082 default:
13083 gcc_unreachable ();
13084 }
13085 }
13086
13087 /* Return the correct method for accessing X + OFFSET, where X is either
13088 a SYMBOL_REF or LABEL_REF. */
13089
13090 enum aarch64_symbol_type
13091 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13092 {
13093 if (GET_CODE (x) == LABEL_REF)
13094 {
13095 switch (aarch64_cmodel)
13096 {
13097 case AARCH64_CMODEL_LARGE:
13098 return SYMBOL_FORCE_TO_MEM;
13099
13100 case AARCH64_CMODEL_TINY_PIC:
13101 case AARCH64_CMODEL_TINY:
13102 return SYMBOL_TINY_ABSOLUTE;
13103
13104 case AARCH64_CMODEL_SMALL_SPIC:
13105 case AARCH64_CMODEL_SMALL_PIC:
13106 case AARCH64_CMODEL_SMALL:
13107 return SYMBOL_SMALL_ABSOLUTE;
13108
13109 default:
13110 gcc_unreachable ();
13111 }
13112 }
13113
13114 if (GET_CODE (x) == SYMBOL_REF)
13115 {
13116 if (aarch64_tls_symbol_p (x))
13117 return aarch64_classify_tls_symbol (x);
13118
13119 switch (aarch64_cmodel)
13120 {
13121 case AARCH64_CMODEL_TINY:
13122 /* When we retrieve symbol + offset address, we have to make sure
13123 the offset does not cause overflow of the final address. But
13124 we have no way of knowing the address of the symbol at compile time,
13125 so we can't accurately say if the distance between the PC and
13126 symbol + offset is outside the addressable range of +/-1M in the
13127 TINY code model. So we rely on images not being greater than
13128 1M, cap the offset at 1M, and anything beyond 1M will have to
13129 be loaded using an alternative mechanism. Furthermore, if the
13130 symbol is a weak reference to something that isn't known to
13131 resolve to a symbol in this module, then force it to memory. */
13132 if ((SYMBOL_REF_WEAK (x)
13133 && !aarch64_symbol_binds_local_p (x))
13134 || !IN_RANGE (offset, -1048575, 1048575))
13135 return SYMBOL_FORCE_TO_MEM;
13136 return SYMBOL_TINY_ABSOLUTE;
13137
13138 case AARCH64_CMODEL_SMALL:
13139 /* Same reasoning as the tiny code model, but the offset cap here is
13140 4G. */
13141 if ((SYMBOL_REF_WEAK (x)
13142 && !aarch64_symbol_binds_local_p (x))
13143 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13144 HOST_WIDE_INT_C (4294967264)))
13145 return SYMBOL_FORCE_TO_MEM;
13146 return SYMBOL_SMALL_ABSOLUTE;
13147
13148 case AARCH64_CMODEL_TINY_PIC:
13149 if (!aarch64_symbol_binds_local_p (x))
13150 return SYMBOL_TINY_GOT;
13151 return SYMBOL_TINY_ABSOLUTE;
13152
13153 case AARCH64_CMODEL_SMALL_SPIC:
13154 case AARCH64_CMODEL_SMALL_PIC:
13155 if (!aarch64_symbol_binds_local_p (x))
13156 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13157 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13158 return SYMBOL_SMALL_ABSOLUTE;
13159
13160 case AARCH64_CMODEL_LARGE:
13161 /* This is alright even in PIC code as the constant
13162 pool reference is always PC relative and within
13163 the same translation unit. */
13164 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13165 return SYMBOL_SMALL_ABSOLUTE;
13166 else
13167 return SYMBOL_FORCE_TO_MEM;
13168
13169 default:
13170 gcc_unreachable ();
13171 }
13172 }
13173
13174 /* By default push everything into the constant pool. */
13175 return SYMBOL_FORCE_TO_MEM;
13176 }
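/* Illustrative classifications (a sketch) for the default small code model:

     extern int x;  ...  &x         -> SYMBOL_SMALL_ABSOLUTE  (adrp/add)
     the same with -fPIC            -> SYMBOL_SMALL_GOT_4G    (adrp + GOT load)
     extern int w __attribute__ ((weak));  ...  &w
                                    -> SYMBOL_FORCE_TO_MEM if w may be
                                       undefined (does not bind locally)
     &x + 0x200000000               -> SYMBOL_FORCE_TO_MEM, offset outside
                                       the +/-4G window checked above  */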
13177
13178 bool
13179 aarch64_constant_address_p (rtx x)
13180 {
13181 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13182 }
13183
13184 bool
13185 aarch64_legitimate_pic_operand_p (rtx x)
13186 {
13187 if (GET_CODE (x) == SYMBOL_REF
13188 || (GET_CODE (x) == CONST
13189 && GET_CODE (XEXP (x, 0)) == PLUS
13190 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13191 return false;
13192
13193 return true;
13194 }
13195
13196 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13197 that should be rematerialized rather than spilled. */
13198
13199 static bool
13200 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13201 {
13202 /* Support CSE and rematerialization of common constants. */
13203 if (CONST_INT_P (x)
13204 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13205 || GET_CODE (x) == CONST_VECTOR)
13206 return true;
13207
13208 /* Do not allow vector struct mode constants for Advanced SIMD.
13209 We could support 0 and -1 easily, but they need support in
13210 aarch64-simd.md. */
13211 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13212 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13213 return false;
13214
13215 /* Only accept variable-length vector constants if they can be
13216 handled directly.
13217
13218 ??? It would be possible to handle rematerialization of other
13219 constants via secondary reloads. */
13220 if (vec_flags & VEC_ANY_SVE)
13221 return aarch64_simd_valid_immediate (x, NULL);
13222
13223 if (GET_CODE (x) == HIGH)
13224 x = XEXP (x, 0);
13225
13226 /* Accept polynomial constants that can be calculated by using the
13227 destination of a move as the sole temporary. Constants that
13228 require a second temporary cannot be rematerialized (they can't be
13229 forced to memory and also aren't legitimate constants). */
13230 poly_int64 offset;
13231 if (poly_int_rtx_p (x, &offset))
13232 return aarch64_offset_temporaries (false, offset) <= 1;
13233
13234 /* If an offset is being added to something else, we need to allow the
13235 base to be moved into the destination register, meaning that there
13236 are no free temporaries for the offset. */
13237 x = strip_offset (x, &offset);
13238 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13239 return false;
13240
13241 /* Do not allow const (plus (anchor_symbol, const_int)). */
13242 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13243 return false;
13244
13245 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13246 so spilling them is better than rematerialization. */
13247 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13248 return true;
13249
13250 /* Label references are always constant. */
13251 if (GET_CODE (x) == LABEL_REF)
13252 return true;
13253
13254 return false;
13255 }
13256
13257 rtx
13258 aarch64_load_tp (rtx target)
13259 {
13260 if (!target
13261 || GET_MODE (target) != Pmode
13262 || !register_operand (target, Pmode))
13263 target = gen_reg_rtx (Pmode);
13264
13265 /* Can return in any reg. */
13266 emit_insn (gen_aarch64_load_tp_hard (target));
13267 return target;
13268 }
13269
13270 /* On AAPCS systems, this is the "struct __va_list". */
13271 static GTY(()) tree va_list_type;
13272
13273 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13274 Return the type to use as __builtin_va_list.
13275
13276 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13277
13278 struct __va_list
13279 {
13280 void *__stack;
13281 void *__gr_top;
13282 void *__vr_top;
13283 int __gr_offs;
13284 int __vr_offs;
13285 }; */
13286
13287 static tree
13288 aarch64_build_builtin_va_list (void)
13289 {
13290 tree va_list_name;
13291 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13292
13293 /* Create the type. */
13294 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13295 /* Give it the required name. */
13296 va_list_name = build_decl (BUILTINS_LOCATION,
13297 TYPE_DECL,
13298 get_identifier ("__va_list"),
13299 va_list_type);
13300 DECL_ARTIFICIAL (va_list_name) = 1;
13301 TYPE_NAME (va_list_type) = va_list_name;
13302 TYPE_STUB_DECL (va_list_type) = va_list_name;
13303
13304 /* Create the fields. */
13305 f_stack = build_decl (BUILTINS_LOCATION,
13306 FIELD_DECL, get_identifier ("__stack"),
13307 ptr_type_node);
13308 f_grtop = build_decl (BUILTINS_LOCATION,
13309 FIELD_DECL, get_identifier ("__gr_top"),
13310 ptr_type_node);
13311 f_vrtop = build_decl (BUILTINS_LOCATION,
13312 FIELD_DECL, get_identifier ("__vr_top"),
13313 ptr_type_node);
13314 f_groff = build_decl (BUILTINS_LOCATION,
13315 FIELD_DECL, get_identifier ("__gr_offs"),
13316 integer_type_node);
13317 f_vroff = build_decl (BUILTINS_LOCATION,
13318 FIELD_DECL, get_identifier ("__vr_offs"),
13319 integer_type_node);
13320
13321 /* Tell the tree-stdarg pass about our internal offset fields.
13322 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13323 purposes, to identify whether the code is updating the va_list internal
13324 offset fields in an irregular way. */
13325 va_list_gpr_counter_field = f_groff;
13326 va_list_fpr_counter_field = f_vroff;
13327
13328 DECL_ARTIFICIAL (f_stack) = 1;
13329 DECL_ARTIFICIAL (f_grtop) = 1;
13330 DECL_ARTIFICIAL (f_vrtop) = 1;
13331 DECL_ARTIFICIAL (f_groff) = 1;
13332 DECL_ARTIFICIAL (f_vroff) = 1;
13333
13334 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13335 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13336 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13337 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13338 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13339
13340 TYPE_FIELDS (va_list_type) = f_stack;
13341 DECL_CHAIN (f_stack) = f_grtop;
13342 DECL_CHAIN (f_grtop) = f_vrtop;
13343 DECL_CHAIN (f_vrtop) = f_groff;
13344 DECL_CHAIN (f_groff) = f_vroff;
13345
13346 /* Compute its layout. */
13347 layout_type (va_list_type);
13348
13349 return va_list_type;
13350 }
13351
13352 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13353 static void
13354 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13355 {
13356 const CUMULATIVE_ARGS *cum;
13357 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13358 tree stack, grtop, vrtop, groff, vroff;
13359 tree t;
13360 int gr_save_area_size = cfun->va_list_gpr_size;
13361 int vr_save_area_size = cfun->va_list_fpr_size;
13362 int vr_offset;
13363
13364 cum = &crtl->args.info;
13365 if (cfun->va_list_gpr_size)
13366 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13367 cfun->va_list_gpr_size);
13368 if (cfun->va_list_fpr_size)
13369 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13370 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13371
13372 if (!TARGET_FLOAT)
13373 {
13374 gcc_assert (cum->aapcs_nvrn == 0);
13375 vr_save_area_size = 0;
13376 }
13377
13378 f_stack = TYPE_FIELDS (va_list_type_node);
13379 f_grtop = DECL_CHAIN (f_stack);
13380 f_vrtop = DECL_CHAIN (f_grtop);
13381 f_groff = DECL_CHAIN (f_vrtop);
13382 f_vroff = DECL_CHAIN (f_groff);
13383
13384 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13385 NULL_TREE);
13386 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13387 NULL_TREE);
13388 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13389 NULL_TREE);
13390 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13391 NULL_TREE);
13392 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13393 NULL_TREE);
13394
13395 /* Emit code to initialize STACK, which points to the next varargs stack
13396 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13397 by named arguments. STACK is 8-byte aligned. */
13398 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13399 if (cum->aapcs_stack_size > 0)
13400 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13401 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13402 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13403
13404 /* Emit code to initialize GRTOP, the top of the GR save area.
13405 virtual_incoming_args_rtx should have been 16-byte aligned. */
13406 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13407 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13408 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13409
13410 /* Emit code to initialize VRTOP, the top of the VR save area.
13411 This address is gr_save_area_bytes below GRTOP, rounded
13412 down to the next 16-byte boundary. */
13413 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13414 vr_offset = ROUND_UP (gr_save_area_size,
13415 STACK_BOUNDARY / BITS_PER_UNIT);
13416
13417 if (vr_offset)
13418 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13419 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13420 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13421
13422 /* Emit code to initialize GROFF, the offset from GRTOP of the
13423 next GPR argument. */
13424 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13425 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13426 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13427
13428 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13429 of the next VR argument. */
13430 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13431 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13432 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13433 }
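/* Worked example (a sketch): for

     void f (int n, ...);

   with all save areas kept (no tree-stdarg shrinking), va_start leaves

     __gr_offs = -56     // 7 unused GP argument registers * 8 bytes
     __vr_offs = -128    // 8 unused FP/SIMD argument registers * 16 bytes
     __gr_top  = just past the GP save area
     __vr_top  = just past the FP/SIMD save area
     __stack   = address of the first anonymous argument passed on the stack  */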
13434
13435 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13436
13437 static tree
13438 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13439 gimple_seq *post_p ATTRIBUTE_UNUSED)
13440 {
13441 tree addr;
13442 bool indirect_p;
13443 bool is_ha; /* is HFA or HVA. */
13444 bool dw_align; /* double-word align. */
13445 machine_mode ag_mode = VOIDmode;
13446 int nregs;
13447 machine_mode mode;
13448
13449 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13450 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13451 HOST_WIDE_INT size, rsize, adjust, align;
13452 tree t, u, cond1, cond2;
13453
13454 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13455 if (indirect_p)
13456 type = build_pointer_type (type);
13457
13458 mode = TYPE_MODE (type);
13459
13460 f_stack = TYPE_FIELDS (va_list_type_node);
13461 f_grtop = DECL_CHAIN (f_stack);
13462 f_vrtop = DECL_CHAIN (f_grtop);
13463 f_groff = DECL_CHAIN (f_vrtop);
13464 f_vroff = DECL_CHAIN (f_groff);
13465
13466 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13467 f_stack, NULL_TREE);
13468 size = int_size_in_bytes (type);
13469
13470 bool abi_break;
13471 align
13472 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13473
13474 dw_align = false;
13475 adjust = 0;
13476 if (aarch64_vfp_is_call_or_return_candidate (mode,
13477 type,
13478 &ag_mode,
13479 &nregs,
13480 &is_ha))
13481 {
13482 /* No frontends can create types with variable-sized modes, so we
13483 shouldn't be asked to pass or return them. */
13484 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13485
13486 /* TYPE passed in fp/simd registers. */
13487 if (!TARGET_FLOAT)
13488 aarch64_err_no_fpadvsimd (mode);
13489
13490 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13491 unshare_expr (valist), f_vrtop, NULL_TREE);
13492 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13493 unshare_expr (valist), f_vroff, NULL_TREE);
13494
13495 rsize = nregs * UNITS_PER_VREG;
13496
13497 if (is_ha)
13498 {
13499 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13500 adjust = UNITS_PER_VREG - ag_size;
13501 }
13502 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13503 && size < UNITS_PER_VREG)
13504 {
13505 adjust = UNITS_PER_VREG - size;
13506 }
13507 }
13508 else
13509 {
13510 /* TYPE passed in general registers. */
13511 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13512 unshare_expr (valist), f_grtop, NULL_TREE);
13513 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13514 unshare_expr (valist), f_groff, NULL_TREE);
13515 rsize = ROUND_UP (size, UNITS_PER_WORD);
13516 nregs = rsize / UNITS_PER_WORD;
13517
13518 if (align > 8)
13519 {
13520 if (abi_break && warn_psabi)
13521 inform (input_location, "parameter passing for argument of type "
13522 "%qT changed in GCC 9.1", type);
13523 dw_align = true;
13524 }
13525
13526 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13527 && size < UNITS_PER_WORD)
13528 {
13529 adjust = UNITS_PER_WORD - size;
13530 }
13531 }
13532
13533 /* Get a local temporary for the field value. */
13534 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13535
13536 /* Emit code to branch if off >= 0. */
13537 t = build2 (GE_EXPR, boolean_type_node, off,
13538 build_int_cst (TREE_TYPE (off), 0));
13539 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13540
13541 if (dw_align)
13542 {
13543 /* Emit: offs = (offs + 15) & -16. */
13544 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13545 build_int_cst (TREE_TYPE (off), 15));
13546 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13547 build_int_cst (TREE_TYPE (off), -16));
13548 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13549 }
13550 else
13551 roundup = NULL;
13552
13553 /* Update ap.__[g|v]r_offs */
13554 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13555 build_int_cst (TREE_TYPE (off), rsize));
13556 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13557
13558 /* String up. */
13559 if (roundup)
13560 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13561
13562 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13563 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13564 build_int_cst (TREE_TYPE (f_off), 0));
13565 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13566
13567 /* String up: make sure the assignment happens before the use. */
13568 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13569 COND_EXPR_ELSE (cond1) = t;
13570
13571 /* Prepare the trees handling the argument that is passed on the stack;
13572 the top level node will be stored in ON_STACK. */
13573 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13574 if (align > 8)
13575 {
13576 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13577 t = fold_build_pointer_plus_hwi (arg, 15);
13578 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13579 build_int_cst (TREE_TYPE (t), -16));
13580 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13581 }
13582 else
13583 roundup = NULL;
13584 /* Advance ap.__stack */
13585 t = fold_build_pointer_plus_hwi (arg, size + 7);
13586 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13587 build_int_cst (TREE_TYPE (t), -8));
13588 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13589 /* String up roundup and advance. */
13590 if (roundup)
13591 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13592 /* String up with arg */
13593 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13594 /* Big-endianness related address adjustment. */
13595 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13596 && size < UNITS_PER_WORD)
13597 {
13598 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13599 size_int (UNITS_PER_WORD - size));
13600 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13601 }
13602
13603 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13604 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13605
13606 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13607 t = off;
13608 if (adjust)
13609 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13610 build_int_cst (TREE_TYPE (off), adjust));
13611
13612 t = fold_convert (sizetype, t);
13613 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13614
13615 if (is_ha)
13616 {
13617 /* type ha; // treat as "struct {ftype field[n];}"
13618 ... [computing offs]
13619 for (i = 0; i < nregs; ++i, offs += 16)
13620 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13621 return ha; */
13622 int i;
13623 tree tmp_ha, field_t, field_ptr_t;
13624
13625 /* Declare a local variable. */
13626 tmp_ha = create_tmp_var_raw (type, "ha");
13627 gimple_add_tmp_var (tmp_ha);
13628
13629 /* Establish the base type. */
13630 switch (ag_mode)
13631 {
13632 case E_SFmode:
13633 field_t = float_type_node;
13634 field_ptr_t = float_ptr_type_node;
13635 break;
13636 case E_DFmode:
13637 field_t = double_type_node;
13638 field_ptr_t = double_ptr_type_node;
13639 break;
13640 case E_TFmode:
13641 field_t = long_double_type_node;
13642 field_ptr_t = long_double_ptr_type_node;
13643 break;
13644 case E_HFmode:
13645 field_t = aarch64_fp16_type_node;
13646 field_ptr_t = aarch64_fp16_ptr_type_node;
13647 break;
13648 case E_V2SImode:
13649 case E_V4SImode:
13650 {
13651 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13652 field_t = build_vector_type_for_mode (innertype, ag_mode);
13653 field_ptr_t = build_pointer_type (field_t);
13654 }
13655 break;
13656 default:
13657 gcc_assert (0);
13658 }
13659
13660 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
13661 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13662 addr = t;
13663 t = fold_convert (field_ptr_t, addr);
13664 t = build2 (MODIFY_EXPR, field_t,
13665 build1 (INDIRECT_REF, field_t, tmp_ha),
13666 build1 (INDIRECT_REF, field_t, t));
13667
13668 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
13669 for (i = 1; i < nregs; ++i)
13670 {
13671 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13672 u = fold_convert (field_ptr_t, addr);
13673 u = build2 (MODIFY_EXPR, field_t,
13674 build2 (MEM_REF, field_t, tmp_ha,
13675 build_int_cst (field_ptr_t,
13676 (i *
13677 int_size_in_bytes (field_t)))),
13678 build1 (INDIRECT_REF, field_t, u));
13679 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13680 }
13681
13682 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13683 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13684 }
13685
13686 COND_EXPR_ELSE (cond2) = t;
13687 addr = fold_convert (build_pointer_type (type), cond1);
13688 addr = build_va_arg_indirect_ref (addr);
13689
13690 if (indirect_p)
13691 addr = build_va_arg_indirect_ref (addr);
13692
13693 return addr;
13694 }
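/* Rough sketch of the trees built above for a simple "int" argument
   (illustrative only, ignoring alignment and big-endian adjustments):

     off = ap.__gr_offs;
     if (off >= 0)
       addr = on_stack;                    // register area already exhausted
     else
       {
         ap.__gr_offs = off + 8;
         if (ap.__gr_offs > 0)
           addr = on_stack;                // this argument spilled to the stack
         else
           addr = ap.__gr_top + off;       // load from the GP save area
       }

   where on_stack reads ap.__stack and advances it by the rounded size.  */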
13695
13696 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
13697
13698 static void
13699 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13700 tree type, int *pretend_size ATTRIBUTE_UNUSED,
13701 int no_rtl)
13702 {
13703 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13704 CUMULATIVE_ARGS local_cum;
13705 int gr_saved = cfun->va_list_gpr_size;
13706 int vr_saved = cfun->va_list_fpr_size;
13707
13708 /* The caller has advanced CUM up to, but not beyond, the last named
13709 argument. Advance a local copy of CUM past the last "real" named
13710 argument, to find out how many registers are left over. */
13711 local_cum = *cum;
13712 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
13713
13714 /* Find out how many registers we need to save.
13715 Honor tree-stdarg analysis results. */
13716 if (cfun->va_list_gpr_size)
13717 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13718 cfun->va_list_gpr_size / UNITS_PER_WORD);
13719 if (cfun->va_list_fpr_size)
13720 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13721 cfun->va_list_fpr_size / UNITS_PER_VREG);
13722
13723 if (!TARGET_FLOAT)
13724 {
13725 gcc_assert (local_cum.aapcs_nvrn == 0);
13726 vr_saved = 0;
13727 }
13728
13729 if (!no_rtl)
13730 {
13731 if (gr_saved > 0)
13732 {
13733 rtx ptr, mem;
13734
13735 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
13736 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13737 - gr_saved * UNITS_PER_WORD);
13738 mem = gen_frame_mem (BLKmode, ptr);
13739 set_mem_alias_set (mem, get_varargs_alias_set ());
13740
13741 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13742 mem, gr_saved);
13743 }
13744 if (vr_saved > 0)
13745 {
13746 /* We can't use move_block_from_reg, because it will use
13747 the wrong mode, storing D regs only. */
13748 machine_mode mode = TImode;
13749 int off, i, vr_start;
13750
13751 /* Set OFF to the offset from virtual_incoming_args_rtx of
13752 the first vector register. The VR save area lies below
13753 the GR one, and is aligned to 16 bytes. */
13754 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13755 STACK_BOUNDARY / BITS_PER_UNIT);
13756 off -= vr_saved * UNITS_PER_VREG;
13757
13758 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13759 for (i = 0; i < vr_saved; ++i)
13760 {
13761 rtx ptr, mem;
13762
13763 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13764 mem = gen_frame_mem (mode, ptr);
13765 set_mem_alias_set (mem, get_varargs_alias_set ());
13766 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13767 off += UNITS_PER_VREG;
13768 }
13769 }
13770 }
13771
13772 /* We don't save the size into *PRETEND_SIZE because we want to avoid
13773 any complication of having crtl->args.pretend_args_size changed. */
13774 cfun->machine->frame.saved_varargs_size
13775 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13776 STACK_BOUNDARY / BITS_PER_UNIT)
13777 + vr_saved * UNITS_PER_VREG);
13778 }
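/* Illustrative frame layout (a sketch) of the save areas built above,
   relative to virtual_incoming_args_rtx and as later exposed by va_start:

        stack-passed varargs                        <- __stack
        virtual_incoming_args_rtx                   <- __gr_top
        gr_saved * 8 bytes of saved x-registers
        padding up to a 16-byte boundary            <- __vr_top
        vr_saved * 16 bytes of saved q-registers
        (lower addresses)  */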
13779
13780 static void
13781 aarch64_conditional_register_usage (void)
13782 {
13783 int i;
13784 if (!TARGET_FLOAT)
13785 {
13786 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13787 {
13788 fixed_regs[i] = 1;
13789 call_used_regs[i] = 1;
13790 }
13791 }
13792 if (!TARGET_SVE)
13793 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13794 {
13795 fixed_regs[i] = 1;
13796 call_used_regs[i] = 1;
13797 }
13798
13799 /* When tracking speculation, we need a couple of call-clobbered registers
13800 to track the speculation state. It would be nice to just use
13801 IP0 and IP1, but currently there are numerous places that just
13802 assume these registers are free for other uses (e.g. pointer
13803 authentication). */
13804 if (aarch64_track_speculation)
13805 {
13806 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13807 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13808 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13809 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13810 }
13811 }
13812
13813 /* Walk down the type tree of TYPE counting consecutive base elements.
13814 If *MODEP is VOIDmode, then set it to the first valid floating point
13815 type. If a non-floating point type is found, or if a floating point
13816 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13817 otherwise return the count in the sub-tree. */
13818 static int
13819 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13820 {
13821 machine_mode mode;
13822 HOST_WIDE_INT size;
13823
13824 switch (TREE_CODE (type))
13825 {
13826 case REAL_TYPE:
13827 mode = TYPE_MODE (type);
13828 if (mode != DFmode && mode != SFmode
13829 && mode != TFmode && mode != HFmode)
13830 return -1;
13831
13832 if (*modep == VOIDmode)
13833 *modep = mode;
13834
13835 if (*modep == mode)
13836 return 1;
13837
13838 break;
13839
13840 case COMPLEX_TYPE:
13841 mode = TYPE_MODE (TREE_TYPE (type));
13842 if (mode != DFmode && mode != SFmode
13843 && mode != TFmode && mode != HFmode)
13844 return -1;
13845
13846 if (*modep == VOIDmode)
13847 *modep = mode;
13848
13849 if (*modep == mode)
13850 return 2;
13851
13852 break;
13853
13854 case VECTOR_TYPE:
13855 /* Use V2SImode and V4SImode as representatives of all 64-bit
13856 and 128-bit vector types. */
13857 size = int_size_in_bytes (type);
13858 switch (size)
13859 {
13860 case 8:
13861 mode = V2SImode;
13862 break;
13863 case 16:
13864 mode = V4SImode;
13865 break;
13866 default:
13867 return -1;
13868 }
13869
13870 if (*modep == VOIDmode)
13871 *modep = mode;
13872
13873 /* Vector modes are considered to be opaque: two vectors are
13874 equivalent for the purposes of being homogeneous aggregates
13875 if they are the same size. */
13876 if (*modep == mode)
13877 return 1;
13878
13879 break;
13880
13881 case ARRAY_TYPE:
13882 {
13883 int count;
13884 tree index = TYPE_DOMAIN (type);
13885
13886 /* Can't handle incomplete types or sizes that are not
13887 fixed. */
13888 if (!COMPLETE_TYPE_P (type)
13889 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13890 return -1;
13891
13892 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13893 if (count == -1
13894 || !index
13895 || !TYPE_MAX_VALUE (index)
13896 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13897 || !TYPE_MIN_VALUE (index)
13898 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13899 || count < 0)
13900 return -1;
13901
13902 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13903 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13904
13905 /* There must be no padding. */
13906 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13907 count * GET_MODE_BITSIZE (*modep)))
13908 return -1;
13909
13910 return count;
13911 }
13912
13913 case RECORD_TYPE:
13914 {
13915 int count = 0;
13916 int sub_count;
13917 tree field;
13918
13919 /* Can't handle incomplete types or sizes that are not
13920 fixed. */
13921 if (!COMPLETE_TYPE_P (type)
13922 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13923 return -1;
13924
13925 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13926 {
13927 if (TREE_CODE (field) != FIELD_DECL)
13928 continue;
13929
13930 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13931 if (sub_count < 0)
13932 return -1;
13933 count += sub_count;
13934 }
13935
13936 /* There must be no padding. */
13937 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13938 count * GET_MODE_BITSIZE (*modep)))
13939 return -1;
13940
13941 return count;
13942 }
13943
13944 case UNION_TYPE:
13945 case QUAL_UNION_TYPE:
13946 {
13947 /* These aren't very interesting except in a degenerate case. */
13948 int count = 0;
13949 int sub_count;
13950 tree field;
13951
13952 /* Can't handle incomplete types or sizes that are not
13953 fixed. */
13954 if (!COMPLETE_TYPE_P (type)
13955 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13956 return -1;
13957
13958 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13959 {
13960 if (TREE_CODE (field) != FIELD_DECL)
13961 continue;
13962
13963 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13964 if (sub_count < 0)
13965 return -1;
13966 count = count > sub_count ? count : sub_count;
13967 }
13968
13969 /* There must be no padding. */
13970 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13971 count * GET_MODE_BITSIZE (*modep)))
13972 return -1;
13973
13974 return count;
13975 }
13976
13977 default:
13978 break;
13979 }
13980
13981 return -1;
13982 }
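/* Illustrative results (a sketch):

     struct { double a, b; }         ->  2, *modep = DFmode
     _Complex float                  ->  2, *modep = SFmode
     double[3]                       ->  3, *modep = DFmode
     struct { float a; double b; }   -> -1  (element modes differ)
     struct { float a; int b; }      -> -1  (non-FP, non-vector member)  */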
13983
13984 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13985 type as described in AAPCS64 \S 4.1.2.
13986
13987 See the comment above aarch64_composite_type_p for the notes on MODE. */
13988
13989 static bool
13990 aarch64_short_vector_p (const_tree type,
13991 machine_mode mode)
13992 {
13993 poly_int64 size = -1;
13994
13995 if (type && TREE_CODE (type) == VECTOR_TYPE)
13996 size = int_size_in_bytes (type);
13997 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13998 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13999 size = GET_MODE_SIZE (mode);
14000
14001 return known_eq (size, 8) || known_eq (size, 16);
14002 }
14003
14004 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14005 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14006 array types. The C99 floating-point complex types are also considered
14007 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14008 types, which are GCC extensions and out of the scope of AAPCS64, are
14009 treated as composite types here as well.
14010
14011 Note that MODE itself is not sufficient in determining whether a type
14012 is such a composite type or not. This is because
14013 stor-layout.c:compute_record_mode may have already changed the MODE
14014 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14015 structure with only one field may have its MODE set to the mode of the
14016 field. Also an integer mode whose size matches the size of the
14017 RECORD_TYPE type may be used to substitute the original mode
14018 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14019 solely relied on. */
14020
14021 static bool
14022 aarch64_composite_type_p (const_tree type,
14023 machine_mode mode)
14024 {
14025 if (aarch64_short_vector_p (type, mode))
14026 return false;
14027
14028 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14029 return true;
14030
14031 if (mode == BLKmode
14032 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14033 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14034 return true;
14035
14036 return false;
14037 }
14038
14039 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14040 shall be passed or returned in simd/fp register(s) (providing these
14041 parameter passing registers are available).
14042
14043 Upon successful return, *COUNT returns the number of needed registers,
14044 *BASE_MODE returns the mode of the individual register and when IS_HAF
14045 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14046 floating-point aggregate or a homogeneous short-vector aggregate. */
14047
14048 static bool
14049 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14050 const_tree type,
14051 machine_mode *base_mode,
14052 int *count,
14053 bool *is_ha)
14054 {
14055 machine_mode new_mode = VOIDmode;
14056 bool composite_p = aarch64_composite_type_p (type, mode);
14057
14058 if (is_ha != NULL) *is_ha = false;
14059
14060 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14061 || aarch64_short_vector_p (type, mode))
14062 {
14063 *count = 1;
14064 new_mode = mode;
14065 }
14066 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14067 {
14068 if (is_ha != NULL) *is_ha = true;
14069 *count = 2;
14070 new_mode = GET_MODE_INNER (mode);
14071 }
14072 else if (type && composite_p)
14073 {
14074 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14075
14076 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14077 {
14078 if (is_ha != NULL) *is_ha = true;
14079 *count = ag_count;
14080 }
14081 else
14082 return false;
14083 }
14084 else
14085 return false;
14086
14087 *base_mode = new_mode;
14088 return true;
14089 }
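/* Illustrative candidates (a sketch; the vector type assumes arm_neon.h):

     double                          -> *count = 1, base mode DFmode
     struct { float x, y, z; }       -> HFA, *count = 3, base SFmode (s0-s2)
     struct { float32x2_t lo, hi; }  -> HVA, *count = 2, 64-bit vector base
     struct { double d[5]; }         -> rejected, 5 > HA_MAX_NUM_FLDS  */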
14090
14091 /* Implement TARGET_STRUCT_VALUE_RTX. */
14092
14093 static rtx
14094 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14095 int incoming ATTRIBUTE_UNUSED)
14096 {
14097 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14098 }
14099
14100 /* Implements target hook vector_mode_supported_p. */
14101 static bool
14102 aarch64_vector_mode_supported_p (machine_mode mode)
14103 {
14104 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14105 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14106 }
14107
14108 /* Return appropriate SIMD container
14109 for MODE within a vector of WIDTH bits. */
14110 static machine_mode
14111 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14112 {
14113 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14114 switch (mode)
14115 {
14116 case E_DFmode:
14117 return VNx2DFmode;
14118 case E_SFmode:
14119 return VNx4SFmode;
14120 case E_HFmode:
14121 return VNx8HFmode;
14122 case E_DImode:
14123 return VNx2DImode;
14124 case E_SImode:
14125 return VNx4SImode;
14126 case E_HImode:
14127 return VNx8HImode;
14128 case E_QImode:
14129 return VNx16QImode;
14130 default:
14131 return word_mode;
14132 }
14133
14134 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14135 if (TARGET_SIMD)
14136 {
14137 if (known_eq (width, 128))
14138 switch (mode)
14139 {
14140 case E_DFmode:
14141 return V2DFmode;
14142 case E_SFmode:
14143 return V4SFmode;
14144 case E_HFmode:
14145 return V8HFmode;
14146 case E_SImode:
14147 return V4SImode;
14148 case E_HImode:
14149 return V8HImode;
14150 case E_QImode:
14151 return V16QImode;
14152 case E_DImode:
14153 return V2DImode;
14154 default:
14155 break;
14156 }
14157 else
14158 switch (mode)
14159 {
14160 case E_SFmode:
14161 return V2SFmode;
14162 case E_HFmode:
14163 return V4HFmode;
14164 case E_SImode:
14165 return V2SImode;
14166 case E_HImode:
14167 return V4HImode;
14168 case E_QImode:
14169 return V8QImode;
14170 default:
14171 break;
14172 }
14173 }
14174 return word_mode;
14175 }
14176
14177 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14178 static machine_mode
14179 aarch64_preferred_simd_mode (scalar_mode mode)
14180 {
14181 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14182 return aarch64_simd_container_mode (mode, bits);
14183 }
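/* Illustrative mappings (a sketch):

     aarch64_simd_container_mode (SFmode, 128)  -> V4SFmode
     aarch64_simd_container_mode (HImode, 64)   -> V4HImode
     aarch64_preferred_simd_mode (SFmode)       -> V4SFmode, or VNx4SFmode
                                                   when SVE is enabled  */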
14184
14185 /* Return a list of possible vector sizes for the vectorizer
14186 to iterate over. */
14187 static void
14188 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14189 {
14190 if (TARGET_SVE)
14191 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14192 sizes->safe_push (16);
14193 sizes->safe_push (8);
14194 }
14195
14196 /* Implement TARGET_MANGLE_TYPE. */
14197
14198 static const char *
14199 aarch64_mangle_type (const_tree type)
14200 {
14201 /* The AArch64 ABI documents say that "__va_list" has to be
14202 mangled as if it is in the "std" namespace. */
14203 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14204 return "St9__va_list";
14205
14206 /* Half-precision float. */
14207 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14208 return "Dh";
14209
14210 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14211 builtin types. */
14212 if (TYPE_NAME (type) != NULL)
14213 return aarch64_mangle_builtin_type (type);
14214
14215 /* Use the default mangling. */
14216 return NULL;
14217 }
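/* Illustrative manglings (a sketch):

     void f (va_list);   ->  _Z1fSt9__va_list   (as if std::__va_list)
     void g (__fp16);    ->  _Z1gDh             */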
14218
14219 /* Find the first rtx_insn before INSN that will generate an assembly
14220 instruction. */
14221
14222 static rtx_insn *
14223 aarch64_prev_real_insn (rtx_insn *insn)
14224 {
14225 if (!insn)
14226 return NULL;
14227
14228 do
14229 {
14230 insn = prev_real_insn (insn);
14231 }
14232 while (insn && recog_memoized (insn) < 0);
14233
14234 return insn;
14235 }
14236
14237 static bool
14238 is_madd_op (enum attr_type t1)
14239 {
14240 unsigned int i;
14241 /* A number of these may be AArch32 only. */
14242 enum attr_type mlatypes[] = {
14243 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14244 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14245 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14246 };
14247
14248 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14249 {
14250 if (t1 == mlatypes[i])
14251 return true;
14252 }
14253
14254 return false;
14255 }
14256
14257 /* Check if there is a register dependency between a load and the insn
14258 for which we hold recog_data. */
14259
14260 static bool
14261 dep_between_memop_and_curr (rtx memop)
14262 {
14263 rtx load_reg;
14264 int opno;
14265
14266 gcc_assert (GET_CODE (memop) == SET);
14267
14268 if (!REG_P (SET_DEST (memop)))
14269 return false;
14270
14271 load_reg = SET_DEST (memop);
14272 for (opno = 1; opno < recog_data.n_operands; opno++)
14273 {
14274 rtx operand = recog_data.operand[opno];
14275 if (REG_P (operand)
14276 && reg_overlap_mentioned_p (load_reg, operand))
14277 return true;
14278
14279 }
14280 return false;
14281 }
14282
14283
14284 /* When working around the Cortex-A53 erratum 835769,
14285 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14286 instruction and has a preceding memory instruction such that a NOP
14287 should be inserted between them. */
14288
14289 bool
14290 aarch64_madd_needs_nop (rtx_insn* insn)
14291 {
14292 enum attr_type attr_type;
14293 rtx_insn *prev;
14294 rtx body;
14295
14296 if (!TARGET_FIX_ERR_A53_835769)
14297 return false;
14298
14299 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14300 return false;
14301
14302 attr_type = get_attr_type (insn);
14303 if (!is_madd_op (attr_type))
14304 return false;
14305
14306 prev = aarch64_prev_real_insn (insn);
14307 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14308 Restore recog state to INSN to avoid state corruption. */
14309 extract_constrain_insn_cached (insn);
14310
14311 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14312 return false;
14313
14314 body = single_set (prev);
14315
14316 /* If the previous insn is a memory op and there is no dependency between
14317 it and the DImode madd, emit a NOP between them. If body is NULL then we
14318 have a complex memory operation, probably a load/store pair.
14319 Be conservative for now and emit a NOP. */
14320 if (GET_MODE (recog_data.operand[0]) == DImode
14321 && (!body || !dep_between_memop_and_curr (body)))
14322 return true;
14323
14324 return false;
14325
14326 }
14327
14328
14329 /* Implement FINAL_PRESCAN_INSN. */
14330
14331 void
14332 aarch64_final_prescan_insn (rtx_insn *insn)
14333 {
14334 if (aarch64_madd_needs_nop (insn))
14335 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14336 }
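/* Illustrative example: with -mfix-cortex-a53-835769 enabled, a sequence
   such as

	ldr	x1, [x2]
	madd	x0, x3, x4, x5

   has no register dependency between the load and the 64-bit
   multiply-accumulate, so the hook above emits a padding NOP between the
   two instructions, which is the separation the erratum workaround
   requires.  */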
14337
14338
14339 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14340 instruction. */
14341
14342 bool
14343 aarch64_sve_index_immediate_p (rtx base_or_step)
14344 {
14345 return (CONST_INT_P (base_or_step)
14346 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14347 }
14348
14349 /* Return true if X is a valid immediate for the SVE ADD and SUB
14350 instructions. Negate X first if NEGATE_P is true. */
14351
14352 bool
14353 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14354 {
14355 rtx elt;
14356
14357 if (!const_vec_duplicate_p (x, &elt)
14358 || !CONST_INT_P (elt))
14359 return false;
14360
14361 HOST_WIDE_INT val = INTVAL (elt);
14362 if (negate_p)
14363 val = -val;
14364 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14365
14366 if (val & 0xff)
14367 return IN_RANGE (val, 0, 0xff);
14368 return IN_RANGE (val, 0, 0xff00);
14369 }
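/* Worked example (illustrative): the SVE ADD/SUB immediate is an 8-bit
   value, optionally shifted left by 8.  A duplicated element value of 7
   is accepted directly, 0x1100 is accepted as 0x11 << 8, and 0x101 is
   rejected because it needs nonzero bits in both bytes.  */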
14370
14371 /* Return true if X is a valid immediate operand for an SVE logical
14372 instruction such as AND. */
14373
14374 bool
14375 aarch64_sve_bitmask_immediate_p (rtx x)
14376 {
14377 rtx elt;
14378
14379 return (const_vec_duplicate_p (x, &elt)
14380 && CONST_INT_P (elt)
14381 && aarch64_bitmask_imm (INTVAL (elt),
14382 GET_MODE_INNER (GET_MODE (x))));
14383 }
14384
14385 /* Return true if X is a valid immediate for the SVE DUP and CPY
14386 instructions. */
14387
14388 bool
14389 aarch64_sve_dup_immediate_p (rtx x)
14390 {
14391 rtx elt;
14392
14393 if (!const_vec_duplicate_p (x, &elt)
14394 || !CONST_INT_P (elt))
14395 return false;
14396
14397 HOST_WIDE_INT val = INTVAL (elt);
14398 if (val & 0xff)
14399 return IN_RANGE (val, -0x80, 0x7f);
14400 return IN_RANGE (val, -0x8000, 0x7f00);
14401 }
14402
14403 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14404 SIGNED_P says whether the operand is signed rather than unsigned. */
14405
14406 bool
14407 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14408 {
14409 rtx elt;
14410
14411 return (const_vec_duplicate_p (x, &elt)
14412 && CONST_INT_P (elt)
14413 && (signed_p
14414 ? IN_RANGE (INTVAL (elt), -16, 15)
14415 : IN_RANGE (INTVAL (elt), 0, 127)));
14416 }
14417
14418 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14419 instruction. Negate X first if NEGATE_P is true. */
14420
14421 bool
14422 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14423 {
14424 rtx elt;
14425 REAL_VALUE_TYPE r;
14426
14427 if (!const_vec_duplicate_p (x, &elt)
14428 || GET_CODE (elt) != CONST_DOUBLE)
14429 return false;
14430
14431 r = *CONST_DOUBLE_REAL_VALUE (elt);
14432
14433 if (negate_p)
14434 r = real_value_negate (&r);
14435
14436 if (real_equal (&r, &dconst1))
14437 return true;
14438 if (real_equal (&r, &dconsthalf))
14439 return true;
14440 return false;
14441 }
14442
14443 /* Return true if X is a valid immediate operand for an SVE FMUL
14444 instruction. */
14445
14446 bool
14447 aarch64_sve_float_mul_immediate_p (rtx x)
14448 {
14449 rtx elt;
14450
14451 /* GCC will never generate a multiply with an immediate of 2, so there is no
14452 point testing for it (even though it is a valid constant). */
14453 return (const_vec_duplicate_p (x, &elt)
14454 && GET_CODE (elt) == CONST_DOUBLE
14455 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14456 }
14457
14458 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14459 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14460 is nonnull, use it to describe valid immediates. */
14461 static bool
14462 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14463 simd_immediate_info *info,
14464 enum simd_immediate_check which,
14465 simd_immediate_info::insn_type insn)
14466 {
14467 /* Try a 4-byte immediate with LSL. */
14468 for (unsigned int shift = 0; shift < 32; shift += 8)
14469 if ((val32 & (0xff << shift)) == val32)
14470 {
14471 if (info)
14472 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14473 simd_immediate_info::LSL, shift);
14474 return true;
14475 }
14476
14477 /* Try a 2-byte immediate with LSL. */
14478 unsigned int imm16 = val32 & 0xffff;
14479 if (imm16 == (val32 >> 16))
14480 for (unsigned int shift = 0; shift < 16; shift += 8)
14481 if ((imm16 & (0xff << shift)) == imm16)
14482 {
14483 if (info)
14484 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14485 simd_immediate_info::LSL, shift);
14486 return true;
14487 }
14488
14489 /* Try a 4-byte immediate with MSL, except for cases that MVN
14490 can handle. */
14491 if (which == AARCH64_CHECK_MOV)
14492 for (unsigned int shift = 8; shift < 24; shift += 8)
14493 {
14494 unsigned int low = (1 << shift) - 1;
14495 if (((val32 & (0xff << shift)) | low) == val32)
14496 {
14497 if (info)
14498 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14499 simd_immediate_info::MSL, shift);
14500 return true;
14501 }
14502 }
14503
14504 return false;
14505 }
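/* Worked example (illustrative): VAL32 == 0x00ab0000 matches the 4-byte
   LSL loop with shift == 16, so INFO records the value 0xab shifted left
   by 16.  VAL32 == 0x0000abff matches none of the LSL forms, but for
   AARCH64_CHECK_MOV it matches the MSL loop with shift == 8, since
   (0x0000ab00 | 0xff) == 0x0000abff.  */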
14506
14507 /* Return true if replicating VAL64 is a valid immediate for the
14508 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14509 use it to describe valid immediates. */
14510 static bool
14511 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14512 simd_immediate_info *info,
14513 enum simd_immediate_check which)
14514 {
14515 unsigned int val32 = val64 & 0xffffffff;
14516 unsigned int val16 = val64 & 0xffff;
14517 unsigned int val8 = val64 & 0xff;
14518
14519 if (val32 == (val64 >> 32))
14520 {
14521 if ((which & AARCH64_CHECK_ORR) != 0
14522 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14523 simd_immediate_info::MOV))
14524 return true;
14525
14526 if ((which & AARCH64_CHECK_BIC) != 0
14527 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14528 simd_immediate_info::MVN))
14529 return true;
14530
14531 /* Try using a replicated byte. */
14532 if (which == AARCH64_CHECK_MOV
14533 && val16 == (val32 >> 16)
14534 && val8 == (val16 >> 8))
14535 {
14536 if (info)
14537 *info = simd_immediate_info (QImode, val8);
14538 return true;
14539 }
14540 }
14541
14542 /* Try using a bit-to-bytemask. */
14543 if (which == AARCH64_CHECK_MOV)
14544 {
14545 unsigned int i;
14546 for (i = 0; i < 64; i += 8)
14547 {
14548 unsigned char byte = (val64 >> i) & 0xff;
14549 if (byte != 0 && byte != 0xff)
14550 break;
14551 }
14552 if (i == 64)
14553 {
14554 if (info)
14555 *info = simd_immediate_info (DImode, val64);
14556 return true;
14557 }
14558 }
14559 return false;
14560 }
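/* Worked example (illustrative): VAL64 == 0xff000000ff0000ff differs
   between its two 32-bit halves, so the replicated 2-byte and 4-byte
   forms above do not apply, but every byte is either 0x00 or 0xff, so
   for AARCH64_CHECK_MOV the bit-to-bytemask test accepts it as a
   64-bit immediate.  */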
14561
14562 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14563 instruction. If INFO is nonnull, use it to describe valid immediates. */
14564
14565 static bool
14566 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14567 simd_immediate_info *info)
14568 {
14569 scalar_int_mode mode = DImode;
14570 unsigned int val32 = val64 & 0xffffffff;
14571 if (val32 == (val64 >> 32))
14572 {
14573 mode = SImode;
14574 unsigned int val16 = val32 & 0xffff;
14575 if (val16 == (val32 >> 16))
14576 {
14577 mode = HImode;
14578 unsigned int val8 = val16 & 0xff;
14579 if (val8 == (val16 >> 8))
14580 mode = QImode;
14581 }
14582 }
14583 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14584 if (IN_RANGE (val, -0x80, 0x7f))
14585 {
14586 /* DUP with no shift. */
14587 if (info)
14588 *info = simd_immediate_info (mode, val);
14589 return true;
14590 }
14591 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14592 {
14593 /* DUP with LSL #8. */
14594 if (info)
14595 *info = simd_immediate_info (mode, val);
14596 return true;
14597 }
14598 if (aarch64_bitmask_imm (val64, mode))
14599 {
14600 /* DUPM. */
14601 if (info)
14602 *info = simd_immediate_info (mode, val);
14603 return true;
14604 }
14605 return false;
14606 }
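/* Worked example (illustrative): VAL64 == 0x2020202020202020 narrows all
   the way down to the QImode value 0x20, which is within the signed
   8-bit range, so it is accepted as a byte DUP of #32.  VAL64 ==
   0x1234123412341234 narrows to the HImode value 0x1234, which fits
   neither DUP form and is not a bitmask immediate, so it is rejected.  */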
14607
14608 /* Return true if OP is a valid SIMD immediate for the operation
14609 described by WHICH. If INFO is nonnull, use it to describe valid
14610 immediates. */
14611 bool
14612 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14613 enum simd_immediate_check which)
14614 {
14615 machine_mode mode = GET_MODE (op);
14616 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14617 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14618 return false;
14619
14620 scalar_mode elt_mode = GET_MODE_INNER (mode);
14621 rtx base, step;
14622 unsigned int n_elts;
14623 if (GET_CODE (op) == CONST_VECTOR
14624 && CONST_VECTOR_DUPLICATE_P (op))
14625 n_elts = CONST_VECTOR_NPATTERNS (op);
14626 else if ((vec_flags & VEC_SVE_DATA)
14627 && const_vec_series_p (op, &base, &step))
14628 {
14629 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14630 if (!aarch64_sve_index_immediate_p (base)
14631 || !aarch64_sve_index_immediate_p (step))
14632 return false;
14633
14634 if (info)
14635 *info = simd_immediate_info (elt_mode, base, step);
14636 return true;
14637 }
14638 else if (GET_CODE (op) == CONST_VECTOR
14639 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14640 /* N_ELTS set above. */;
14641 else
14642 return false;
14643
14644 /* Handle PFALSE and PTRUE. */
14645 if (vec_flags & VEC_SVE_PRED)
14646 return (op == CONST0_RTX (mode)
14647 || op == CONSTM1_RTX (mode));
14648
14649 scalar_float_mode elt_float_mode;
14650 if (n_elts == 1
14651 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14652 {
14653 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14654 if (aarch64_float_const_zero_rtx_p (elt)
14655 || aarch64_float_const_representable_p (elt))
14656 {
14657 if (info)
14658 *info = simd_immediate_info (elt_float_mode, elt);
14659 return true;
14660 }
14661 }
14662
14663 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14664 if (elt_size > 8)
14665 return false;
14666
14667 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14668
14669 /* Expand the vector constant out into a byte vector, with the least
14670 significant byte of the register first. */
14671 auto_vec<unsigned char, 16> bytes;
14672 bytes.reserve (n_elts * elt_size);
14673 for (unsigned int i = 0; i < n_elts; i++)
14674 {
14675 /* The vector is provided in gcc endian-neutral fashion.
14676 For aarch64_be Advanced SIMD, it must be laid out in the vector
14677 register in reverse order. */
14678 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14679 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14680
14681 if (elt_mode != elt_int_mode)
14682 elt = gen_lowpart (elt_int_mode, elt);
14683
14684 if (!CONST_INT_P (elt))
14685 return false;
14686
14687 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14688 for (unsigned int byte = 0; byte < elt_size; byte++)
14689 {
14690 bytes.quick_push (elt_val & 0xff);
14691 elt_val >>= BITS_PER_UNIT;
14692 }
14693 }
14694
14695 /* The immediate must repeat every eight bytes. */
14696 unsigned int nbytes = bytes.length ();
14697 for (unsigned i = 8; i < nbytes; ++i)
14698 if (bytes[i] != bytes[i - 8])
14699 return false;
14700
14701 /* Get the repeating 8-byte value as an integer. No endian correction
14702 is needed here because bytes is already in lsb-first order. */
14703 unsigned HOST_WIDE_INT val64 = 0;
14704 for (unsigned int i = 0; i < 8; i++)
14705 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14706 << (i * BITS_PER_UNIT));
14707
14708 if (vec_flags & VEC_SVE_DATA)
14709 return aarch64_sve_valid_immediate (val64, info);
14710 else
14711 return aarch64_advsimd_valid_immediate (val64, info, which);
14712 }
14713
14714 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14715 has a step in the range of an SVE INDEX immediate. Return the step
14716 if so, otherwise return null. */
14717 rtx
14718 aarch64_check_zero_based_sve_index_immediate (rtx x)
14719 {
14720 rtx base, step;
14721 if (const_vec_series_p (x, &base, &step)
14722 && base == const0_rtx
14723 && aarch64_sve_index_immediate_p (step))
14724 return step;
14725 return NULL_RTX;
14726 }
14727
14728 /* Check if immediate shift constants are within range. */
14729 bool
14730 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14731 {
14732 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14733 if (left)
14734 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14735 else
14736 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14737 }
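/* Worked example (illustrative): for 32-bit elements the immediate must
   be in [0, 31] for a left shift but in [1, 32] for a right shift,
   matching the encodings of the SHL and SSHR/USHR families.  */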
14738
14739 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14740 operation of width WIDTH at bit position POS. */
14741
14742 rtx
14743 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14744 {
14745 gcc_assert (CONST_INT_P (width));
14746 gcc_assert (CONST_INT_P (pos));
14747
14748 unsigned HOST_WIDE_INT mask
14749 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14750 return GEN_INT (mask << UINTVAL (pos));
14751 }
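/* Worked example (illustrative):
     aarch64_mask_from_zextract_ops (GEN_INT (8), GEN_INT (16))
   returns GEN_INT (0xff0000), i.e. the mask selecting bits 16..23 that
   the corresponding zero_extract reads.  */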
14752
14753 bool
14754 aarch64_mov_operand_p (rtx x, machine_mode mode)
14755 {
14756 if (GET_CODE (x) == HIGH
14757 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14758 return true;
14759
14760 if (CONST_INT_P (x))
14761 return true;
14762
14763 if (VECTOR_MODE_P (GET_MODE (x)))
14764 return aarch64_simd_valid_immediate (x, NULL);
14765
14766 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14767 return true;
14768
14769 if (aarch64_sve_cnt_immediate_p (x))
14770 return true;
14771
14772 return aarch64_classify_symbolic_expression (x)
14773 == SYMBOL_TINY_ABSOLUTE;
14774 }
14775
14776 /* Return a const_int vector of VAL. */
14777 rtx
14778 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14779 {
14780 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14781 return gen_const_vec_duplicate (mode, c);
14782 }
14783
14784 /* Check OP is a legal scalar immediate for the MOVI instruction. */
14785
14786 bool
14787 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14788 {
14789 machine_mode vmode;
14790
14791 vmode = aarch64_simd_container_mode (mode, 64);
14792 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14793 return aarch64_simd_valid_immediate (op_v, NULL);
14794 }
14795
14796 /* Construct and return a PARALLEL RTX vector with elements numbering the
14797 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14798 the vector - from the perspective of the architecture. This does not
14799 line up with GCC's perspective on lane numbers, so we end up with
14800 different masks depending on our target endian-ness. The diagram
14801 below may help. We must draw the distinction when building masks
14802 which select one half of the vector. An instruction selecting
14803 architectural low-lanes for a big-endian target, must be described using
14804 a mask selecting GCC high-lanes.
14805
14806 Big-Endian Little-Endian
14807
14808 GCC 0 1 2 3 3 2 1 0
14809 | x | x | x | x | | x | x | x | x |
14810 Architecture 3 2 1 0 3 2 1 0
14811
14812 Low Mask: { 2, 3 } { 0, 1 }
14813 High Mask: { 0, 1 } { 2, 3 }
14814
14815 MODE Is the mode of the vector and NUNITS is the number of units in it. */
14816
14817 rtx
14818 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14819 {
14820 rtvec v = rtvec_alloc (nunits / 2);
14821 int high_base = nunits / 2;
14822 int low_base = 0;
14823 int base;
14824 rtx t1;
14825 int i;
14826
14827 if (BYTES_BIG_ENDIAN)
14828 base = high ? low_base : high_base;
14829 else
14830 base = high ? high_base : low_base;
14831
14832 for (i = 0; i < nunits / 2; i++)
14833 RTVEC_ELT (v, i) = GEN_INT (base + i);
14834
14835 t1 = gen_rtx_PARALLEL (mode, v);
14836 return t1;
14837 }
14838
14839 /* Check OP for validity as a PARALLEL RTX vector with elements
14840 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14841 from the perspective of the architecture. See the diagram above
14842 aarch64_simd_vect_par_cnst_half for more details. */
14843
14844 bool
14845 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14846 bool high)
14847 {
14848 int nelts;
14849 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14850 return false;
14851
14852 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14853 HOST_WIDE_INT count_op = XVECLEN (op, 0);
14854 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14855 int i = 0;
14856
14857 if (count_op != count_ideal)
14858 return false;
14859
14860 for (i = 0; i < count_ideal; i++)
14861 {
14862 rtx elt_op = XVECEXP (op, 0, i);
14863 rtx elt_ideal = XVECEXP (ideal, 0, i);
14864
14865 if (!CONST_INT_P (elt_op)
14866 || INTVAL (elt_ideal) != INTVAL (elt_op))
14867 return false;
14868 }
14869 return true;
14870 }
14871
14872 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
14873 HIGH (exclusive). */
14874 void
14875 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14876 const_tree exp)
14877 {
14878 HOST_WIDE_INT lane;
14879 gcc_assert (CONST_INT_P (operand));
14880 lane = INTVAL (operand);
14881
14882 if (lane < low || lane >= high)
14883 {
14884 if (exp)
14885 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14886 else
14887 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14888 }
14889 }
14890
14891 /* Perform endian correction on lane number N, which indexes a vector
14892 of mode MODE, and return the result as an SImode rtx. */
14893
14894 rtx
14895 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14896 {
14897 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14898 }
14899
14900 /* Return TRUE if OP is a valid vector addressing mode. */
14901
14902 bool
14903 aarch64_simd_mem_operand_p (rtx op)
14904 {
14905 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14906 || REG_P (XEXP (op, 0)));
14907 }
14908
14909 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
14910
14911 bool
14912 aarch64_sve_ld1r_operand_p (rtx op)
14913 {
14914 struct aarch64_address_info addr;
14915 scalar_mode mode;
14916
14917 return (MEM_P (op)
14918 && is_a <scalar_mode> (GET_MODE (op), &mode)
14919 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14920 && addr.type == ADDRESS_REG_IMM
14921 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14922 }
14923
14924 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14925 The conditions for STR are the same. */
14926 bool
14927 aarch64_sve_ldr_operand_p (rtx op)
14928 {
14929 struct aarch64_address_info addr;
14930
14931 return (MEM_P (op)
14932 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14933 false, ADDR_QUERY_ANY)
14934 && addr.type == ADDRESS_REG_IMM);
14935 }
14936
14937 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14938 We need to be able to access the individual pieces, so the range
14939 is different from LD[234] and ST[234]. */
14940 bool
14941 aarch64_sve_struct_memory_operand_p (rtx op)
14942 {
14943 if (!MEM_P (op))
14944 return false;
14945
14946 machine_mode mode = GET_MODE (op);
14947 struct aarch64_address_info addr;
14948 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14949 ADDR_QUERY_ANY)
14950 || addr.type != ADDRESS_REG_IMM)
14951 return false;
14952
14953 poly_int64 first = addr.const_offset;
14954 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14955 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14956 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14957 }
14958
14959 /* Emit a register copy from operand to operand, taking care not to
14960 early-clobber source registers in the process.
14961
14962 COUNT is the number of components into which the copy needs to be
14963 decomposed. */
14964 void
14965 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14966 unsigned int count)
14967 {
14968 unsigned int i;
14969 int rdest = REGNO (operands[0]);
14970 int rsrc = REGNO (operands[1]);
14971
14972 if (!reg_overlap_mentioned_p (operands[0], operands[1])
14973 || rdest < rsrc)
14974 for (i = 0; i < count; i++)
14975 emit_move_insn (gen_rtx_REG (mode, rdest + i),
14976 gen_rtx_REG (mode, rsrc + i));
14977 else
14978 for (i = 0; i < count; i++)
14979 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14980 gen_rtx_REG (mode, rsrc + count - i - 1));
14981 }
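/* Worked example (illustrative): an OImode copy whose destination pair
   starts one vector register above its source (say V2/V3 written from
   V1/V2) overlaps with rdest > rsrc, so the second loop above copies the
   highest component first (V3 from V2, then V2 from V1) and never reads
   a register that has already been overwritten.  */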
14982
14983 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14984 one of VSTRUCT modes: OI, CI, or XI. */
14985 int
14986 aarch64_simd_attr_length_rglist (machine_mode mode)
14987 {
14988 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
14989 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14990 }
14991
14992 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
14993 alignment of a vector to 128 bits. SVE predicates have an alignment of
14994 16 bits. */
14995 static HOST_WIDE_INT
14996 aarch64_simd_vector_alignment (const_tree type)
14997 {
14998 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14999 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15000 be set for non-predicate vectors of booleans. Modes are the most
15001 direct way we have of identifying real SVE predicate types. */
15002 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15003 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15004 }
15005
15006 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15007 static poly_uint64
15008 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15009 {
15010 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15011 {
15012 /* If the length of the vector is fixed, try to align to that length,
15013 otherwise don't try to align at all. */
15014 HOST_WIDE_INT result;
15015 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15016 result = TYPE_ALIGN (TREE_TYPE (type));
15017 return result;
15018 }
15019 return TYPE_ALIGN (type);
15020 }
15021
15022 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15023 static bool
15024 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15025 {
15026 if (is_packed)
15027 return false;
15028
15029 /* For fixed-length vectors, check that the vectorizer will aim for
15030 full-vector alignment. This isn't true for generic GCC vectors
15031 that are wider than the ABI maximum of 128 bits. */
15032 poly_uint64 preferred_alignment =
15033 aarch64_vectorize_preferred_vector_alignment (type);
15034 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15035 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15036 preferred_alignment))
15037 return false;
15038
15039 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15040 return true;
15041 }
15042
15043 /* Return true if the vector misalignment factor is supported by the
15044 target. */
15045 static bool
15046 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15047 const_tree type, int misalignment,
15048 bool is_packed)
15049 {
15050 if (TARGET_SIMD && STRICT_ALIGNMENT)
15051 {
15052 /* Return false if the movmisalign pattern is not supported for this mode. */
15053 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15054 return false;
15055
15056 /* Misalignment factor is unknown at compile time. */
15057 if (misalignment == -1)
15058 return false;
15059 }
15060 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15061 is_packed);
15062 }
15063
15064 /* If VALS is a vector constant that can be loaded into a register
15065 using DUP, generate instructions to do so and return an RTX to
15066 assign to the register. Otherwise return NULL_RTX. */
15067 static rtx
15068 aarch64_simd_dup_constant (rtx vals)
15069 {
15070 machine_mode mode = GET_MODE (vals);
15071 machine_mode inner_mode = GET_MODE_INNER (mode);
15072 rtx x;
15073
15074 if (!const_vec_duplicate_p (vals, &x))
15075 return NULL_RTX;
15076
15077 /* We can load this constant by using DUP and a constant in a
15078 single ARM register. This will be cheaper than a vector
15079 load. */
15080 x = copy_to_mode_reg (inner_mode, x);
15081 return gen_vec_duplicate (mode, x);
15082 }
15083
15084
15085 /* Generate code to load VALS, which is a PARALLEL containing only
15086 constants (for vec_init) or CONST_VECTOR, efficiently into a
15087 register. Returns an RTX to copy into the register, or NULL_RTX
15088 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15089 static rtx
15090 aarch64_simd_make_constant (rtx vals)
15091 {
15092 machine_mode mode = GET_MODE (vals);
15093 rtx const_dup;
15094 rtx const_vec = NULL_RTX;
15095 int n_const = 0;
15096 int i;
15097
15098 if (GET_CODE (vals) == CONST_VECTOR)
15099 const_vec = vals;
15100 else if (GET_CODE (vals) == PARALLEL)
15101 {
15102 /* A CONST_VECTOR must contain only CONST_INTs and
15103 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15104 Only store valid constants in a CONST_VECTOR. */
15105 int n_elts = XVECLEN (vals, 0);
15106 for (i = 0; i < n_elts; ++i)
15107 {
15108 rtx x = XVECEXP (vals, 0, i);
15109 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15110 n_const++;
15111 }
15112 if (n_const == n_elts)
15113 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15114 }
15115 else
15116 gcc_unreachable ();
15117
15118 if (const_vec != NULL_RTX
15119 && aarch64_simd_valid_immediate (const_vec, NULL))
15120 /* Load using MOVI/MVNI. */
15121 return const_vec;
15122 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15123 /* Loaded using DUP. */
15124 return const_dup;
15125 else if (const_vec != NULL_RTX)
15126 /* Load from constant pool. We cannot take advantage of single-cycle
15127 LD1 because we need a PC-relative addressing mode. */
15128 return const_vec;
15129 else
15130 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15131 We cannot construct an initializer. */
15132 return NULL_RTX;
15133 }
15134
15135 /* Expand a vector initialisation sequence, such that TARGET is
15136 initialised to contain VALS. */
15137
15138 void
15139 aarch64_expand_vector_init (rtx target, rtx vals)
15140 {
15141 machine_mode mode = GET_MODE (target);
15142 scalar_mode inner_mode = GET_MODE_INNER (mode);
15143 /* The number of vector elements. */
15144 int n_elts = XVECLEN (vals, 0);
15145 /* The number of vector elements which are not constant. */
15146 int n_var = 0;
15147 rtx any_const = NULL_RTX;
15148 /* The first element of vals. */
15149 rtx v0 = XVECEXP (vals, 0, 0);
15150 bool all_same = true;
15151
15152 /* This is a special vec_init<M><N> where N is not an element mode but a
15153 vector mode with half the elements of M. We expect to find two entries
15154 of mode N in VALS and we must put their concatenation into TARGET. */
15155 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15156 {
15157 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15158 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15159 rtx lo = XVECEXP (vals, 0, 0);
15160 rtx hi = XVECEXP (vals, 0, 1);
15161 machine_mode narrow_mode = GET_MODE (lo);
15162 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15163 gcc_assert (narrow_mode == GET_MODE (hi));
15164
15165 /* When we want to concatenate a half-width vector with zeroes we can
15166 use the aarch64_combinez[_be] patterns. Just make sure that the
15167 zeroes are in the right half. */
15168 if (BYTES_BIG_ENDIAN
15169 && aarch64_simd_imm_zero (lo, narrow_mode)
15170 && general_operand (hi, narrow_mode))
15171 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15172 else if (!BYTES_BIG_ENDIAN
15173 && aarch64_simd_imm_zero (hi, narrow_mode)
15174 && general_operand (lo, narrow_mode))
15175 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15176 else
15177 {
15178 /* Else create the two half-width registers and combine them. */
15179 if (!REG_P (lo))
15180 lo = force_reg (GET_MODE (lo), lo);
15181 if (!REG_P (hi))
15182 hi = force_reg (GET_MODE (hi), hi);
15183
15184 if (BYTES_BIG_ENDIAN)
15185 std::swap (lo, hi);
15186 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15187 }
15188 return;
15189 }
15190
15191 /* Count the number of variable elements to initialise. */
15192 for (int i = 0; i < n_elts; ++i)
15193 {
15194 rtx x = XVECEXP (vals, 0, i);
15195 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15196 ++n_var;
15197 else
15198 any_const = x;
15199
15200 all_same &= rtx_equal_p (x, v0);
15201 }
15202
15203 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15204 how best to handle this. */
15205 if (n_var == 0)
15206 {
15207 rtx constant = aarch64_simd_make_constant (vals);
15208 if (constant != NULL_RTX)
15209 {
15210 emit_move_insn (target, constant);
15211 return;
15212 }
15213 }
15214
15215 /* Splat a single non-constant element if we can. */
15216 if (all_same)
15217 {
15218 rtx x = copy_to_mode_reg (inner_mode, v0);
15219 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15220 return;
15221 }
15222
15223 enum insn_code icode = optab_handler (vec_set_optab, mode);
15224 gcc_assert (icode != CODE_FOR_nothing);
15225
15226 /* If there are only variable elements, try to optimize
15227 the insertion using dup for the most common element
15228 followed by insertions. */
15229
15230 /* The algorithm will fill matches[*][0] with the earliest matching element,
15231 and matches[X][1] with the count of duplicate elements (if X is the
15232 earliest element which has duplicates). */
15233
15234 if (n_var == n_elts && n_elts <= 16)
15235 {
15236 int matches[16][2] = {0};
15237 for (int i = 0; i < n_elts; i++)
15238 {
15239 for (int j = 0; j <= i; j++)
15240 {
15241 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15242 {
15243 matches[i][0] = j;
15244 matches[j][1]++;
15245 break;
15246 }
15247 }
15248 }
15249 int maxelement = 0;
15250 int maxv = 0;
15251 for (int i = 0; i < n_elts; i++)
15252 if (matches[i][1] > maxv)
15253 {
15254 maxelement = i;
15255 maxv = matches[i][1];
15256 }
15257
15258 /* Create a duplicate of the most common element, unless all elements
15259 are equally useless to us, in which case just immediately set the
15260 vector register using the first element. */
15261
15262 if (maxv == 1)
15263 {
15264 /* For vectors of two 64-bit elements, we can do even better. */
15265 if (n_elts == 2
15266 && (inner_mode == E_DImode
15267 || inner_mode == E_DFmode))
15268
15269 {
15270 rtx x0 = XVECEXP (vals, 0, 0);
15271 rtx x1 = XVECEXP (vals, 0, 1);
15272 /* Combine can pick up this case, but handling it directly
15273 here leaves clearer RTL.
15274
15275 This is load_pair_lanes<mode>, and also gives us a clean-up
15276 for store_pair_lanes<mode>. */
15277 if (memory_operand (x0, inner_mode)
15278 && memory_operand (x1, inner_mode)
15279 && !STRICT_ALIGNMENT
15280 && rtx_equal_p (XEXP (x1, 0),
15281 plus_constant (Pmode,
15282 XEXP (x0, 0),
15283 GET_MODE_SIZE (inner_mode))))
15284 {
15285 rtx t;
15286 if (inner_mode == DFmode)
15287 t = gen_load_pair_lanesdf (target, x0, x1);
15288 else
15289 t = gen_load_pair_lanesdi (target, x0, x1);
15290 emit_insn (t);
15291 return;
15292 }
15293 }
15294 /* The subreg-move sequence below will move into lane zero of the
15295 vector register. For big-endian we want that position to hold
15296 the last element of VALS. */
15297 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15298 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15299 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15300 }
15301 else
15302 {
15303 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15304 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15305 }
15306
15307 /* Insert the rest. */
15308 for (int i = 0; i < n_elts; i++)
15309 {
15310 rtx x = XVECEXP (vals, 0, i);
15311 if (matches[i][0] == maxelement)
15312 continue;
15313 x = copy_to_mode_reg (inner_mode, x);
15314 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15315 }
15316 return;
15317 }
15318
15319 /* Initialise a vector which is part-variable. We want to first try
15320 to build those lanes which are constant in the most efficient way we
15321 can. */
15322 if (n_var != n_elts)
15323 {
15324 rtx copy = copy_rtx (vals);
15325
15326 /* Load constant part of vector. We really don't care what goes into the
15327 parts we will overwrite, but we're more likely to be able to load the
15328 constant efficiently if it has fewer, larger, repeating parts
15329 (see aarch64_simd_valid_immediate). */
15330 for (int i = 0; i < n_elts; i++)
15331 {
15332 rtx x = XVECEXP (vals, 0, i);
15333 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15334 continue;
15335 rtx subst = any_const;
15336 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15337 {
15338 /* Look in the copied vector, as more elements are const. */
15339 rtx test = XVECEXP (copy, 0, i ^ bit);
15340 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15341 {
15342 subst = test;
15343 break;
15344 }
15345 }
15346 XVECEXP (copy, 0, i) = subst;
15347 }
15348 aarch64_expand_vector_init (target, copy);
15349 }
15350
15351 /* Insert the variable lanes directly. */
15352 for (int i = 0; i < n_elts; i++)
15353 {
15354 rtx x = XVECEXP (vals, 0, i);
15355 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15356 continue;
15357 x = copy_to_mode_reg (inner_mode, x);
15358 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15359 }
15360 }
15361
15362 /* Emit RTL corresponding to:
15363 insr TARGET, ELEM. */
15364
15365 static void
15366 emit_insr (rtx target, rtx elem)
15367 {
15368 machine_mode mode = GET_MODE (target);
15369 scalar_mode elem_mode = GET_MODE_INNER (mode);
15370 elem = force_reg (elem_mode, elem);
15371
15372 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15373 gcc_assert (icode != CODE_FOR_nothing);
15374 emit_insn (GEN_FCN (icode) (target, target, elem));
15375 }
15376
15377 /* Subroutine of aarch64_sve_expand_vector_init for handling
15378 trailing constants.
15379 This function works as follows:
15380 (a) Create a new vector consisting of trailing constants.
15381 (b) Initialize TARGET with the constant vector using emit_move_insn.
15382 (c) Insert remaining elements in TARGET using insr.
15383 NELTS is the total number of elements in the original vector, while
15384 NELTS_REQD is the number of elements that are actually
15385 significant.
15386
15387 ??? The heuristic used is to do the above only if the number of constants
15388 is at least half the total number of elements. May need fine-tuning. */
15389
15390 static bool
15391 aarch64_sve_expand_vector_init_handle_trailing_constants
15392 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15393 {
15394 machine_mode mode = GET_MODE (target);
15395 scalar_mode elem_mode = GET_MODE_INNER (mode);
15396 int n_trailing_constants = 0;
15397
15398 for (int i = nelts_reqd - 1;
15399 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15400 i--)
15401 n_trailing_constants++;
15402
15403 if (n_trailing_constants >= nelts_reqd / 2)
15404 {
15405 rtx_vector_builder v (mode, 1, nelts);
15406 for (int i = 0; i < nelts; i++)
15407 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15408 rtx const_vec = v.build ();
15409 emit_move_insn (target, const_vec);
15410
15411 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15412 emit_insr (target, builder.elt (i));
15413
15414 return true;
15415 }
15416
15417 return false;
15418 }
15419
15420 /* Subroutine of aarch64_sve_expand_vector_init.
15421 Works as follows:
15422 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15423 (b) Skip trailing elements from BUILDER, which are the same as
15424 element NELTS_REQD - 1.
15425 (c) Insert earlier elements in reverse order in TARGET using insr. */
15426
15427 static void
15428 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15429 const rtx_vector_builder &builder,
15430 int nelts_reqd)
15431 {
15432 machine_mode mode = GET_MODE (target);
15433 scalar_mode elem_mode = GET_MODE_INNER (mode);
15434
15435 struct expand_operand ops[2];
15436 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15437 gcc_assert (icode != CODE_FOR_nothing);
15438
15439 create_output_operand (&ops[0], target, mode);
15440 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15441 expand_insn (icode, 2, ops);
15442
15443 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15444 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15445 emit_insr (target, builder.elt (i));
15446 }
15447
15448 /* Subroutine of aarch64_sve_expand_vector_init to handle case
15449 when all trailing elements of builder are same.
15450 This works as follows:
15451 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15452 (b) Insert remaining elements in TARGET using insr.
15453
15454 ??? The heuristic used is to do the above if the number of identical
15455 trailing elements is at least 3/4 of the total number of elements,
15456 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
15457
15458 static bool
15459 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15460 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15461 {
15462 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15463 if (ndups >= (3 * nelts_reqd) / 4)
15464 {
15465 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15466 nelts_reqd - ndups + 1);
15467 return true;
15468 }
15469
15470 return false;
15471 }
15472
15473 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15474 of elements in BUILDER.
15475
15476 The function tries to initialize TARGET from BUILDER if it fits one
15477 of the special cases outlined below.
15478
15479 Failing that, the function divides BUILDER into two sub-vectors:
15480 v_even = even elements of BUILDER;
15481 v_odd = odd elements of BUILDER;
15482
15483 and recursively calls itself with v_even and v_odd.
15484
15485 if (recursive call succeeded for v_even or v_odd)
15486 TARGET = zip (v_even, v_odd)
15487
15488 The function returns true if it managed to build TARGET from BUILDER
15489 with one of the special cases, false otherwise.
15490
15491 Example: {a, 1, b, 2, c, 3, d, 4}
15492
15493 The vector gets divided into:
15494 v_even = {a, b, c, d}
15495 v_odd = {1, 2, 3, 4}
15496
15497 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15498 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15499
15500 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15501 4 elements, so we construct tmp1 from v_even using insr:
15502 tmp1 = dup(d)
15503 insr tmp1, c
15504 insr tmp1, b
15505 insr tmp1, a
15506
15507 And finally:
15508 TARGET = zip (tmp1, tmp2)
15509 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15510
15511 static bool
15512 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15513 int nelts, int nelts_reqd)
15514 {
15515 machine_mode mode = GET_MODE (target);
15516
15517 /* Case 1: Vector contains trailing constants. */
15518
15519 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15520 (target, builder, nelts, nelts_reqd))
15521 return true;
15522
15523 /* Case 2: Vector contains leading constants. */
15524
15525 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15526 for (int i = 0; i < nelts_reqd; i++)
15527 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15528 rev_builder.finalize ();
15529
15530 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15531 (target, rev_builder, nelts, nelts_reqd))
15532 {
15533 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15534 return true;
15535 }
15536
15537 /* Case 3: Vector contains trailing same element. */
15538
15539 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15540 (target, builder, nelts_reqd))
15541 return true;
15542
15543 /* Case 4: Vector contains leading same element. */
15544
15545 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
15546 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
15547 {
15548 emit_insn (gen_aarch64_sve_rev (mode, target, target));
15549 return true;
15550 }
15551
15552 /* Avoid recursing below 4-elements.
15553 ??? The threshold 4 may need fine-tuning. */
15554
15555 if (nelts_reqd <= 4)
15556 return false;
15557
15558 rtx_vector_builder v_even (mode, 1, nelts);
15559 rtx_vector_builder v_odd (mode, 1, nelts);
15560
15561 for (int i = 0; i < nelts * 2; i += 2)
15562 {
15563 v_even.quick_push (builder.elt (i));
15564 v_odd.quick_push (builder.elt (i + 1));
15565 }
15566
15567 v_even.finalize ();
15568 v_odd.finalize ();
15569
15570 rtx tmp1 = gen_reg_rtx (mode);
15571 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
15572 nelts, nelts_reqd / 2);
15573
15574 rtx tmp2 = gen_reg_rtx (mode);
15575 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
15576 nelts, nelts_reqd / 2);
15577
15578 if (!did_even_p && !did_odd_p)
15579 return false;
15580
15581 /* Initialize whichever of v_even and v_odd did not match any of the
15582 special cases using INSR, then zip v_even and v_odd. */
15583
15584 if (!did_even_p)
15585 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
15586
15587 if (!did_odd_p)
15588 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
15589
15590 rtvec v = gen_rtvec (2, tmp1, tmp2);
15591 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
15592 return true;
15593 }
15594
15595 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
15596
15597 void
15598 aarch64_sve_expand_vector_init (rtx target, rtx vals)
15599 {
15600 machine_mode mode = GET_MODE (target);
15601 int nelts = XVECLEN (vals, 0);
15602
15603 rtx_vector_builder v (mode, 1, nelts);
15604 for (int i = 0; i < nelts; i++)
15605 v.quick_push (XVECEXP (vals, 0, i));
15606 v.finalize ();
15607
15608 /* If neither sub-vector of v could be initialized specially,
15609 then use INSR to insert all elements from v into TARGET.
15610 ??? This might not be optimal for vectors with large
15611 initializers like 16-element or above.
15612 For nelts < 4, it probably isn't useful to handle specially. */
15613
15614 if (nelts < 4
15615 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
15616 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
15617 }
15618
15619 static unsigned HOST_WIDE_INT
15620 aarch64_shift_truncation_mask (machine_mode mode)
15621 {
15622 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15623 return 0;
15624 return GET_MODE_UNIT_BITSIZE (mode) - 1;
15625 }
15626
15627 /* Select a format to encode pointers in exception handling data. */
15628 int
15629 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15630 {
15631 int type;
15632 switch (aarch64_cmodel)
15633 {
15634 case AARCH64_CMODEL_TINY:
15635 case AARCH64_CMODEL_TINY_PIC:
15636 case AARCH64_CMODEL_SMALL:
15637 case AARCH64_CMODEL_SMALL_PIC:
15638 case AARCH64_CMODEL_SMALL_SPIC:
15639 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
15640 for everything. */
15641 type = DW_EH_PE_sdata4;
15642 break;
15643 default:
15644 /* No assumptions here. 8-byte relocs required. */
15645 type = DW_EH_PE_sdata8;
15646 break;
15647 }
15648 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15649 }
15650
15651 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
15652
15653 static void
15654 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15655 {
15656 if (aarch64_simd_decl_p (decl))
15657 {
15658 fprintf (stream, "\t.variant_pcs\t");
15659 assemble_name (stream, name);
15660 fprintf (stream, "\n");
15661 }
15662 }
15663
15664 /* The last .arch and .tune assembly strings that we printed. */
15665 static std::string aarch64_last_printed_arch_string;
15666 static std::string aarch64_last_printed_tune_string;
15667
15668 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
15669 by the function fndecl. */
15670
15671 void
15672 aarch64_declare_function_name (FILE *stream, const char* name,
15673 tree fndecl)
15674 {
15675 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15676
15677 struct cl_target_option *targ_options;
15678 if (target_parts)
15679 targ_options = TREE_TARGET_OPTION (target_parts);
15680 else
15681 targ_options = TREE_TARGET_OPTION (target_option_current_node);
15682 gcc_assert (targ_options);
15683
15684 const struct processor *this_arch
15685 = aarch64_get_arch (targ_options->x_explicit_arch);
15686
15687 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
15688 std::string extension
15689 = aarch64_get_extension_string_for_isa_flags (isa_flags,
15690 this_arch->flags);
15691 /* Only update the assembler .arch string if it is distinct from the last
15692 such string we printed. */
15693 std::string to_print = this_arch->name + extension;
15694 if (to_print != aarch64_last_printed_arch_string)
15695 {
15696 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15697 aarch64_last_printed_arch_string = to_print;
15698 }
15699
15700 /* Print the cpu name we're tuning for in the comments; this might be
15701 useful to readers of the generated asm. Do it only when it changes
15702 from function to function and verbose assembly is requested. */
15703 const struct processor *this_tune
15704 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15705
15706 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15707 {
15708 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15709 this_tune->name);
15710 aarch64_last_printed_tune_string = this_tune->name;
15711 }
15712
15713 aarch64_asm_output_variant_pcs (stream, fndecl, name);
15714
15715 /* Don't forget the type directive for ELF. */
15716 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15717 ASM_OUTPUT_LABEL (stream, name);
15718 }
15719
15720 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
15721
15722 void
15723 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15724 {
15725 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15726 const char *value = IDENTIFIER_POINTER (target);
15727 aarch64_asm_output_variant_pcs (stream, decl, name);
15728 ASM_OUTPUT_DEF (stream, name, value);
15729 }
15730
15731 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
15732 function symbol references. */
15733
15734 void
15735 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15736 {
15737 default_elf_asm_output_external (stream, decl, name);
15738 aarch64_asm_output_variant_pcs (stream, decl, name);
15739 }
15740
15741 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
15742 Used to output the .cfi_b_key_frame directive when signing the current
15743 function with the B key. */
15744
15745 void
15746 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
15747 {
15748 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
15749 && aarch64_ra_sign_key == AARCH64_KEY_B)
15750 asm_fprintf (f, "\t.cfi_b_key_frame\n");
15751 }
15752
15753 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
15754
15755 static void
15756 aarch64_start_file (void)
15757 {
15758 struct cl_target_option *default_options
15759 = TREE_TARGET_OPTION (target_option_default_node);
15760
15761 const struct processor *default_arch
15762 = aarch64_get_arch (default_options->x_explicit_arch);
15763 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
15764 std::string extension
15765 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15766 default_arch->flags);
15767
15768 aarch64_last_printed_arch_string = default_arch->name + extension;
15769 aarch64_last_printed_tune_string = "";
15770 asm_fprintf (asm_out_file, "\t.arch %s\n",
15771 aarch64_last_printed_arch_string.c_str ());
15772
15773 default_file_start ();
15774 }
15775
15776 /* Emit load exclusive. */
15777
15778 static void
15779 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15780 rtx mem, rtx model_rtx)
15781 {
15782 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15783 }
15784
15785 /* Emit store exclusive. */
15786
15787 static void
15788 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15789 rtx rval, rtx mem, rtx model_rtx)
15790 {
15791 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15792 }
15793
15794 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
15795
15796 static void
15797 aarch64_emit_unlikely_jump (rtx insn)
15798 {
15799 rtx_insn *jump = emit_jump_insn (insn);
15800 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15801 }
15802
15803 /* Expand a compare and swap pattern. */
15804
15805 void
15806 aarch64_expand_compare_and_swap (rtx operands[])
15807 {
15808 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15809 machine_mode mode, r_mode;
15810
15811 bval = operands[0];
15812 rval = operands[1];
15813 mem = operands[2];
15814 oldval = operands[3];
15815 newval = operands[4];
15816 is_weak = operands[5];
15817 mod_s = operands[6];
15818 mod_f = operands[7];
15819 mode = GET_MODE (mem);
15820
15821 /* Normally the succ memory model must be stronger than fail, but in the
15822 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15823 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
15824 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15825 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15826 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15827
15828 r_mode = mode;
15829 if (mode == QImode || mode == HImode)
15830 {
15831 r_mode = SImode;
15832 rval = gen_reg_rtx (r_mode);
15833 }
15834
15835 if (TARGET_LSE)
15836 {
15837 /* The CAS insn requires oldval and rval overlap, but we need to
15838 have a copy of oldval saved across the operation to tell if
15839 the operation is successful. */
15840 if (reg_overlap_mentioned_p (rval, oldval))
15841 rval = copy_to_mode_reg (r_mode, oldval);
15842 else
15843 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15844
15845 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15846 newval, mod_s));
15847 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15848 }
15849 else
15850 {
15851 /* The oldval predicate varies by mode. Test it and force to reg. */
15852 insn_code code = code_for_aarch64_compare_and_swap (mode);
15853 if (!insn_data[code].operand[2].predicate (oldval, mode))
15854 oldval = force_reg (mode, oldval);
15855
15856 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15857 is_weak, mod_s, mod_f));
15858 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15859 }
15860
15861 if (r_mode != mode)
15862 rval = gen_lowpart (mode, rval);
15863 emit_move_insn (operands[1], rval);
15864
15865 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15866 emit_insn (gen_rtx_SET (bval, x));
15867 }
15868
15869 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
15870 sequence implementing an atomic operation. */
15871
15872 static void
15873 aarch64_emit_post_barrier (enum memmodel model)
15874 {
15875 const enum memmodel base_model = memmodel_base (model);
15876
15877 if (is_mm_sync (model)
15878 && (base_model == MEMMODEL_ACQUIRE
15879 || base_model == MEMMODEL_ACQ_REL
15880 || base_model == MEMMODEL_SEQ_CST))
15881 {
15882 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15883 }
15884 }
15885
15886 /* Split a compare and swap pattern. */
15887
15888 void
15889 aarch64_split_compare_and_swap (rtx operands[])
15890 {
15891 rtx rval, mem, oldval, newval, scratch;
15892 machine_mode mode;
15893 bool is_weak;
15894 rtx_code_label *label1, *label2;
15895 rtx x, cond;
15896 enum memmodel model;
15897 rtx model_rtx;
15898
15899 rval = operands[0];
15900 mem = operands[1];
15901 oldval = operands[2];
15902 newval = operands[3];
15903 is_weak = (operands[4] != const0_rtx);
15904 model_rtx = operands[5];
15905 scratch = operands[7];
15906 mode = GET_MODE (mem);
15907 model = memmodel_from_int (INTVAL (model_rtx));
15908
15909 /* When OLDVAL is zero and we want the strong version we can emit a tighter
15910 loop:
15911 .label1:
15912 LD[A]XR rval, [mem]
15913 CBNZ rval, .label2
15914 ST[L]XR scratch, newval, [mem]
15915 CBNZ scratch, .label1
15916 .label2:
15917 CMP rval, 0. */
15918 bool strong_zero_p = !is_weak && oldval == const0_rtx;
15919
15920 label1 = NULL;
15921 if (!is_weak)
15922 {
15923 label1 = gen_label_rtx ();
15924 emit_label (label1);
15925 }
15926 label2 = gen_label_rtx ();
15927
15928 /* The initial load can be relaxed for a __sync operation since a final
15929 barrier will be emitted to stop code hoisting. */
15930 if (is_mm_sync (model))
15931 aarch64_emit_load_exclusive (mode, rval, mem,
15932 GEN_INT (MEMMODEL_RELAXED));
15933 else
15934 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15935
15936 if (strong_zero_p)
15937 {
15938 if (aarch64_track_speculation)
15939 {
15940 /* Emit an explicit compare instruction, so that we can correctly
15941 track the condition codes. */
15942 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15943 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15944 }
15945 else
15946 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15947
15948 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15949 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15950 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15951 }
15952 else
15953 {
15954 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15955 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15956 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15957 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15958 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15959 }
15960
15961 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15962
15963 if (!is_weak)
15964 {
15965 if (aarch64_track_speculation)
15966 {
15967 /* Emit an explicit compare instruction, so that we can correctly
15968 track the condition codes. */
15969 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15970 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15971 }
15972 else
15973 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15974
15975 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15976 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15977 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15978 }
15979 else
15980 {
15981 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15982 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15983 emit_insn (gen_rtx_SET (cond, x));
15984 }
15985
15986 emit_label (label2);
15987 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
15988 to set the condition flags. If this is not used, it will be removed by
15989 later passes. */
15990 if (strong_zero_p)
15991 {
15992 cond = gen_rtx_REG (CCmode, CC_REGNUM);
15993 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15994 emit_insn (gen_rtx_SET (cond, x));
15995 }
15996 /* Emit any final barrier needed for a __sync operation. */
15997 if (is_mm_sync (model))
15998 aarch64_emit_post_barrier (model);
15999 }
16000
16001 /* Split an atomic operation. */
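/* The split below forms the usual load-exclusive/store-exclusive retry loop,
roughly:
.label:
LD[A]XR old_out, [mem]
<op> new_out, old_out, value
ST[L]XR cond, new_out, [mem]
CBNZ cond, .label
followed by any final barrier required for a __sync operation. */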
16002
16003 void
16004 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16005 rtx value, rtx model_rtx, rtx cond)
16006 {
16007 machine_mode mode = GET_MODE (mem);
16008 machine_mode wmode = (mode == DImode ? DImode : SImode);
16009 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16010 const bool is_sync = is_mm_sync (model);
16011 rtx_code_label *label;
16012 rtx x;
16013
16014 /* Split the atomic operation into a sequence. */
16015 label = gen_label_rtx ();
16016 emit_label (label);
16017
16018 if (new_out)
16019 new_out = gen_lowpart (wmode, new_out);
16020 if (old_out)
16021 old_out = gen_lowpart (wmode, old_out);
16022 else
16023 old_out = new_out;
16024 value = simplify_gen_subreg (wmode, value, mode, 0);
16025
16026 /* The initial load can be relaxed for a __sync operation since a final
16027 barrier will be emitted to stop code hoisting. */
16028 if (is_sync)
16029 aarch64_emit_load_exclusive (mode, old_out, mem,
16030 GEN_INT (MEMMODEL_RELAXED));
16031 else
16032 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16033
16034 switch (code)
16035 {
16036 case SET:
16037 new_out = value;
16038 break;
16039
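/* The NOT code implements GCC's atomic NAND: new_out = ~(old_out & value). */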
16040 case NOT:
16041 x = gen_rtx_AND (wmode, old_out, value);
16042 emit_insn (gen_rtx_SET (new_out, x));
16043 x = gen_rtx_NOT (wmode, new_out);
16044 emit_insn (gen_rtx_SET (new_out, x));
16045 break;
16046
16047 case MINUS:
16048 if (CONST_INT_P (value))
16049 {
16050 value = GEN_INT (-INTVAL (value));
16051 code = PLUS;
16052 }
16053 /* Fall through. */
16054
16055 default:
16056 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16057 emit_insn (gen_rtx_SET (new_out, x));
16058 break;
16059 }
16060
16061 aarch64_emit_store_exclusive (mode, cond, mem,
16062 gen_lowpart (mode, new_out), model_rtx);
16063
16064 if (aarch64_track_speculation)
16065 {
16066 /* Emit an explicit compare instruction, so that we can correctly
16067 track the condition codes. */
16068 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16069 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16070 }
16071 else
16072 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16073
16074 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16075 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16076 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16077
16078 /* Emit any final barrier needed for a __sync operation. */
16079 if (is_sync)
16080 aarch64_emit_post_barrier (model);
16081 }
16082
16083 static void
16084 aarch64_init_libfuncs (void)
16085 {
16086 /* Half-precision float operations. The compiler handles all operations
16087 with NULL libfuncs by converting to SFmode. */
16088
16089 /* Conversions. */
16090 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16091 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16092
16093 /* Arithmetic. */
16094 set_optab_libfunc (add_optab, HFmode, NULL);
16095 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16096 set_optab_libfunc (smul_optab, HFmode, NULL);
16097 set_optab_libfunc (neg_optab, HFmode, NULL);
16098 set_optab_libfunc (sub_optab, HFmode, NULL);
16099
16100 /* Comparisons. */
16101 set_optab_libfunc (eq_optab, HFmode, NULL);
16102 set_optab_libfunc (ne_optab, HFmode, NULL);
16103 set_optab_libfunc (lt_optab, HFmode, NULL);
16104 set_optab_libfunc (le_optab, HFmode, NULL);
16105 set_optab_libfunc (ge_optab, HFmode, NULL);
16106 set_optab_libfunc (gt_optab, HFmode, NULL);
16107 set_optab_libfunc (unord_optab, HFmode, NULL);
16108 }
16109
16110 /* Target hook for c_mode_for_suffix. */
16111 static machine_mode
16112 aarch64_c_mode_for_suffix (char suffix)
16113 {
16114 if (suffix == 'q')
16115 return TFmode;
16116
16117 return VOIDmode;
16118 }
16119
16120 /* We can only represent floating point constants which will fit in
16121 "quarter-precision" values. These values are characterised by
16122 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16123 by:
16124
16125 (-1)^s * (n/16) * 2^r
16126
16127 Where:
16128 's' is the sign bit.
16129 'n' is an integer in the range 16 <= n <= 31.
16130 'r' is an integer in the range -3 <= r <= 4. */
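/* For example, under this scheme 1.0 is (+1) * (16/16) * 2^0 and 1.25 is
(+1) * (20/16) * 2^0; the largest representable magnitude is
31.0 = (31/16) * 2^4 and the smallest is 0.125 = (16/16) * 2^-3. */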
16131
16132 /* Return true iff X can be represented as a quarter-precision
16133 floating-point immediate operand. Note, we cannot represent 0.0. */
16134 bool
16135 aarch64_float_const_representable_p (rtx x)
16136 {
16137 /* This represents our current view of how many bits
16138 make up the mantissa. */
16139 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16140 int exponent;
16141 unsigned HOST_WIDE_INT mantissa, mask;
16142 REAL_VALUE_TYPE r, m;
16143 bool fail;
16144
16145 if (!CONST_DOUBLE_P (x))
16146 return false;
16147
16148 if (GET_MODE (x) == VOIDmode
16149 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16150 return false;
16151
16152 r = *CONST_DOUBLE_REAL_VALUE (x);
16153
16154 /* We cannot represent infinities, NaNs or +/-zero. We won't
16155 know if we have +zero until we analyse the mantissa, but we
16156 can reject the other invalid values. */
16157 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16158 || REAL_VALUE_MINUS_ZERO (r))
16159 return false;
16160
16161 /* Extract exponent. */
16162 r = real_value_abs (&r);
16163 exponent = REAL_EXP (&r);
16164
16165 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16166 highest (sign) bit, with a fixed binary point at bit point_pos.
16167 The low half of W holds the low part of the mantissa, the high half the high part.
16168 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16169 bits for the mantissa, this can fail (low bits will be lost). */
16170 real_ldexp (&m, &r, point_pos - exponent);
16171 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16172
16173 /* If the low part of the mantissa has bits set we cannot represent
16174 the value. */
16175 if (w.ulow () != 0)
16176 return false;
16177 /* We have rejected the lower HOST_WIDE_INT, so update our
16178 understanding of how many bits lie in the mantissa and
16179 look only at the high HOST_WIDE_INT. */
16180 mantissa = w.elt (1);
16181 point_pos -= HOST_BITS_PER_WIDE_INT;
16182
16183 /* We can only represent values with a mantissa of the form 1.xxxx. */
16184 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16185 if ((mantissa & mask) != 0)
16186 return false;
16187
16188 /* Having filtered unrepresentable values, we may now remove all
16189 but the highest 5 bits. */
16190 mantissa >>= point_pos - 5;
16191
16192 /* We cannot represent the value 0.0, so reject it. This is handled
16193 elsewhere. */
16194 if (mantissa == 0)
16195 return false;
16196
16197 /* Then, as bit 4 is always set, we can mask it off, leaving
16198 the mantissa in the range [0, 15]. */
16199 mantissa &= ~(1 << 4);
16200 gcc_assert (mantissa <= 15);
16201
16202 /* GCC internally does not use IEEE754-like encoding (where normalized
16203 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
16204 Our mantissa values are shifted 4 places to the left relative to
16205 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16206 by 5 places to correct for GCC's representation. */
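/* For instance, 1.0 is stored internally as 0.5 * 2^1, so REAL_EXP returns 1
and the adjusted exponent below becomes 5 - 1 = 4, inside the accepted
range [0, 7]. */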
16207 exponent = 5 - exponent;
16208
16209 return (exponent >= 0 && exponent <= 7);
16210 }
16211
16212 /* Return the asm string for an AdvSIMD MOVI, MVNI, ORR or BIC immediate
16213 move of CONST_VECTOR, whose elements fill a register of WIDTH bits. WHICH
16214 selects whether to output MOVI/MVNI, ORR or BIC immediate. */
16215 char*
16216 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16217 enum simd_immediate_check which)
16218 {
16219 bool is_valid;
16220 static char templ[40];
16221 const char *mnemonic;
16222 const char *shift_op;
16223 unsigned int lane_count = 0;
16224 char element_char;
16225
16226 struct simd_immediate_info info;
16227
16228 /* This will return true to show const_vector is legal for use as either
16229 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16230 It will also update INFO to show how the immediate should be generated.
16231 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16232 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16233 gcc_assert (is_valid);
16234
16235 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16236 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16237
16238 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16239 {
16240 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
16241 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16242 move immediate path. */
16243 if (aarch64_float_const_zero_rtx_p (info.value))
16244 info.value = GEN_INT (0);
16245 else
16246 {
16247 const unsigned int buf_size = 20;
16248 char float_buf[buf_size] = {'\0'};
16249 real_to_decimal_for_mode (float_buf,
16250 CONST_DOUBLE_REAL_VALUE (info.value),
16251 buf_size, buf_size, 1, info.elt_mode);
16252
16253 if (lane_count == 1)
16254 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16255 else
16256 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16257 lane_count, element_char, float_buf);
16258 return templ;
16259 }
16260 }
16261
16262 gcc_assert (CONST_INT_P (info.value));
16263
16264 if (which == AARCH64_CHECK_MOV)
16265 {
16266 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16267 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
16268 if (lane_count == 1)
16269 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16270 mnemonic, UINTVAL (info.value));
16271 else if (info.shift)
16272 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16273 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16274 element_char, UINTVAL (info.value), shift_op, info.shift);
16275 else
16276 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16277 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16278 element_char, UINTVAL (info.value));
16279 }
16280 else
16281 {
16282 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16283 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16284 if (info.shift)
16285 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16286 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16287 element_char, UINTVAL (info.value), "lsl", info.shift);
16288 else
16289 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16290 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16291 element_char, UINTVAL (info.value));
16292 }
16293 return templ;
16294 }
16295
16296 char*
16297 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16298 {
16299
16300 /* If a floating point number was passed and we desire to use it in an
16301 integer mode do the conversion to integer. */
16302 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16303 {
16304 unsigned HOST_WIDE_INT ival;
16305 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16306 gcc_unreachable ();
16307 immediate = gen_int_mode (ival, mode);
16308 }
16309
16310 machine_mode vmode;
16311 /* Use a 64-bit vector mode for everything except for DI/DF mode, where we
16312 use a 128-bit vector mode. */
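/* For example, an SImode immediate is duplicated into a V2SI vector
(64 bits), whereas a DImode immediate needs a V2DI vector (128 bits) so
that the container still holds more than one lane. */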
16313 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16314
16315 vmode = aarch64_simd_container_mode (mode, width);
16316 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16317 return aarch64_output_simd_mov_immediate (v_op, width);
16318 }
16319
16320 /* Return the output string to use for moving immediate CONST_VECTOR
16321 into an SVE register. */
16322
16323 char *
16324 aarch64_output_sve_mov_immediate (rtx const_vector)
16325 {
16326 static char templ[40];
16327 struct simd_immediate_info info;
16328 char element_char;
16329
16330 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16331 gcc_assert (is_valid);
16332
16333 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16334
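/* A constant series such as { 1, 3, 5, 7, ... } can be loaded with a single
SVE INDEX instruction, e.g. "index z0.s, #1, #2". */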
16335 if (info.step)
16336 {
16337 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16338 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16339 element_char, INTVAL (info.value), INTVAL (info.step));
16340 return templ;
16341 }
16342
16343 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16344 {
16345 if (aarch64_float_const_zero_rtx_p (info.value))
16346 info.value = GEN_INT (0);
16347 else
16348 {
16349 const int buf_size = 20;
16350 char float_buf[buf_size] = {};
16351 real_to_decimal_for_mode (float_buf,
16352 CONST_DOUBLE_REAL_VALUE (info.value),
16353 buf_size, buf_size, 1, info.elt_mode);
16354
16355 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16356 element_char, float_buf);
16357 return templ;
16358 }
16359 }
16360
16361 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16362 element_char, INTVAL (info.value));
16363 return templ;
16364 }
16365
16366 /* Return the asm format for a PTRUE instruction whose destination has
16367 mode MODE. SUFFIX is the element size suffix. */
16368
16369 char *
16370 aarch64_output_ptrue (machine_mode mode, char suffix)
16371 {
16372 unsigned int nunits;
16373 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16374 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16375 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16376 else
16377 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16378 return buf;
16379 }
16380
16381 /* Split operands into moves from op[1] + op[2] into op[0]. */
16382
16383 void
16384 aarch64_split_combinev16qi (rtx operands[3])
16385 {
16386 unsigned int dest = REGNO (operands[0]);
16387 unsigned int src1 = REGNO (operands[1]);
16388 unsigned int src2 = REGNO (operands[2]);
16389 machine_mode halfmode = GET_MODE (operands[1]);
16390 unsigned int halfregs = REG_NREGS (operands[1]);
16391 rtx destlo, desthi;
16392
16393 gcc_assert (halfmode == V16QImode);
16394
16395 if (src1 == dest && src2 == dest + halfregs)
16396 {
16397 /* No-op move. Can't split to nothing; emit something. */
16398 emit_note (NOTE_INSN_DELETED);
16399 return;
16400 }
16401
16402 /* Preserve register attributes for variable tracking. */
16403 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16404 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16405 GET_MODE_SIZE (halfmode));
16406
16407 /* Special case of reversed high/low parts. */
16408 if (reg_overlap_mentioned_p (operands[2], destlo)
16409 && reg_overlap_mentioned_p (operands[1], desthi))
16410 {
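/* Swap the two source registers using the three-XOR trick, which avoids
the need for a scratch register. */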
16411 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16412 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16413 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16414 }
16415 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16416 {
16417 /* Try to avoid unnecessary moves if part of the result
16418 is in the right place already. */
16419 if (src1 != dest)
16420 emit_move_insn (destlo, operands[1]);
16421 if (src2 != dest + halfregs)
16422 emit_move_insn (desthi, operands[2]);
16423 }
16424 else
16425 {
16426 if (src2 != dest + halfregs)
16427 emit_move_insn (desthi, operands[2]);
16428 if (src1 != dest)
16429 emit_move_insn (destlo, operands[1]);
16430 }
16431 }
16432
16433 /* vec_perm support. */
16434
16435 struct expand_vec_perm_d
16436 {
16437 rtx target, op0, op1;
16438 vec_perm_indices perm;
16439 machine_mode vmode;
16440 unsigned int vec_flags;
16441 bool one_vector_p;
16442 bool testing_p;
16443 };
16444
16445 /* Generate a variable permutation. */
16446
16447 static void
16448 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16449 {
16450 machine_mode vmode = GET_MODE (target);
16451 bool one_vector_p = rtx_equal_p (op0, op1);
16452
16453 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16454 gcc_checking_assert (GET_MODE (op0) == vmode);
16455 gcc_checking_assert (GET_MODE (op1) == vmode);
16456 gcc_checking_assert (GET_MODE (sel) == vmode);
16457 gcc_checking_assert (TARGET_SIMD);
16458
16459 if (one_vector_p)
16460 {
16461 if (vmode == V8QImode)
16462 {
16463 /* Expand the argument to a V16QI mode by duplicating it. */
16464 rtx pair = gen_reg_rtx (V16QImode);
16465 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16466 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16467 }
16468 else
16469 {
16470 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16471 }
16472 }
16473 else
16474 {
16475 rtx pair;
16476
16477 if (vmode == V8QImode)
16478 {
16479 pair = gen_reg_rtx (V16QImode);
16480 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16481 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16482 }
16483 else
16484 {
16485 pair = gen_reg_rtx (OImode);
16486 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16487 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16488 }
16489 }
16490 }
16491
16492 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16493 NELT is the number of elements in the vector. */
16494
16495 void
16496 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16497 unsigned int nelt)
16498 {
16499 machine_mode vmode = GET_MODE (target);
16500 bool one_vector_p = rtx_equal_p (op0, op1);
16501 rtx mask;
16502
16503 /* The TBL instruction does not use a modulo index, so we must take care
16504 of that ourselves. */
16505 mask = aarch64_simd_gen_const_vector_dup (vmode,
16506 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16507 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16508
16509 /* For big-endian, we also need to reverse the index within the vector
16510 (but not which vector). */
16511 if (BYTES_BIG_ENDIAN)
16512 {
16513 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16514 if (!one_vector_p)
16515 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16516 sel = expand_simple_binop (vmode, XOR, sel, mask,
16517 NULL, 0, OPTAB_LIB_WIDEN);
16518 }
16519 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16520 }
16521
16522 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
16523
16524 static void
16525 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16526 {
16527 emit_insn (gen_rtx_SET (target,
16528 gen_rtx_UNSPEC (GET_MODE (target),
16529 gen_rtvec (2, op0, op1), code)));
16530 }
16531
16532 /* Expand an SVE vec_perm with the given operands. */
16533
16534 void
16535 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16536 {
16537 machine_mode data_mode = GET_MODE (target);
16538 machine_mode sel_mode = GET_MODE (sel);
16539 /* Enforced by the pattern condition. */
16540 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16541
16542 /* Note: vec_perm indices are supposed to wrap when they go beyond the
16543 size of the two value vectors, i.e. the upper bits of the indices
16544 are effectively ignored. SVE TBL instead produces 0 for any
16545 out-of-range indices, so we need to modulo all the vec_perm indices
16546 to ensure they are all in range. */
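/* The general two-vector case below therefore becomes, in outline:
res0 = TBL (op0, sel & (2 * nunits - 1));
res1 = TBL (op1, sel - nunits);
target = res0 | res1;
where every out-of-range index selects zero, so the OR (or its
floating-point equivalent) merges the two partial results. */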
16547 rtx sel_reg = force_reg (sel_mode, sel);
16548
16549 /* Check if the sel only references the first values vector. */
16550 if (GET_CODE (sel) == CONST_VECTOR
16551 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16552 {
16553 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16554 return;
16555 }
16556
16557 /* Check if the two values vectors are the same. */
16558 if (rtx_equal_p (op0, op1))
16559 {
16560 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16561 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16562 NULL, 0, OPTAB_DIRECT);
16563 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16564 return;
16565 }
16566
16567 /* Run TBL on each value vector and combine the results. */
16568
16569 rtx res0 = gen_reg_rtx (data_mode);
16570 rtx res1 = gen_reg_rtx (data_mode);
16571 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16572 if (GET_CODE (sel) != CONST_VECTOR
16573 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16574 {
16575 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16576 2 * nunits - 1);
16577 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16578 NULL, 0, OPTAB_DIRECT);
16579 }
16580 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16581 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16582 NULL, 0, OPTAB_DIRECT);
16583 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16584 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16585 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16586 else
16587 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16588 }
16589
16590 /* Recognize patterns suitable for the TRN instructions. */
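/* For example, on V4SImode the permutation { 0, 4, 2, 6 } selects the
even lanes of both inputs and maps to TRN1, while { 1, 5, 3, 7 } maps
to TRN2. */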
16591 static bool
16592 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16593 {
16594 HOST_WIDE_INT odd;
16595 poly_uint64 nelt = d->perm.length ();
16596 rtx out, in0, in1, x;
16597 machine_mode vmode = d->vmode;
16598
16599 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16600 return false;
16601
16602 /* Note that these are little-endian tests.
16603 We correct for big-endian later. */
16604 if (!d->perm[0].is_constant (&odd)
16605 || (odd != 0 && odd != 1)
16606 || !d->perm.series_p (0, 2, odd, 2)
16607 || !d->perm.series_p (1, 2, nelt + odd, 2))
16608 return false;
16609
16610 /* Success! */
16611 if (d->testing_p)
16612 return true;
16613
16614 in0 = d->op0;
16615 in1 = d->op1;
16616 /* We don't need a big-endian lane correction for SVE; see the comment
16617 at the head of aarch64-sve.md for details. */
16618 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16619 {
16620 x = in0, in0 = in1, in1 = x;
16621 odd = !odd;
16622 }
16623 out = d->target;
16624
16625 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16626 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16627 return true;
16628 }
16629
16630 /* Recognize patterns suitable for the UZP instructions. */
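/* For example, on V4SImode { 0, 2, 4, 6 } maps to UZP1 and { 1, 3, 5, 7 }
maps to UZP2. */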
16631 static bool
16632 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16633 {
16634 HOST_WIDE_INT odd;
16635 rtx out, in0, in1, x;
16636 machine_mode vmode = d->vmode;
16637
16638 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16639 return false;
16640
16641 /* Note that these are little-endian tests.
16642 We correct for big-endian later. */
16643 if (!d->perm[0].is_constant (&odd)
16644 || (odd != 0 && odd != 1)
16645 || !d->perm.series_p (0, 1, odd, 2))
16646 return false;
16647
16648 /* Success! */
16649 if (d->testing_p)
16650 return true;
16651
16652 in0 = d->op0;
16653 in1 = d->op1;
16654 /* We don't need a big-endian lane correction for SVE; see the comment
16655 at the head of aarch64-sve.md for details. */
16656 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16657 {
16658 x = in0, in0 = in1, in1 = x;
16659 odd = !odd;
16660 }
16661 out = d->target;
16662
16663 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16664 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16665 return true;
16666 }
16667
16668 /* Recognize patterns suitable for the ZIP instructions. */
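/* For example, on V4SImode { 0, 4, 1, 5 } interleaves the low halves of the
two inputs and maps to ZIP1, while { 2, 6, 3, 7 } interleaves the high
halves and maps to ZIP2. */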
16669 static bool
16670 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16671 {
16672 unsigned int high;
16673 poly_uint64 nelt = d->perm.length ();
16674 rtx out, in0, in1, x;
16675 machine_mode vmode = d->vmode;
16676
16677 if (GET_MODE_UNIT_SIZE (vmode) > 8)
16678 return false;
16679
16680 /* Note that these are little-endian tests.
16681 We correct for big-endian later. */
16682 poly_uint64 first = d->perm[0];
16683 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16684 || !d->perm.series_p (0, 2, first, 1)
16685 || !d->perm.series_p (1, 2, first + nelt, 1))
16686 return false;
16687 high = maybe_ne (first, 0U);
16688
16689 /* Success! */
16690 if (d->testing_p)
16691 return true;
16692
16693 in0 = d->op0;
16694 in1 = d->op1;
16695 /* We don't need a big-endian lane correction for SVE; see the comment
16696 at the head of aarch64-sve.md for details. */
16697 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16698 {
16699 x = in0, in0 = in1, in1 = x;
16700 high = !high;
16701 }
16702 out = d->target;
16703
16704 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16705 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16706 return true;
16707 }
16708
16709 /* Recognize patterns for the EXT insn. */
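/* For example, on V4SImode { 1, 2, 3, 4 } takes the last three elements of
the first vector followed by the first element of the second, and maps to
an EXT with an element offset of 1. */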
16710
16711 static bool
16712 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16713 {
16714 HOST_WIDE_INT location;
16715 rtx offset;
16716
16717 /* The first element always refers to the first vector.
16718 Check if the extracted indices are increasing by one. */
16719 if (d->vec_flags == VEC_SVE_PRED
16720 || !d->perm[0].is_constant (&location)
16721 || !d->perm.series_p (0, 1, location, 1))
16722 return false;
16723
16724 /* Success! */
16725 if (d->testing_p)
16726 return true;
16727
16728 /* The case where (location == 0) is a no-op for both big- and little-endian,
16729 and is removed by the mid-end at optimization levels -O1 and higher.
16730
16731 We don't need a big-endian lane correction for SVE; see the comment
16732 at the head of aarch64-sve.md for details. */
16733 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16734 {
16735 /* After setup, we want the high elements of the first vector (stored
16736 at the LSB end of the register), and the low elements of the second
16737 vector (stored at the MSB end of the register). So swap. */
16738 std::swap (d->op0, d->op1);
16739 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16740 to_constant () is safe since this is restricted to Advanced SIMD
16741 vectors. */
16742 location = d->perm.length ().to_constant () - location;
16743 }
16744
16745 offset = GEN_INT (location);
16746 emit_set_insn (d->target,
16747 gen_rtx_UNSPEC (d->vmode,
16748 gen_rtvec (3, d->op0, d->op1, offset),
16749 UNSPEC_EXT));
16750 return true;
16751 }
16752
16753 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16754 within each 64-bit, 32-bit or 16-bit granule. */
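/* For example, on V8HImode { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the four
16-bit elements within each 64-bit granule and so maps to REV64. */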
16755
16756 static bool
16757 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16758 {
16759 HOST_WIDE_INT diff;
16760 unsigned int i, size, unspec;
16761 machine_mode pred_mode;
16762
16763 if (d->vec_flags == VEC_SVE_PRED
16764 || !d->one_vector_p
16765 || !d->perm[0].is_constant (&diff))
16766 return false;
16767
16768 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16769 if (size == 8)
16770 {
16771 unspec = UNSPEC_REV64;
16772 pred_mode = VNx2BImode;
16773 }
16774 else if (size == 4)
16775 {
16776 unspec = UNSPEC_REV32;
16777 pred_mode = VNx4BImode;
16778 }
16779 else if (size == 2)
16780 {
16781 unspec = UNSPEC_REV16;
16782 pred_mode = VNx8BImode;
16783 }
16784 else
16785 return false;
16786
16787 unsigned int step = diff + 1;
16788 for (i = 0; i < step; ++i)
16789 if (!d->perm.series_p (i, step, diff - i, step))
16790 return false;
16791
16792 /* Success! */
16793 if (d->testing_p)
16794 return true;
16795
16796 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16797 if (d->vec_flags == VEC_SVE_DATA)
16798 {
16799 rtx pred = aarch64_ptrue_reg (pred_mode);
16800 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16801 UNSPEC_MERGE_PTRUE);
16802 }
16803 emit_set_insn (d->target, src);
16804 return true;
16805 }
16806
16807 /* Recognize patterns for the REV insn, which reverses elements within
16808 a full vector. */
16809
16810 static bool
16811 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16812 {
16813 poly_uint64 nelt = d->perm.length ();
16814
16815 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16816 return false;
16817
16818 if (!d->perm.series_p (0, 1, nelt - 1, -1))
16819 return false;
16820
16821 /* Success! */
16822 if (d->testing_p)
16823 return true;
16824
16825 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16826 emit_set_insn (d->target, src);
16827 return true;
16828 }
16829
16830 static bool
16831 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16832 {
16833 rtx out = d->target;
16834 rtx in0;
16835 HOST_WIDE_INT elt;
16836 machine_mode vmode = d->vmode;
16837 rtx lane;
16838
16839 if (d->vec_flags == VEC_SVE_PRED
16840 || d->perm.encoding ().encoded_nelts () != 1
16841 || !d->perm[0].is_constant (&elt))
16842 return false;
16843
16844 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16845 return false;
16846
16847 /* Success! */
16848 if (d->testing_p)
16849 return true;
16850
16851 /* The generic preparation in aarch64_expand_vec_perm_const_1
16852 swaps the operand order and the permute indices if it finds
16853 d->perm[0] to be in the second operand. Thus, we can always
16854 use d->op0 and need not do any extra arithmetic to get the
16855 correct lane number. */
16856 in0 = d->op0;
16857 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
16858
16859 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16860 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16861 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16862 return true;
16863 }
16864
16865 static bool
16866 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16867 {
16868 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16869 machine_mode vmode = d->vmode;
16870
16871 /* Make sure that the indices are constant. */
16872 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16873 for (unsigned int i = 0; i < encoded_nelts; ++i)
16874 if (!d->perm[i].is_constant ())
16875 return false;
16876
16877 if (d->testing_p)
16878 return true;
16879
16880 /* Generic code will try constant permutation twice. Once with the
16881 original mode and again with the elements lowered to QImode.
16882 So wait and don't do the selector expansion ourselves. */
16883 if (vmode != V8QImode && vmode != V16QImode)
16884 return false;
16885
16886 /* to_constant is safe since this routine is specific to Advanced SIMD
16887 vectors. */
16888 unsigned int nelt = d->perm.length ().to_constant ();
16889 for (unsigned int i = 0; i < nelt; ++i)
16890 /* If big-endian and two vectors, we end up with a weird mixed-endian
16891 mode on NEON. Reverse the index within each word but not the word
16892 itself. to_constant is safe because we checked is_constant above. */
16893 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16894 ? d->perm[i].to_constant () ^ (nelt - 1)
16895 : d->perm[i].to_constant ());
16896
16897 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16898 sel = force_reg (vmode, sel);
16899
16900 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16901 return true;
16902 }
16903
16904 /* Try to implement D using an SVE TBL instruction. */
16905
16906 static bool
16907 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16908 {
16909 unsigned HOST_WIDE_INT nelt;
16910
16911 /* Permuting two variable-length vectors could overflow the
16912 index range. */
16913 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16914 return false;
16915
16916 if (d->testing_p)
16917 return true;
16918
16919 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16920 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16921 if (d->one_vector_p)
16922 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16923 else
16924 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16925 return true;
16926 }
16927
16928 static bool
16929 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16930 {
16931 /* The pattern matching functions above are written to look for a small
16932 number to begin the sequence (0, 1, N/2). If we begin with an index
16933 from the second operand, we can swap the operands. */
16934 poly_int64 nelt = d->perm.length ();
16935 if (known_ge (d->perm[0], nelt))
16936 {
16937 d->perm.rotate_inputs (1);
16938 std::swap (d->op0, d->op1);
16939 }
16940
16941 if ((d->vec_flags == VEC_ADVSIMD
16942 || d->vec_flags == VEC_SVE_DATA
16943 || d->vec_flags == VEC_SVE_PRED)
16944 && known_gt (nelt, 1))
16945 {
16946 if (aarch64_evpc_rev_local (d))
16947 return true;
16948 else if (aarch64_evpc_rev_global (d))
16949 return true;
16950 else if (aarch64_evpc_ext (d))
16951 return true;
16952 else if (aarch64_evpc_dup (d))
16953 return true;
16954 else if (aarch64_evpc_zip (d))
16955 return true;
16956 else if (aarch64_evpc_uzp (d))
16957 return true;
16958 else if (aarch64_evpc_trn (d))
16959 return true;
16960 if (d->vec_flags == VEC_SVE_DATA)
16961 return aarch64_evpc_sve_tbl (d);
16962 else if (d->vec_flags == VEC_ADVSIMD)
16963 return aarch64_evpc_tbl (d);
16964 }
16965 return false;
16966 }
16967
16968 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
16969
16970 static bool
16971 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16972 rtx op1, const vec_perm_indices &sel)
16973 {
16974 struct expand_vec_perm_d d;
16975
16976 /* Check whether the mask can be applied to a single vector. */
16977 if (sel.ninputs () == 1
16978 || (op0 && rtx_equal_p (op0, op1)))
16979 d.one_vector_p = true;
16980 else if (sel.all_from_input_p (0))
16981 {
16982 d.one_vector_p = true;
16983 op1 = op0;
16984 }
16985 else if (sel.all_from_input_p (1))
16986 {
16987 d.one_vector_p = true;
16988 op0 = op1;
16989 }
16990 else
16991 d.one_vector_p = false;
16992
16993 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16994 sel.nelts_per_input ());
16995 d.vmode = vmode;
16996 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16997 d.target = target;
16998 d.op0 = op0;
16999 d.op1 = op1;
17000 d.testing_p = !target;
17001
17002 if (!d.testing_p)
17003 return aarch64_expand_vec_perm_const_1 (&d);
17004
17005 rtx_insn *last = get_last_insn ();
17006 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17007 gcc_assert (last == get_last_insn ());
17008
17009 return ret;
17010 }
17011
17012 /* Generate a byte permute mask for a register of mode MODE,
17013 which has NUNITS units. */
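/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask bytes are
{ 1, 0, 3, 2, 5, 4, ... }, i.e. the two bytes of each element are swapped. */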
17014
17015 rtx
17016 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17017 {
17018 /* We have to reverse each vector because we don't have
17019 a permuted load that can reverse-load according to ABI rules. */
17020 rtx mask;
17021 rtvec v = rtvec_alloc (16);
17022 unsigned int i, j;
17023 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17024
17025 gcc_assert (BYTES_BIG_ENDIAN);
17026 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17027
17028 for (i = 0; i < nunits; i++)
17029 for (j = 0; j < usize; j++)
17030 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17031 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17032 return force_reg (V16QImode, mask);
17033 }
17034
17035 /* Return true if X is a valid second operand for the SVE instruction
17036 that implements integer comparison OP_CODE. */
17037
17038 static bool
17039 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17040 {
17041 if (register_operand (x, VOIDmode))
17042 return true;
17043
17044 switch (op_code)
17045 {
17046 case LTU:
17047 case LEU:
17048 case GEU:
17049 case GTU:
17050 return aarch64_sve_cmp_immediate_p (x, false);
17051 case LT:
17052 case LE:
17053 case GE:
17054 case GT:
17055 case NE:
17056 case EQ:
17057 return aarch64_sve_cmp_immediate_p (x, true);
17058 default:
17059 gcc_unreachable ();
17060 }
17061 }
17062
17063 /* Use predicated SVE instructions to implement the equivalent of:
17064
17065 (set TARGET OP)
17066
17067 given that PTRUE is an all-true predicate of the appropriate mode. */
17068
17069 static void
17070 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17071 {
17072 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17073 gen_rtvec (2, ptrue, op),
17074 UNSPEC_MERGE_PTRUE);
17075 rtx_insn *insn = emit_set_insn (target, unspec);
17076 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17077 }
17078
17079 /* Likewise, but also clobber the condition codes. */
17080
17081 static void
17082 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17083 {
17084 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17085 gen_rtvec (2, ptrue, op),
17086 UNSPEC_MERGE_PTRUE);
17087 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17088 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17089 }
17090
17091 /* Return the UNSPEC_COND_* code for comparison CODE. */
17092
17093 static unsigned int
17094 aarch64_unspec_cond_code (rtx_code code)
17095 {
17096 switch (code)
17097 {
17098 case NE:
17099 return UNSPEC_COND_NE;
17100 case EQ:
17101 return UNSPEC_COND_EQ;
17102 case LT:
17103 return UNSPEC_COND_LT;
17104 case GT:
17105 return UNSPEC_COND_GT;
17106 case LE:
17107 return UNSPEC_COND_LE;
17108 case GE:
17109 return UNSPEC_COND_GE;
17110 default:
17111 gcc_unreachable ();
17112 }
17113 }
17114
17115 /* Emit:
17116
17117 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17118
17119 where <X> is the operation associated with comparison CODE. This form
17120 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17121 semantics, such as when PRED might not be all-true and when comparing
17122 inactive lanes could have side effects. */
17123
17124 static void
17125 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17126 rtx pred, rtx op0, rtx op1)
17127 {
17128 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17129 gen_rtvec (3, pred, op0, op1),
17130 aarch64_unspec_cond_code (code));
17131 emit_set_insn (target, unspec);
17132 }
17133
17134 /* Expand an SVE integer comparison using the SVE equivalent of:
17135
17136 (set TARGET (CODE OP0 OP1)). */
17137
17138 void
17139 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17140 {
17141 machine_mode pred_mode = GET_MODE (target);
17142 machine_mode data_mode = GET_MODE (op0);
17143
17144 if (!aarch64_sve_cmp_operand_p (code, op1))
17145 op1 = force_reg (data_mode, op1);
17146
17147 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17148 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17149 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17150 }
17151
17152 /* Emit the SVE equivalent of:
17153
17154 (set TMP1 (CODE1 OP0 OP1))
17155 (set TMP2 (CODE2 OP0 OP1))
17156 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17157
17158 PTRUE is an all-true predicate with the same mode as TARGET. */
17159
17160 static void
17161 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17162 rtx ptrue, rtx op0, rtx op1)
17163 {
17164 machine_mode pred_mode = GET_MODE (ptrue);
17165 rtx tmp1 = gen_reg_rtx (pred_mode);
17166 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17167 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17168 rtx tmp2 = gen_reg_rtx (pred_mode);
17169 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17170 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17171 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17172 }
17173
17174 /* Emit the SVE equivalent of:
17175
17176 (set TMP (CODE OP0 OP1))
17177 (set TARGET (not TMP))
17178
17179 PTRUE is an all-true predicate with the same mode as TARGET. */
17180
17181 static void
17182 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17183 rtx op0, rtx op1)
17184 {
17185 machine_mode pred_mode = GET_MODE (ptrue);
17186 rtx tmp = gen_reg_rtx (pred_mode);
17187 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17188 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17189 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17190 }
17191
17192 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17193
17194 (set TARGET (CODE OP0 OP1))
17195
17196 If CAN_INVERT_P is true, the caller can also handle inverted results;
17197 return true if the result is in fact inverted. */
17198
17199 bool
17200 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17201 rtx op0, rtx op1, bool can_invert_p)
17202 {
17203 machine_mode pred_mode = GET_MODE (target);
17204 machine_mode data_mode = GET_MODE (op0);
17205
17206 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17207 switch (code)
17208 {
17209 case UNORDERED:
17210 /* UNORDERED has no immediate form. */
17211 op1 = force_reg (data_mode, op1);
17212 /* fall through */
17213 case LT:
17214 case LE:
17215 case GT:
17216 case GE:
17217 case EQ:
17218 case NE:
17219 {
17220 /* There is native support for the comparison. */
17221 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17222 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17223 return false;
17224 }
17225
17226 case LTGT:
17227 /* This is a trapping operation (LT or GT). */
17228 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17229 return false;
17230
17231 case UNEQ:
17232 if (!flag_trapping_math)
17233 {
17234 /* This would trap for signaling NaNs. */
17235 op1 = force_reg (data_mode, op1);
17236 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17237 return false;
17238 }
17239 /* fall through */
17240 case UNLT:
17241 case UNLE:
17242 case UNGT:
17243 case UNGE:
17244 if (flag_trapping_math)
17245 {
17246 /* Work out which elements are ordered. */
17247 rtx ordered = gen_reg_rtx (pred_mode);
17248 op1 = force_reg (data_mode, op1);
17249 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17250
17251 /* Test the opposite condition for the ordered elements,
17252 then invert the result. */
17253 if (code == UNEQ)
17254 code = NE;
17255 else
17256 code = reverse_condition_maybe_unordered (code);
17257 if (can_invert_p)
17258 {
17259 aarch64_emit_sve_predicated_cond (target, code,
17260 ordered, op0, op1);
17261 return true;
17262 }
17263 rtx tmp = gen_reg_rtx (pred_mode);
17264 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17265 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17266 return false;
17267 }
17268 break;
17269
17270 case ORDERED:
17271 /* ORDERED has no immediate form. */
17272 op1 = force_reg (data_mode, op1);
17273 break;
17274
17275 default:
17276 gcc_unreachable ();
17277 }
17278
17279 /* There is native support for the inverse comparison. */
17280 code = reverse_condition_maybe_unordered (code);
17281 if (can_invert_p)
17282 {
17283 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17284 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17285 return true;
17286 }
17287 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17288 return false;
17289 }
17290
17291 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17292 of the data being selected and CMP_MODE is the mode of the values being
17293 compared. */
17294
17295 void
17296 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17297 rtx *ops)
17298 {
17299 machine_mode pred_mode
17300 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17301 GET_MODE_SIZE (cmp_mode)).require ();
17302 rtx pred = gen_reg_rtx (pred_mode);
17303 if (FLOAT_MODE_P (cmp_mode))
17304 {
17305 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17306 ops[4], ops[5], true))
17307 std::swap (ops[1], ops[2]);
17308 }
17309 else
17310 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17311
17312 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17313 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17314 }
17315
17316 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17317 true. However, due to issues with register allocation it is preferable
17318 to avoid tying integer scalar and FP scalar modes. Executing integer
17319 operations in general registers is better than treating them as scalar
17320 vector operations. This reduces latency and avoids redundant int<->FP
17321 moves. So tie modes if they are either the same class, or vector modes
17322 with other vector modes, vector structs or any scalar mode. */
17323
17324 static bool
17325 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17326 {
17327 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17328 return true;
17329
17330 /* We specifically want to allow elements of "structure" modes to
17331 be tieable to the structure. This more general condition allows
17332 other rarer situations too. The reason we don't extend this to
17333 predicate modes is that there are no predicate structure modes
17334 nor any specific instructions for extracting part of a predicate
17335 register. */
17336 if (aarch64_vector_data_mode_p (mode1)
17337 && aarch64_vector_data_mode_p (mode2))
17338 return true;
17339
17340 /* Also allow any scalar modes with vectors. */
17341 if (aarch64_vector_mode_supported_p (mode1)
17342 || aarch64_vector_mode_supported_p (mode2))
17343 return true;
17344
17345 return false;
17346 }
17347
17348 /* Return a new RTX holding the result of moving POINTER forward by
17349 AMOUNT bytes. */
17350
17351 static rtx
17352 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17353 {
17354 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17355
17356 return adjust_automodify_address (pointer, GET_MODE (pointer),
17357 next, amount);
17358 }
17359
17360 /* Return a new RTX holding the result of moving POINTER forward by the
17361 size of the mode it points to. */
17362
17363 static rtx
17364 aarch64_progress_pointer (rtx pointer)
17365 {
17366 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17367 }
17368
17369 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17370 MODE bytes. */
17371
17372 static void
17373 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17374 machine_mode mode)
17375 {
17376 rtx reg = gen_reg_rtx (mode);
17377
17378 /* "Cast" the pointers to the correct mode. */
17379 *src = adjust_address (*src, mode, 0);
17380 *dst = adjust_address (*dst, mode, 0);
17381 /* Emit the memcpy. */
17382 emit_move_insn (reg, *src);
17383 emit_move_insn (*dst, reg);
17384 /* Move the pointers forward. */
17385 *src = aarch64_progress_pointer (*src);
17386 *dst = aarch64_progress_pointer (*dst);
17387 }
17388
17389 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17390 we succeed, otherwise return false. */
17391
17392 bool
17393 aarch64_expand_cpymem (rtx *operands)
17394 {
17395 int n, mode_bits;
17396 rtx dst = operands[0];
17397 rtx src = operands[1];
17398 rtx base;
17399 machine_mode cur_mode = BLKmode, next_mode;
17400 bool speed_p = !optimize_function_for_size_p (cfun);
17401
17402 /* When optimizing for size, give a better estimate of the length of a
17403 memcpy call, but use the default otherwise. Moves larger than 8 bytes
17404 will always require an even number of instructions, and each
17405 operation requires both a load and a store, so divide the max number by 2. */
17406 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17407
17408 /* We can't do anything smart if the amount to copy is not constant. */
17409 if (!CONST_INT_P (operands[2]))
17410 return false;
17411
17412 n = INTVAL (operands[2]);
17413
17414 /* Try to keep the number of instructions low. For all cases we will do at
17415 most two moves for the residual amount, since we'll always overlap the
17416 remainder. */
17417 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17418 return false;
17419
17420 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17421 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17422
17423 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17424 src = adjust_automodify_address (src, VOIDmode, base, 0);
17425
17426 /* Convert n to bits to make the rest of the code simpler. */
17427 n = n * BITS_PER_UNIT;
17428
17429 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17430 larger than TImode, but we should not use them for loads/stores here. */
17431 const int copy_limit = GET_MODE_BITSIZE (TImode);
17432
17433 while (n > 0)
17434 {
17435 /* Find the largest mode in which to do the copy without over-reading
17436 or over-writing. */
17437 opt_scalar_int_mode mode_iter;
17438 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17439 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17440 cur_mode = mode_iter.require ();
17441
17442 gcc_assert (cur_mode != BLKmode);
17443
17444 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17445 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17446
17447 n -= mode_bits;
17448
17449 /* Do certain trailing copies as overlapping if it's going to be
17450 cheaper, i.e. fewer instructions. For instance, for a 15-byte
17451 copy it's more efficient to do two overlapping 8-byte copies than
17452 8 + 6 + 1. */
17453 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17454 {
17455 next_mode = smallest_mode_for_size (n, MODE_INT);
17456 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17457 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17458 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17459 n = n_bits;
17460 }
17461 }
17462
17463 return true;
17464 }
17465
17466 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17467 SImode stores. Handle the case when the constant has identical
17468 bottom and top halves. This is beneficial when the two stores can be
17469 merged into an STP and we avoid synthesising potentially expensive
17470 immediates twice. Return true if such a split is possible. */
17471
17472 bool
17473 aarch64_split_dimode_const_store (rtx dst, rtx src)
17474 {
17475 rtx lo = gen_lowpart (SImode, src);
17476 rtx hi = gen_highpart_mode (SImode, DImode, src);
17477
17478 bool size_p = optimize_function_for_size_p (cfun);
17479
17480 if (!rtx_equal_p (lo, hi))
17481 return false;
17482
17483 unsigned int orig_cost
17484 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17485 unsigned int lo_cost
17486 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17487
17488 /* We want to transform:
17489 MOV x1, 49370
17490 MOVK x1, 0x140, lsl 16
17491 MOVK x1, 0xc0da, lsl 32
17492 MOVK x1, 0x140, lsl 48
17493 STR x1, [x0]
17494 into:
17495 MOV w1, 49370
17496 MOVK w1, 0x140, lsl 16
17497 STP w1, w1, [x0]
17498 So we want to perform this only when we save two instructions
17499 or more. When optimizing for size, however, accept any code size
17500 savings we can. */
17501 if (size_p && orig_cost <= lo_cost)
17502 return false;
17503
17504 if (!size_p
17505 && (orig_cost <= lo_cost + 1))
17506 return false;
17507
17508 rtx mem_lo = adjust_address (dst, SImode, 0);
17509 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17510 return false;
17511
17512 rtx tmp_reg = gen_reg_rtx (SImode);
17513 aarch64_expand_mov_immediate (tmp_reg, lo);
17514 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17515 /* Don't emit an explicit store pair as this may not be always profitable.
17516 Let the sched-fusion logic decide whether to merge them. */
17517 emit_move_insn (mem_lo, tmp_reg);
17518 emit_move_insn (mem_hi, tmp_reg);
17519
17520 return true;
17521 }
17522
17523 /* Generate RTL for a conditional branch with rtx comparison CODE in
17524 mode CC_MODE. The destination of the unlikely conditional branch
17525 is LABEL_REF. */
17526
17527 void
17528 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17529 rtx label_ref)
17530 {
17531 rtx x;
17532 x = gen_rtx_fmt_ee (code, VOIDmode,
17533 gen_rtx_REG (cc_mode, CC_REGNUM),
17534 const0_rtx);
17535
17536 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17537 gen_rtx_LABEL_REF (VOIDmode, label_ref),
17538 pc_rtx);
17539 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17540 }
17541
17542 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17543
17544 OP1 represents the TImode source operand 1
17545 OP2 represents the TImode source operand 2
17546 LOW_DEST represents the low half (DImode) of TImode operand 0
17547 LOW_IN1 represents the low half (DImode) of TImode operand 1
17548 LOW_IN2 represents the low half (DImode) of TImode operand 2
17549 HIGH_DEST represents the high half (DImode) of TImode operand 0
17550 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17551 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17552
17553 void
17554 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17555 rtx *low_in1, rtx *low_in2,
17556 rtx *high_dest, rtx *high_in1,
17557 rtx *high_in2)
17558 {
17559 *low_dest = gen_reg_rtx (DImode);
17560 *low_in1 = gen_lowpart (DImode, op1);
17561 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17562 subreg_lowpart_offset (DImode, TImode));
17563 *high_dest = gen_reg_rtx (DImode);
17564 *high_in1 = gen_highpart (DImode, op1);
17565 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17566 subreg_highpart_offset (DImode, TImode));
17567 }
17568
17569 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17570
17571 This function differs from 'aarch64_addti_scratch_regs' in that
17572 OP1 can be an immediate constant (zero). We must call
17573 subreg_highpart_offset with DImode and TImode arguments, otherwise
17574 VOIDmode will be used for the const_int, which generates an internal
17575 error from subreg_size_highpart_offset, which does not expect a size of zero.
17576
17577 OP1 represents the TImode source operand 1
17578 OP2 represents the TImode source operand 2
17579 LOW_DEST represents the low half (DImode) of TImode operand 0
17580 LOW_IN1 represents the low half (DImode) of TImode operand 1
17581 LOW_IN2 represents the low half (DImode) of TImode operand 2
17582 HIGH_DEST represents the high half (DImode) of TImode operand 0
17583 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17584 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
17585
17586
17587 void
17588 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17589 rtx *low_in1, rtx *low_in2,
17590 rtx *high_dest, rtx *high_in1,
17591 rtx *high_in2)
17592 {
17593 *low_dest = gen_reg_rtx (DImode);
17594 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17595 subreg_lowpart_offset (DImode, TImode));
17596
17597 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17598 subreg_lowpart_offset (DImode, TImode));
17599 *high_dest = gen_reg_rtx (DImode);
17600
17601 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17602 subreg_highpart_offset (DImode, TImode));
17603 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17604 subreg_highpart_offset (DImode, TImode));
17605 }
17606
17607 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17608
17609 OP0 represents the TImode destination operand 0
17610 LOW_DEST represents the low half (DImode) of TImode operand 0
17611 LOW_IN1 represents the low half (DImode) of TImode operand 1
17612 LOW_IN2 represents the low half (DImode) of TImode operand 2
17613 HIGH_DEST represents the high half (DImode) of TImode operand 0
17614 HIGH_IN1 represents the high half (DImode) of TImode operand 1
17615 HIGH_IN2 represents the high half (DImode) of TImode operand 2
17616 UNSIGNED_P is true if the operation is being performed on unsigned
17617 values. */
17618 void
17619 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17620 rtx low_in2, rtx high_dest, rtx high_in1,
17621 rtx high_in2, bool unsigned_p)
17622 {
17623 if (low_in2 == const0_rtx)
17624 {
17625 low_dest = low_in1;
17626 high_in2 = force_reg (DImode, high_in2);
17627 if (unsigned_p)
17628 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17629 else
17630 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17631 }
17632 else
17633 {
17634 if (CONST_INT_P (low_in2))
17635 {
17636 high_in2 = force_reg (DImode, high_in2);
17637 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17638 GEN_INT (-INTVAL (low_in2))));
17639 }
17640 else
17641 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17642
17643 if (unsigned_p)
17644 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17645 else
17646 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17647 }
17648
17649 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17650 emit_move_insn (gen_highpart (DImode, op0), high_dest);
17651
17652 }
17653
17654 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
17655
17656 static unsigned HOST_WIDE_INT
17657 aarch64_asan_shadow_offset (void)
17658 {
17659 if (TARGET_ILP32)
17660 return (HOST_WIDE_INT_1 << 29);
17661 else
17662 return (HOST_WIDE_INT_1 << 36);
17663 }
17664
17665 static rtx
17666 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17667 int code, tree treeop0, tree treeop1)
17668 {
17669 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17670 rtx op0, op1;
17671 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17672 insn_code icode;
17673 struct expand_operand ops[4];
17674
17675 start_sequence ();
17676 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17677
17678 op_mode = GET_MODE (op0);
17679 if (op_mode == VOIDmode)
17680 op_mode = GET_MODE (op1);
17681
17682 switch (op_mode)
17683 {
17684 case E_QImode:
17685 case E_HImode:
17686 case E_SImode:
17687 cmp_mode = SImode;
17688 icode = CODE_FOR_cmpsi;
17689 break;
17690
17691 case E_DImode:
17692 cmp_mode = DImode;
17693 icode = CODE_FOR_cmpdi;
17694 break;
17695
17696 case E_SFmode:
17697 cmp_mode = SFmode;
17698 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17699 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17700 break;
17701
17702 case E_DFmode:
17703 cmp_mode = DFmode;
17704 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17705 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17706 break;
17707
17708 default:
17709 end_sequence ();
17710 return NULL_RTX;
17711 }
17712
17713 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17714 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17715 if (!op0 || !op1)
17716 {
17717 end_sequence ();
17718 return NULL_RTX;
17719 }
17720 *prep_seq = get_insns ();
17721 end_sequence ();
17722
17723 create_fixed_operand (&ops[0], op0);
17724 create_fixed_operand (&ops[1], op1);
17725
17726 start_sequence ();
17727 if (!maybe_expand_insn (icode, 2, ops))
17728 {
17729 end_sequence ();
17730 return NULL_RTX;
17731 }
17732 *gen_seq = get_insns ();
17733 end_sequence ();
17734
17735 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17736 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17737 }
17738
17739 static rtx
17740 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17741 int cmp_code, tree treeop0, tree treeop1, int bit_code)
17742 {
17743 rtx op0, op1, target;
17744 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17745 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17746 insn_code icode;
17747 struct expand_operand ops[6];
17748 int aarch64_cond;
17749
17750 push_to_sequence (*prep_seq);
17751 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17752
17753 op_mode = GET_MODE (op0);
17754 if (op_mode == VOIDmode)
17755 op_mode = GET_MODE (op1);
17756
17757 switch (op_mode)
17758 {
17759 case E_QImode:
17760 case E_HImode:
17761 case E_SImode:
17762 cmp_mode = SImode;
17763 icode = CODE_FOR_ccmpsi;
17764 break;
17765
17766 case E_DImode:
17767 cmp_mode = DImode;
17768 icode = CODE_FOR_ccmpdi;
17769 break;
17770
17771 case E_SFmode:
17772 cmp_mode = SFmode;
17773 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17774 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17775 break;
17776
17777 case E_DFmode:
17778 cmp_mode = DFmode;
17779 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17780 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17781 break;
17782
17783 default:
17784 end_sequence ();
17785 return NULL_RTX;
17786 }
17787
17788 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17789 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17790 if (!op0 || !op1)
17791 {
17792 end_sequence ();
17793 return NULL_RTX;
17794 }
17795 *prep_seq = get_insns ();
17796 end_sequence ();
17797
17798 target = gen_rtx_REG (cc_mode, CC_REGNUM);
17799 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17800
17801 if (bit_code != AND)
17802 {
17803 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17804 GET_MODE (XEXP (prev, 0))),
17805 VOIDmode, XEXP (prev, 0), const0_rtx);
17806 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17807 }
17808
17809 create_fixed_operand (&ops[0], XEXP (prev, 0));
17810 create_fixed_operand (&ops[1], target);
17811 create_fixed_operand (&ops[2], op0);
17812 create_fixed_operand (&ops[3], op1);
17813 create_fixed_operand (&ops[4], prev);
17814 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17815
17816 push_to_sequence (*gen_seq);
17817 if (!maybe_expand_insn (icode, 6, ops))
17818 {
17819 end_sequence ();
17820 return NULL_RTX;
17821 }
17822
17823 *gen_seq = get_insns ();
17824 end_sequence ();
17825
17826 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17827 }
17828
17829 #undef TARGET_GEN_CCMP_FIRST
17830 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17831
17832 #undef TARGET_GEN_CCMP_NEXT
17833 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
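/* Illustrative sketch: these two hooks let the middle end lower a chained
   condition into CMP followed by CCMP rather than two branches.  For
   example, a function such as

     int both (int a, int b) { return a == 0 && b > 42; }

   can be compiled to something along the lines of (schematic output):

     cmp   w0, #0
     ccmp  w1, #42, #4, eq    // only compares if the first test was EQ
     cset  w0, gt

   where the immediate 4 is the NZCV value (Z set) substituted when the EQ
   predicate on the previous comparison fails, making the final GT false.  */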
17834
17835 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
17836 instruction fusion of some sort. */
17837
17838 static bool
17839 aarch64_macro_fusion_p (void)
17840 {
17841 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17842 }
17843
17844
17845 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
17846 should be kept together during scheduling. */
17847
17848 static bool
17849 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17850 {
17851 rtx set_dest;
17852 rtx prev_set = single_set (prev);
17853 rtx curr_set = single_set (curr);
17854 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
17855 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17856
17857 if (!aarch64_macro_fusion_p ())
17858 return false;
17859
17860 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17861 {
17862 /* We are trying to match:
17863 prev (mov) == (set (reg r0) (const_int imm16))
17864 curr (movk) == (set (zero_extract (reg r0)
17865 (const_int 16)
17866 (const_int 16))
17867 (const_int imm16_1)) */
17868
17869 set_dest = SET_DEST (curr_set);
17870
17871 if (GET_CODE (set_dest) == ZERO_EXTRACT
17872 && CONST_INT_P (SET_SRC (curr_set))
17873 && CONST_INT_P (SET_SRC (prev_set))
17874 && CONST_INT_P (XEXP (set_dest, 2))
17875 && INTVAL (XEXP (set_dest, 2)) == 16
17876 && REG_P (XEXP (set_dest, 0))
17877 && REG_P (SET_DEST (prev_set))
17878 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17879 {
17880 return true;
17881 }
17882 }
17883
17884 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17885 {
17886
17887 /* We're trying to match:
17888 prev (adrp) == (set (reg r1)
17889 (high (symbol_ref ("SYM"))))
17890 curr (add) == (set (reg r0)
17891 (lo_sum (reg r1)
17892 (symbol_ref ("SYM"))))
17893 Note that r0 need not necessarily be the same as r1, especially
17894 during pre-regalloc scheduling. */
17895
17896 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17897 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17898 {
17899 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17900 && REG_P (XEXP (SET_SRC (curr_set), 0))
17901 && REGNO (XEXP (SET_SRC (curr_set), 0))
17902 == REGNO (SET_DEST (prev_set))
17903 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17904 XEXP (SET_SRC (curr_set), 1)))
17905 return true;
17906 }
17907 }
17908
17909 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17910 {
17911
17912 /* We're trying to match:
17913 prev (movk) == (set (zero_extract (reg r0)
17914 (const_int 16)
17915 (const_int 32))
17916 (const_int imm16_1))
17917 curr (movk) == (set (zero_extract (reg r0)
17918 (const_int 16)
17919 (const_int 48))
17920 (const_int imm16_2)) */
17921
17922 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17923 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17924 && REG_P (XEXP (SET_DEST (prev_set), 0))
17925 && REG_P (XEXP (SET_DEST (curr_set), 0))
17926 && REGNO (XEXP (SET_DEST (prev_set), 0))
17927 == REGNO (XEXP (SET_DEST (curr_set), 0))
17928 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17929 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17930 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17931 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17932 && CONST_INT_P (SET_SRC (prev_set))
17933 && CONST_INT_P (SET_SRC (curr_set)))
17934 return true;
17935
17936 }
17937 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17938 {
17939 /* We're trying to match:
17940 prev (adrp) == (set (reg r0)
17941 (high (symbol_ref ("SYM"))))
17942 curr (ldr) == (set (reg r1)
17943 (mem (lo_sum (reg r0)
17944 (symbol_ref ("SYM")))))
17945 or
17946 curr (ldr) == (set (reg r1)
17947 (zero_extend (mem
17948 (lo_sum (reg r0)
17949 (symbol_ref ("SYM")))))) */
17950 if (satisfies_constraint_Ush (SET_SRC (prev_set))
17951 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17952 {
17953 rtx curr_src = SET_SRC (curr_set);
17954
17955 if (GET_CODE (curr_src) == ZERO_EXTEND)
17956 curr_src = XEXP (curr_src, 0);
17957
17958 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17959 && REG_P (XEXP (XEXP (curr_src, 0), 0))
17960 && REGNO (XEXP (XEXP (curr_src, 0), 0))
17961 == REGNO (SET_DEST (prev_set))
17962 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17963 XEXP (SET_SRC (prev_set), 0)))
17964 return true;
17965 }
17966 }
17967
17968 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17969 && any_condjump_p (curr))
17970 {
17971 unsigned int condreg1, condreg2;
17972 rtx cc_reg_1;
17973 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17974 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17975
17976 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17977 && prev
17978 && modified_in_p (cc_reg_1, prev))
17979 {
17980 enum attr_type prev_type = get_attr_type (prev);
17981
17982 /* FIXME: this misses some instructions which are considered simple
17983 arithmetic for ThunderX. Simple shifts are missed here. */
17984 if (prev_type == TYPE_ALUS_SREG
17985 || prev_type == TYPE_ALUS_IMM
17986 || prev_type == TYPE_LOGICS_REG
17987 || prev_type == TYPE_LOGICS_IMM)
17988 return true;
17989 }
17990 }
17991
17992 if (prev_set
17993 && curr_set
17994 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17995 && any_condjump_p (curr))
17996 {
17997 /* We're trying to match:
17998 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17999 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18000 (const_int 0))
18001 (label_ref ("SYM"))
18002 (pc)) */
18003 if (SET_DEST (curr_set) == (pc_rtx)
18004 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18005 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18006 && REG_P (SET_DEST (prev_set))
18007 && REGNO (SET_DEST (prev_set))
18008 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18009 {
18010 /* Fuse ALU operations followed by conditional branch instruction. */
18011 switch (get_attr_type (prev))
18012 {
18013 case TYPE_ALU_IMM:
18014 case TYPE_ALU_SREG:
18015 case TYPE_ADC_REG:
18016 case TYPE_ADC_IMM:
18017 case TYPE_ADCS_REG:
18018 case TYPE_ADCS_IMM:
18019 case TYPE_LOGIC_REG:
18020 case TYPE_LOGIC_IMM:
18021 case TYPE_CSEL:
18022 case TYPE_ADR:
18023 case TYPE_MOV_IMM:
18024 case TYPE_SHIFT_REG:
18025 case TYPE_SHIFT_IMM:
18026 case TYPE_BFM:
18027 case TYPE_RBIT:
18028 case TYPE_REV:
18029 case TYPE_EXTEND:
18030 return true;
18031
18032 default:;
18033 }
18034 }
18035 }
18036
18037 return false;
18038 }
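/* Illustrative sketch: typical instruction pairs kept adjacent by the hook
   above, shown as schematic assembly rather than RTL:

     mov   x0, #0x1234               adrp  x1, sym
     movk  x0, #0x5678, lsl #16      add   x0, x1, :lo12:sym

     adrp  x0, sym                   cmp   w0, #0
     ldr   x1, [x0, :lo12:sym]       b.eq  .Llabel

   Whether a particular pair is actually fused depends on the
   AARCH64_FUSE_* bits set in the selected tuning structure.  */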
18039
18040 /* Return true iff the instruction fusion described by OP is enabled. */
18041
18042 bool
18043 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18044 {
18045 return (aarch64_tune_params.fusible_ops & op) != 0;
18046 }
18047
18048 /* If MEM is in the form of [base+offset], extract the two parts
18049 of the address into BASE and OFFSET; otherwise return false
18050 after clearing BASE and OFFSET. */
18051
18052 bool
18053 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18054 {
18055 rtx addr;
18056
18057 gcc_assert (MEM_P (mem));
18058
18059 addr = XEXP (mem, 0);
18060
18061 if (REG_P (addr))
18062 {
18063 *base = addr;
18064 *offset = const0_rtx;
18065 return true;
18066 }
18067
18068 if (GET_CODE (addr) == PLUS
18069 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18070 {
18071 *base = XEXP (addr, 0);
18072 *offset = XEXP (addr, 1);
18073 return true;
18074 }
18075
18076 *base = NULL_RTX;
18077 *offset = NULL_RTX;
18078
18079 return false;
18080 }
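/* Illustrative sketch of a typical (hypothetical) caller of the helper above:

     rtx base, offset;
     if (extract_base_offset_in_addr (mem, &base, &offset))
       {
         // BASE is the base register and INTVAL (offset) the byte offset;
         // a plain register address comes back with offset == const0_rtx.
       }

   The ldp/stp checks below rely on exactly this contract.  */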
18081
18082 /* Types for scheduling fusion. */
18083 enum sched_fusion_type
18084 {
18085 SCHED_FUSION_NONE = 0,
18086 SCHED_FUSION_LD_SIGN_EXTEND,
18087 SCHED_FUSION_LD_ZERO_EXTEND,
18088 SCHED_FUSION_LD,
18089 SCHED_FUSION_ST,
18090 SCHED_FUSION_NUM
18091 };
18092
18093 /* If INSN is a load or store whose address is in the form of [base+offset],
18094 extract the two parts into BASE and OFFSET. Return the scheduling
18095 fusion type of INSN. */
18096
18097 static enum sched_fusion_type
18098 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18099 {
18100 rtx x, dest, src;
18101 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18102
18103 gcc_assert (INSN_P (insn));
18104 x = PATTERN (insn);
18105 if (GET_CODE (x) != SET)
18106 return SCHED_FUSION_NONE;
18107
18108 src = SET_SRC (x);
18109 dest = SET_DEST (x);
18110
18111 machine_mode dest_mode = GET_MODE (dest);
18112
18113 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18114 return SCHED_FUSION_NONE;
18115
18116 if (GET_CODE (src) == SIGN_EXTEND)
18117 {
18118 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18119 src = XEXP (src, 0);
18120 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18121 return SCHED_FUSION_NONE;
18122 }
18123 else if (GET_CODE (src) == ZERO_EXTEND)
18124 {
18125 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18126 src = XEXP (src, 0);
18127 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18128 return SCHED_FUSION_NONE;
18129 }
18130
18131 if (GET_CODE (src) == MEM && REG_P (dest))
18132 extract_base_offset_in_addr (src, base, offset);
18133 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18134 {
18135 fusion = SCHED_FUSION_ST;
18136 extract_base_offset_in_addr (dest, base, offset);
18137 }
18138 else
18139 return SCHED_FUSION_NONE;
18140
18141 if (*base == NULL_RTX || *offset == NULL_RTX)
18142 fusion = SCHED_FUSION_NONE;
18143
18144 return fusion;
18145 }
18146
18147 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18148
18149 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18150 and PRI are only calculated for these instructions. For other instructions,
18151 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18152 types of instruction fusion can be added by returning different priorities.
18153
18154 It's important that irrelevant instructions get the largest FUSION_PRI. */
18155
18156 static void
18157 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18158 int *fusion_pri, int *pri)
18159 {
18160 int tmp, off_val;
18161 rtx base, offset;
18162 enum sched_fusion_type fusion;
18163
18164 gcc_assert (INSN_P (insn));
18165
18166 tmp = max_pri - 1;
18167 fusion = fusion_load_store (insn, &base, &offset);
18168 if (fusion == SCHED_FUSION_NONE)
18169 {
18170 *pri = tmp;
18171 *fusion_pri = tmp;
18172 return;
18173 }
18174
18175 /* Set FUSION_PRI according to fusion type and base register. */
18176 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18177
18178 /* Calculate PRI. */
18179 tmp /= 2;
18180
18181 /* INSN with smaller offset goes first. */
18182 off_val = (int)(INTVAL (offset));
18183 if (off_val >= 0)
18184 tmp -= (off_val & 0xfffff);
18185 else
18186 tmp += ((- off_val) & 0xfffff);
18187
18188 *pri = tmp;
18189 return;
18190 }
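/* Illustrative worked example (hypothetical numbers): with max_pri == 0x1000,
   two SCHED_FUSION_LD loads from [x1, 8] and [x1, 16] both get
   FUSION_PRI = 0xfff - 3 * FIRST_PSEUDO_REGISTER - REGNO (x1), so the
   scheduler treats them as one fusion group, while their PRI values are
   0x7ff - 8 and 0x7ff - 16 respectively, so the access at the smaller
   offset is preferred first.  */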
18191
18192 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18193 Adjust priority of sha1h instructions so they are scheduled before
18194 other SHA1 instructions. */
18195
18196 static int
18197 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18198 {
18199 rtx x = PATTERN (insn);
18200
18201 if (GET_CODE (x) == SET)
18202 {
18203 x = SET_SRC (x);
18204
18205 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18206 return priority + 10;
18207 }
18208
18209 return priority;
18210 }
18211
18212 /* Given OPERANDS of consecutive load/store, check if we can merge
18213 them into ldp/stp. LOAD is true if they are load instructions.
18214 MODE is the mode of memory operands. */
18215
18216 bool
18217 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18218 machine_mode mode)
18219 {
18220 HOST_WIDE_INT offval_1, offval_2, msize;
18221 enum reg_class rclass_1, rclass_2;
18222 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18223
18224 if (load)
18225 {
18226 mem_1 = operands[1];
18227 mem_2 = operands[3];
18228 reg_1 = operands[0];
18229 reg_2 = operands[2];
18230 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18231 if (REGNO (reg_1) == REGNO (reg_2))
18232 return false;
18233 }
18234 else
18235 {
18236 mem_1 = operands[0];
18237 mem_2 = operands[2];
18238 reg_1 = operands[1];
18239 reg_2 = operands[3];
18240 }
18241
18242 /* The mems cannot be volatile. */
18243 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18244 return false;
18245
18246 /* If we have SImode and slow unaligned ldp,
18247 check that the alignment is at least 8 bytes. */
18248 if (mode == SImode
18249 && (aarch64_tune_params.extra_tuning_flags
18250 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18251 && !optimize_size
18252 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18253 return false;
18254
18255 /* Check if the addresses are in the form of [base+offset]. */
18256 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18257 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18258 return false;
18259 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18260 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18261 return false;
18262
18263 /* Check if the bases are the same. */
18264 if (!rtx_equal_p (base_1, base_2))
18265 return false;
18266
18267 /* The operands must be of the same size. */
18268 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18269 GET_MODE_SIZE (GET_MODE (mem_2))));
18270
18271 offval_1 = INTVAL (offset_1);
18272 offval_2 = INTVAL (offset_2);
18273 /* We should only be trying this for fixed-sized modes. There is no
18274 SVE LDP/STP instruction. */
18275 msize = GET_MODE_SIZE (mode).to_constant ();
18276 /* Check if the offsets are consecutive. */
18277 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18278 return false;
18279
18280 /* Check if the addresses are clobbered by load. */
18281 if (load)
18282 {
18283 if (reg_mentioned_p (reg_1, mem_1))
18284 return false;
18285
18286 /* In increasing order, the last load can clobber the address. */
18287 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18288 return false;
18289 }
18290
18291 /* One of the memory accesses must be a mempair operand.
18292 If it is not the first one, they need to be swapped by the
18293 peephole. */
18294 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18295 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18296 return false;
18297
18298 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18299 rclass_1 = FP_REGS;
18300 else
18301 rclass_1 = GENERAL_REGS;
18302
18303 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18304 rclass_2 = FP_REGS;
18305 else
18306 rclass_2 = GENERAL_REGS;
18307
18308 /* Check if the registers are of the same class. */
18309 if (rclass_1 != rclass_2)
18310 return false;
18311
18312 return true;
18313 }
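/* Illustrative sketch: the ldp/stp peepholes built on the predicate above
   turn sequences such as

     ldr  w0, [x2]           str  w0, [x2, #8]
     ldr  w1, [x2, #4]       str  w1, [x2, #12]

   into

     ldp  w0, w1, [x2]       stp  w0, w1, [x2, #8]

   provided the checks above (same base, consecutive offsets, matching
   register classes, no address clobber) all pass.  */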
18314
18315 /* Given OPERANDS of consecutive load/store that can be merged,
18316 swap them if they are not in ascending order. */
18317 void
18318 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18319 {
18320 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18321 HOST_WIDE_INT offval_1, offval_2;
18322
18323 if (load)
18324 {
18325 mem_1 = operands[1];
18326 mem_2 = operands[3];
18327 }
18328 else
18329 {
18330 mem_1 = operands[0];
18331 mem_2 = operands[2];
18332 }
18333
18334 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18335 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18336
18337 offval_1 = INTVAL (offset_1);
18338 offval_2 = INTVAL (offset_2);
18339
18340 if (offval_1 > offval_2)
18341 {
18342 /* Irrespective of whether this is a load or a store,
18343 we do the same swap. */
18344 std::swap (operands[0], operands[2]);
18345 std::swap (operands[1], operands[3]);
18346 }
18347 }
18348
18349 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18350 comparison between the two. */
18351 int
18352 aarch64_host_wide_int_compare (const void *x, const void *y)
18353 {
18354 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18355 * ((const HOST_WIDE_INT *) y));
18356 }
18357
18358 /* Taking X and Y to be pairs of RTX, each pair containing a MEM rtx
18359 and a REG rtx, compare the [base+offset] addresses of the two MEMs
18360 by their offsets.
18361
18362 Return:
18363
18364 1 iff offset (X) > offset (Y)
18365 0 iff offset (X) == offset (Y)
18366 -1 iff offset (X) < offset (Y) */
18367 int
18368 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18369 {
18370 const rtx * operands_1 = (const rtx *) x;
18371 const rtx * operands_2 = (const rtx *) y;
18372 rtx mem_1, mem_2, base, offset_1, offset_2;
18373
18374 if (MEM_P (operands_1[0]))
18375 mem_1 = operands_1[0];
18376 else
18377 mem_1 = operands_1[1];
18378
18379 if (MEM_P (operands_2[0]))
18380 mem_2 = operands_2[0];
18381 else
18382 mem_2 = operands_2[1];
18383
18384 /* Extract the offsets. */
18385 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18386 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18387
18388 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18389
18390 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18391 }
18392
18393 /* Given OPERANDS of consecutive load/store, check if we can merge
18394 them into ldp/stp by adjusting the offset. LOAD is true if they
18395 are load instructions. MODE is the mode of memory operands.
18396
18397 Given the consecutive stores below:
18398
18399 str w1, [xb, 0x100]
18400 str w1, [xb, 0x104]
18401 str w1, [xb, 0x108]
18402 str w1, [xb, 0x10c]
18403
18404 Though the offsets are out of the range supported by stp, we can
18405 still pair them after adjusting the offset, like:
18406
18407 add scratch, xb, 0x100
18408 stp w1, w1, [scratch]
18409 stp w1, w1, [scratch, 0x8]
18410
18411 The peephole patterns detecting this opportunity should guarantee
18412 the scratch register is available. */
18413
18414 bool
18415 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18416 scalar_mode mode)
18417 {
18418 const int num_insns = 4;
18419 enum reg_class rclass;
18420 HOST_WIDE_INT offvals[num_insns], msize;
18421 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18422
18423 if (load)
18424 {
18425 for (int i = 0; i < num_insns; i++)
18426 {
18427 reg[i] = operands[2 * i];
18428 mem[i] = operands[2 * i + 1];
18429
18430 gcc_assert (REG_P (reg[i]));
18431 }
18432
18433 /* Do not attempt to merge the loads if the loads clobber each other. */
18434 for (int i = 0; i < 8; i += 2)
18435 for (int j = i + 2; j < 8; j += 2)
18436 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18437 return false;
18438 }
18439 else
18440 for (int i = 0; i < num_insns; i++)
18441 {
18442 mem[i] = operands[2 * i];
18443 reg[i] = operands[2 * i + 1];
18444 }
18445
18446 /* Skip if the memory operand is by itself valid for ldp/stp. */
18447 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18448 return false;
18449
18450 for (int i = 0; i < num_insns; i++)
18451 {
18452 /* The mems cannot be volatile. */
18453 if (MEM_VOLATILE_P (mem[i]))
18454 return false;
18455
18456 /* Check if the addresses are in the form of [base+offset]. */
18457 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18458 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18459 return false;
18460 }
18461
18462 /* Check if the registers are of the same class. */
18463 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18464 ? FP_REGS : GENERAL_REGS;
18465
18466 for (int i = 1; i < num_insns; i++)
18467 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18468 {
18469 if (rclass != FP_REGS)
18470 return false;
18471 }
18472 else
18473 {
18474 if (rclass != GENERAL_REGS)
18475 return false;
18476 }
18477
18478 /* Only the last register in the order in which they occur
18479 may be clobbered by the load. */
18480 if (rclass == GENERAL_REGS && load)
18481 for (int i = 0; i < num_insns - 1; i++)
18482 if (reg_mentioned_p (reg[i], mem[i]))
18483 return false;
18484
18485 /* Check if the bases are the same. */
18486 for (int i = 0; i < num_insns - 1; i++)
18487 if (!rtx_equal_p (base[i], base[i + 1]))
18488 return false;
18489
18490 for (int i = 0; i < num_insns; i++)
18491 offvals[i] = INTVAL (offset[i]);
18492
18493 msize = GET_MODE_SIZE (mode);
18494
18495 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18496 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18497 aarch64_host_wide_int_compare);
18498
18499 if (!(offvals[1] == offvals[0] + msize
18500 && offvals[3] == offvals[2] + msize))
18501 return false;
18502
18503 /* Check that offsets are within range of each other. The ldp/stp
18504 instructions have 7-bit immediate offsets, so use 0x80. */
18505 if (offvals[2] - offvals[0] >= msize * 0x80)
18506 return false;
18507
18508 /* The offsets must be aligned with respect to each other. */
18509 if (offvals[0] % msize != offvals[2] % msize)
18510 return false;
18511
18512 /* If we have SImode and slow unaligned ldp,
18513 check that the alignment is at least 8 bytes. */
18514 if (mode == SImode
18515 && (aarch64_tune_params.extra_tuning_flags
18516 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18517 && !optimize_size
18518 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18519 return false;
18520
18521 return true;
18522 }
18523
18524 /* Given OPERANDS of consecutive load/store, this function pairs them
18525 into LDP/STP after adjusting the offset. It depends on the fact
18526 that the operands can be sorted so the offsets are correct for STP.
18527 MODE is the mode of the memory operands. CODE is the rtl operator
18528 which should be applied to all memory operands; it is SIGN_EXTEND,
18529 ZERO_EXTEND or UNKNOWN. */
18530
18531 bool
18532 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18533 scalar_mode mode, RTX_CODE code)
18534 {
18535 rtx base, offset_1, offset_3, t1, t2;
18536 rtx mem_1, mem_2, mem_3, mem_4;
18537 rtx temp_operands[8];
18538 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18539 stp_off_upper_limit, stp_off_lower_limit, msize;
18540
18541 /* We make changes on a copy as we may still bail out. */
18542 for (int i = 0; i < 8; i ++)
18543 temp_operands[i] = operands[i];
18544
18545 /* Sort the operands. */
18546 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18547
18548 if (load)
18549 {
18550 mem_1 = temp_operands[1];
18551 mem_2 = temp_operands[3];
18552 mem_3 = temp_operands[5];
18553 mem_4 = temp_operands[7];
18554 }
18555 else
18556 {
18557 mem_1 = temp_operands[0];
18558 mem_2 = temp_operands[2];
18559 mem_3 = temp_operands[4];
18560 mem_4 = temp_operands[6];
18561 gcc_assert (code == UNKNOWN);
18562 }
18563
18564 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18565 extract_base_offset_in_addr (mem_3, &base, &offset_3);
18566 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18567 && offset_3 != NULL_RTX);
18568
18569 /* Adjust offset so it can fit in LDP/STP instruction. */
18570 msize = GET_MODE_SIZE (mode);
18571 stp_off_upper_limit = msize * (0x40 - 1);
18572 stp_off_lower_limit = - msize * 0x40;
18573
18574 off_val_1 = INTVAL (offset_1);
18575 off_val_3 = INTVAL (offset_3);
18576
18577 /* The base offset is optimally halfway between the two STP/LDP offsets. */
18578 if (msize <= 4)
18579 base_off = (off_val_1 + off_val_3) / 2;
18580 else
18581 /* However, due to issues with negative LDP/STP offset generation for
18582 larger modes (DF, DI and vector modes), we must not use negative
18583 addresses smaller than 9 signed unadjusted bits can store. This
18584 provides the most range in this case. */
18585 base_off = off_val_1;
18586
18587 /* Adjust the base so that it is aligned with the addresses but still
18588 optimal. */
18589 if (base_off % msize != off_val_1 % msize)
18590 /* Fix the offset, bearing in mind we want to make it bigger not
18591 smaller. */
18592 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18593 else if (msize <= 4)
18594 /* The negative range of LDP/STP is one larger than the positive range. */
18595 base_off += msize;
18596
18597 /* Check if base offset is too big or too small. We can attempt to resolve
18598 this issue by setting it to the maximum value and seeing if the offsets
18599 still fit. */
18600 if (base_off >= 0x1000)
18601 {
18602 base_off = 0x1000 - 1;
18603 /* We must still make sure that the base offset is aligned with respect
18604 to the address. But it may not be made any bigger. */
18605 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18606 }
18607
18608 /* Likewise for the case where the base is too small. */
18609 if (base_off <= -0x1000)
18610 {
18611 base_off = -0x1000 + 1;
18612 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18613 }
18614
18615 /* Offset of the first STP/LDP. */
18616 new_off_1 = off_val_1 - base_off;
18617
18618 /* Offset of the second STP/LDP. */
18619 new_off_3 = off_val_3 - base_off;
18620
18621 /* The offsets must be within the range of the LDP/STP instructions. */
18622 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18623 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18624 return false;
18625
18626 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18627 new_off_1), true);
18628 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18629 new_off_1 + msize), true);
18630 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18631 new_off_3), true);
18632 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18633 new_off_3 + msize), true);
18634
18635 if (!aarch64_mem_pair_operand (mem_1, mode)
18636 || !aarch64_mem_pair_operand (mem_3, mode))
18637 return false;
18638
18639 if (code == ZERO_EXTEND)
18640 {
18641 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18642 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18643 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18644 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18645 }
18646 else if (code == SIGN_EXTEND)
18647 {
18648 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18649 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18650 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18651 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18652 }
18653
18654 if (load)
18655 {
18656 operands[0] = temp_operands[0];
18657 operands[1] = mem_1;
18658 operands[2] = temp_operands[2];
18659 operands[3] = mem_2;
18660 operands[4] = temp_operands[4];
18661 operands[5] = mem_3;
18662 operands[6] = temp_operands[6];
18663 operands[7] = mem_4;
18664 }
18665 else
18666 {
18667 operands[0] = mem_1;
18668 operands[1] = temp_operands[1];
18669 operands[2] = mem_2;
18670 operands[3] = temp_operands[3];
18671 operands[4] = mem_3;
18672 operands[5] = temp_operands[5];
18673 operands[6] = mem_4;
18674 operands[7] = temp_operands[7];
18675 }
18676
18677 /* Emit adjusting instruction. */
18678 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18679 /* Emit ldp/stp instructions. */
18680 t1 = gen_rtx_SET (operands[0], operands[1]);
18681 t2 = gen_rtx_SET (operands[2], operands[3]);
18682 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18683 t1 = gen_rtx_SET (operands[4], operands[5]);
18684 t2 = gen_rtx_SET (operands[6], operands[7]);
18685 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18686 return true;
18687 }
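/* Illustrative worked example: for four SImode stores at offsets 0x100,
   0x104, 0x108 and 0x10c from xb, msize is 4, so base_off starts at
   (0x100 + 0x108) / 2 = 0x104 and, being already aligned, is bumped by
   msize to 0x108.  The new offsets are then -8 and 0, well inside the
   [-256, 252] stp range for msize == 4, giving roughly (schematic):

     add  scratch, xb, #0x108
     stp  w1, w1, [scratch, #-8]
     stp  w1, w1, [scratch]
*/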
18688
18689 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
18690 it isn't worth branching around empty masked ops (including masked
18691 stores). */
18692
18693 static bool
18694 aarch64_empty_mask_is_expensive (unsigned)
18695 {
18696 return false;
18697 }
18698
18699 /* Return 1 if pseudo register should be created and used to hold
18700 GOT address for PIC code. */
18701
18702 bool
18703 aarch64_use_pseudo_pic_reg (void)
18704 {
18705 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18706 }
18707
18708 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
18709
18710 static int
18711 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18712 {
18713 switch (XINT (x, 1))
18714 {
18715 case UNSPEC_GOTSMALLPIC:
18716 case UNSPEC_GOTSMALLPIC28K:
18717 case UNSPEC_GOTTINYPIC:
18718 return 0;
18719 default:
18720 break;
18721 }
18722
18723 return default_unspec_may_trap_p (x, flags);
18724 }
18725
18726
18727 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18728 return the log2 of that value. Otherwise return -1. */
18729
18730 int
18731 aarch64_fpconst_pow_of_2 (rtx x)
18732 {
18733 const REAL_VALUE_TYPE *r;
18734
18735 if (!CONST_DOUBLE_P (x))
18736 return -1;
18737
18738 r = CONST_DOUBLE_REAL_VALUE (x);
18739
18740 if (REAL_VALUE_NEGATIVE (*r)
18741 || REAL_VALUE_ISNAN (*r)
18742 || REAL_VALUE_ISINF (*r)
18743 || !real_isinteger (r, DFmode))
18744 return -1;
18745
18746 return exact_log2 (real_to_integer (r));
18747 }
18748
18749 /* If X is a vector of equal CONST_DOUBLE values and that value is
18750 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
18751
18752 int
18753 aarch64_vec_fpconst_pow_of_2 (rtx x)
18754 {
18755 int nelts;
18756 if (GET_CODE (x) != CONST_VECTOR
18757 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18758 return -1;
18759
18760 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18761 return -1;
18762
18763 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18764 if (firstval <= 0)
18765 return -1;
18766
18767 for (int i = 1; i < nelts; i++)
18768 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18769 return -1;
18770
18771 return firstval;
18772 }
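/* Illustrative sketch: these helpers are what allow conversion patterns to
   fold a multiply by a power of two into the scaled fixed-point form of
   fcvtzs/fcvtzu, so that, for example,

     int scale_and_truncate (float x) { return (int) (x * 16.0f); }

   can be emitted as a single "fcvtzs w0, s0, #4" rather than an fmul
   followed by a plain fcvtzs.  */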
18773
18774 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18775 to float.
18776
18777 __fp16 always promotes through this hook.
18778 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18779 through the generic excess precision logic rather than here. */
18780
18781 static tree
18782 aarch64_promoted_type (const_tree t)
18783 {
18784 if (SCALAR_FLOAT_TYPE_P (t)
18785 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18786 return float_type_node;
18787
18788 return NULL_TREE;
18789 }
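/* Illustrative sketch: because of this hook, arithmetic on __fp16 is
   carried out in float and only the final result is narrowed back:

     __fp16 add_fp16 (__fp16 a, __fp16 b) { return a + b; }  // adds as float

   _Float16 is deliberately not promoted here; its evaluation width is
   handled by the excess-precision logic below.  */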
18790
18791 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
18792
18793 static bool
18794 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18795 optimization_type opt_type)
18796 {
18797 switch (op)
18798 {
18799 case rsqrt_optab:
18800 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18801
18802 default:
18803 return true;
18804 }
18805 }
18806
18807 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
18808
18809 static unsigned int
18810 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18811 int *offset)
18812 {
18813 /* Polynomial invariant 1 == (VG / 2) - 1. */
18814 gcc_assert (i == 1);
18815 *factor = 2;
18816 *offset = 1;
18817 return AARCH64_DWARF_VG;
18818 }
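/* Illustrative worked example: FACTOR == 2 and OFFSET == 1 say that
   indeterminate 1 equals VG / 2 - 1.  On a 256-bit SVE implementation
   VG (the number of 64-bit granules) is 4, so the indeterminate is 1 and
   a poly_int64 such as (16 + 16x) bytes resolves to 32 in the emitted
   DWARF expression.  */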
18819
18820 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
18821 if MODE is HFmode, and punt to the generic implementation otherwise. */
18822
18823 static bool
18824 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18825 {
18826 return (mode == HFmode
18827 ? true
18828 : default_libgcc_floating_mode_supported_p (mode));
18829 }
18830
18831 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18832 if MODE is HFmode, and punt to the generic implementation otherwise. */
18833
18834 static bool
18835 aarch64_scalar_mode_supported_p (scalar_mode mode)
18836 {
18837 return (mode == HFmode
18838 ? true
18839 : default_scalar_mode_supported_p (mode));
18840 }
18841
18842 /* Set the value of FLT_EVAL_METHOD.
18843 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18844
18845 0: evaluate all operations and constants, whose semantic type has at
18846 most the range and precision of type float, to the range and
18847 precision of float; evaluate all other operations and constants to
18848 the range and precision of the semantic type;
18849
18850 N, where _FloatN is a supported interchange floating type:
18851 evaluate all operations and constants, whose semantic type has at
18852 most the range and precision of _FloatN type, to the range and
18853 precision of the _FloatN type; evaluate all other operations and
18854 constants to the range and precision of the semantic type;
18855
18856 If we have the ARMv8.2-A extensions then we support _Float16 in native
18857 precision, so we should set this to 16. Otherwise, we support the type,
18858 but want to evaluate expressions in float precision, so set this to
18859 0. */
18860
18861 static enum flt_eval_method
18862 aarch64_excess_precision (enum excess_precision_type type)
18863 {
18864 switch (type)
18865 {
18866 case EXCESS_PRECISION_TYPE_FAST:
18867 case EXCESS_PRECISION_TYPE_STANDARD:
18868 /* We can calculate either in 16-bit range and precision or
18869 32-bit range and precision. Make that decision based on whether
18870 we have native support for the ARMv8.2-A 16-bit floating-point
18871 instructions or not. */
18872 return (TARGET_FP_F16INST
18873 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18874 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18875 case EXCESS_PRECISION_TYPE_IMPLICIT:
18876 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18877 default:
18878 gcc_unreachable ();
18879 }
18880 return FLT_EVAL_METHOD_UNPREDICTABLE;
18881 }
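/* Illustrative sketch: the user-visible effect is on code such as

     _Float16 scale (_Float16 a, _Float16 b) { return a * b; }

   With the ARMv8.2-A half-precision instructions (TARGET_FP_F16INST) the
   multiply is performed directly in _Float16; otherwise both operands are
   promoted to float and the product is converted back, as reported to the
   front end through FLT_EVAL_METHOD.  */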
18882
18883 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
18884 scheduled for speculative execution. Reject the long-running division
18885 and square-root instructions. */
18886
18887 static bool
18888 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18889 {
18890 switch (get_attr_type (insn))
18891 {
18892 case TYPE_SDIV:
18893 case TYPE_UDIV:
18894 case TYPE_FDIVS:
18895 case TYPE_FDIVD:
18896 case TYPE_FSQRTS:
18897 case TYPE_FSQRTD:
18898 case TYPE_NEON_FP_SQRT_S:
18899 case TYPE_NEON_FP_SQRT_D:
18900 case TYPE_NEON_FP_SQRT_S_Q:
18901 case TYPE_NEON_FP_SQRT_D_Q:
18902 case TYPE_NEON_FP_DIV_S:
18903 case TYPE_NEON_FP_DIV_D:
18904 case TYPE_NEON_FP_DIV_S_Q:
18905 case TYPE_NEON_FP_DIV_D_Q:
18906 return false;
18907 default:
18908 return true;
18909 }
18910 }
18911
18912 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
18913
18914 static int
18915 aarch64_compute_pressure_classes (reg_class *classes)
18916 {
18917 int i = 0;
18918 classes[i++] = GENERAL_REGS;
18919 classes[i++] = FP_REGS;
18920 /* PR_REGS isn't a useful pressure class because many predicate pseudo
18921 registers need to go in PR_LO_REGS at some point during their
18922 lifetime. Splitting it into two halves has the effect of making
18923 all predicates count against PR_LO_REGS, so that we try whenever
18924 possible to restrict the number of live predicates to 8. This
18925 greatly reduces the amount of spilling in certain loops. */
18926 classes[i++] = PR_LO_REGS;
18927 classes[i++] = PR_HI_REGS;
18928 return i;
18929 }
18930
18931 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18932
18933 static bool
18934 aarch64_can_change_mode_class (machine_mode from,
18935 machine_mode to, reg_class_t)
18936 {
18937 if (BYTES_BIG_ENDIAN)
18938 {
18939 bool from_sve_p = aarch64_sve_data_mode_p (from);
18940 bool to_sve_p = aarch64_sve_data_mode_p (to);
18941
18942 /* Don't allow changes between SVE data modes and non-SVE modes.
18943 See the comment at the head of aarch64-sve.md for details. */
18944 if (from_sve_p != to_sve_p)
18945 return false;
18946
18947 /* Don't allow changes in element size: lane 0 of the new vector
18948 would not then be lane 0 of the old vector. See the comment
18949 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18950 description.
18951
18952 In the worst case, this forces a register to be spilled in
18953 one mode and reloaded in the other, which handles the
18954 endianness correctly. */
18955 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18956 return false;
18957 }
18958 return true;
18959 }
18960
18961 /* Implement TARGET_EARLY_REMAT_MODES. */
18962
18963 static void
18964 aarch64_select_early_remat_modes (sbitmap modes)
18965 {
18966 /* SVE values are not normally live across a call, so it should be
18967 worth doing early rematerialization even in VL-specific mode. */
18968 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18969 {
18970 machine_mode mode = (machine_mode) i;
18971 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18972 if (vec_flags & VEC_ANY_SVE)
18973 bitmap_set_bit (modes, i);
18974 }
18975 }
18976
18977 /* Override the default target speculation_safe_value. */
18978 static rtx
18979 aarch64_speculation_safe_value (machine_mode mode,
18980 rtx result, rtx val, rtx failval)
18981 {
18982 /* Maybe we should warn if falling back to hard barriers. They are
18983 likely to be noticeably more expensive than the alternative below. */
18984 if (!aarch64_track_speculation)
18985 return default_speculation_safe_value (mode, result, val, failval);
18986
18987 if (!REG_P (val))
18988 val = copy_to_mode_reg (mode, val);
18989
18990 if (!aarch64_reg_or_zero (failval, mode))
18991 failval = copy_to_mode_reg (mode, failval);
18992
18993 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18994 return result;
18995 }
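/* Illustrative sketch: this hook is reached from user code such as

     int element (int *p, unsigned i, unsigned n)
     {
       if (i < n)
         return p[__builtin_speculation_safe_value (i)];
       return 0;
     }

   With -mtrack-speculation the sanitisation above is expected to use a
   conditional-select sequence keyed off the speculation-tracking state
   rather than the default full speculation barrier.  */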
18996
18997 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18998 Look into the tuning structure for an estimate.
18999 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19000 Advanced SIMD 128 bits. */
19001
19002 static HOST_WIDE_INT
19003 aarch64_estimated_poly_value (poly_int64 val)
19004 {
19005 enum aarch64_sve_vector_bits_enum width_source
19006 = aarch64_tune_params.sve_width;
19007
19008 /* If we still don't have an estimate, use the default. */
19009 if (width_source == SVE_SCALABLE)
19010 return default_estimated_poly_value (val);
19011
19012 HOST_WIDE_INT over_128 = width_source - 128;
19013 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19014 }
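/* Illustrative worked example: an SVE data vector of bytes has a
   poly_int64 size of (16 + 16x).  If the selected tuning sets sve_width
   to 256 then over_128 is 128 and the estimate is
   16 + 16 * 128 / 128 = 32 bytes, i.e. a 256-bit vector is assumed for
   costing purposes.  */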
19015
19016
19017 /* Return true for types that could be supported as SIMD return or
19018 argument types. */
19019
19020 static bool
19021 supported_simd_type (tree t)
19022 {
19023 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19024 {
19025 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19026 return s == 1 || s == 2 || s == 4 || s == 8;
19027 }
19028 return false;
19029 }
19030
19031 /* Return true for types that currently are supported as SIMD return
19032 or argument types. */
19033
19034 static bool
19035 currently_supported_simd_type (tree t, tree b)
19036 {
19037 if (COMPLEX_FLOAT_TYPE_P (t))
19038 return false;
19039
19040 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19041 return false;
19042
19043 return supported_simd_type (t);
19044 }
19045
19046 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19047
19048 static int
19049 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19050 struct cgraph_simd_clone *clonei,
19051 tree base_type, int num)
19052 {
19053 tree t, ret_type, arg_type;
19054 unsigned int elt_bits, vec_bits, count;
19055
19056 if (!TARGET_SIMD)
19057 return 0;
19058
19059 if (clonei->simdlen
19060 && (clonei->simdlen < 2
19061 || clonei->simdlen > 1024
19062 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19063 {
19064 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19065 "unsupported simdlen %d", clonei->simdlen);
19066 return 0;
19067 }
19068
19069 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19070 if (TREE_CODE (ret_type) != VOID_TYPE
19071 && !currently_supported_simd_type (ret_type, base_type))
19072 {
19073 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19074 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19075 "GCC does not currently support mixed size types "
19076 "for %<simd%> functions");
19077 else if (supported_simd_type (ret_type))
19078 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19079 "GCC does not currently support return type %qT "
19080 "for %<simd%> functions", ret_type);
19081 else
19082 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19083 "unsupported return type %qT for %<simd%> functions",
19084 ret_type);
19085 return 0;
19086 }
19087
19088 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19089 {
19090 arg_type = TREE_TYPE (t);
19091
19092 if (!currently_supported_simd_type (arg_type, base_type))
19093 {
19094 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19095 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19096 "GCC does not currently support mixed size types "
19097 "for %<simd%> functions");
19098 else
19099 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19100 "GCC does not currently support argument type %qT "
19101 "for %<simd%> functions", arg_type);
19102 return 0;
19103 }
19104 }
19105
19106 clonei->vecsize_mangle = 'n';
19107 clonei->mask_mode = VOIDmode;
19108 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19109 if (clonei->simdlen == 0)
19110 {
19111 count = 2;
19112 vec_bits = (num == 0 ? 64 : 128);
19113 clonei->simdlen = vec_bits / elt_bits;
19114 }
19115 else
19116 {
19117 count = 1;
19118 vec_bits = clonei->simdlen * elt_bits;
19119 if (vec_bits != 64 && vec_bits != 128)
19120 {
19121 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19122 "GCC does not currently support simdlen %d for type %qT",
19123 clonei->simdlen, base_type);
19124 return 0;
19125 }
19126 }
19127 clonei->vecsize_int = vec_bits;
19128 clonei->vecsize_float = vec_bits;
19129 return count;
19130 }
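/* Illustrative sketch: for a declaration such as

     #pragma omp declare simd notinbranch
     float scale (float x);

   and no explicit simdlen, the hook above requests two Advanced SIMD
   clones, a 64-bit one (simdlen 2) and a 128-bit one (simdlen 4), both
   using the 'n' ISA letter in their mangled names (e.g. _ZGVnN4v_scale
   for the 128-bit variant).  */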
19131
19132 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19133
19134 static void
19135 aarch64_simd_clone_adjust (struct cgraph_node *node)
19136 {
19137 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19138 use the correct ABI. */
19139
19140 tree t = TREE_TYPE (node->decl);
19141 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19142 TYPE_ATTRIBUTES (t));
19143 }
19144
19145 /* Implement TARGET_SIMD_CLONE_USABLE. */
19146
19147 static int
19148 aarch64_simd_clone_usable (struct cgraph_node *node)
19149 {
19150 switch (node->simdclone->vecsize_mangle)
19151 {
19152 case 'n':
19153 if (!TARGET_SIMD)
19154 return -1;
19155 return 0;
19156 default:
19157 gcc_unreachable ();
19158 }
19159 }
19160
19161 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
19162
19163 static int
19164 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19165 {
19166 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19167 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19168 return 0;
19169 return 1;
19170 }
19171
19172 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
19173
19174 static const char *
19175 aarch64_get_multilib_abi_name (void)
19176 {
19177 if (TARGET_BIG_END)
19178 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19179 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19180 }
19181
19182 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
19183 global variable based guard, use the default; otherwise
19184 return a null tree. */
19185 static tree
19186 aarch64_stack_protect_guard (void)
19187 {
19188 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19189 return default_stack_protect_guard ();
19190
19191 return NULL_TREE;
19192 }
19193
19194 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19195 section at the end if needed. */
19196 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19197 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19198 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19199 void
19200 aarch64_file_end_indicate_exec_stack ()
19201 {
19202 file_end_indicate_exec_stack ();
19203
19204 unsigned feature_1_and = 0;
19205 if (aarch64_bti_enabled ())
19206 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19207
19208 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19209 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19210
19211 if (feature_1_and)
19212 {
19213 /* Generate .note.gnu.property section. */
19214 switch_to_section (get_section (".note.gnu.property",
19215 SECTION_NOTYPE, NULL));
19216
19217 /* PT_NOTE header: namesz, descsz, type.
19218 namesz = 4 ("GNU\0")
19219 descsz = 16 (Size of the program property array)
19220 [(12 + padding) * Number of array elements]
19221 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19222 assemble_align (POINTER_SIZE);
19223 assemble_integer (GEN_INT (4), 4, 32, 1);
19224 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19225 assemble_integer (GEN_INT (5), 4, 32, 1);
19226
19227 /* PT_NOTE name. */
19228 assemble_string ("GNU", 4);
19229
19230 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19231 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19232 datasz = 4
19233 data = feature_1_and. */
19234 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19235 assemble_integer (GEN_INT (4), 4, 32, 1);
19236 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19237
19238 /* Pad the size of the note to the required alignment. */
19239 assemble_align (POINTER_SIZE);
19240 }
19241 }
19242 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19243 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19244 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
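/* Illustrative sketch: with both BTI and return-address signing enabled the
   function above emits a note roughly equivalent to the following LP64
   assembly (padding follows POINTER_SIZE):

     .section .note.gnu.property, "a"
     .align  3
     .word   4              // namesz ("GNU\0")
     .word   16             // descsz
     .word   5              // NT_GNU_PROPERTY_TYPE_0
     .asciz  "GNU"
     .word   0xc0000000     // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word   4              // datasz
     .word   3              // BTI | PAC
     .align  3
*/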
19245
19246 /* Target-specific selftests. */
19247
19248 #if CHECKING_P
19249
19250 namespace selftest {
19251
19252 /* Selftest for the RTL loader.
19253 Verify that the RTL loader copes with a dump from
19254 print_rtx_function. This is essentially just a test that class
19255 function_reader can handle a real dump, but it also verifies
19256 that lookup_reg_by_dump_name correctly handles hard regs.
19257 The presence of hard reg names in the dump means that the test is
19258 target-specific, hence it is in this file. */
19259
19260 static void
19261 aarch64_test_loading_full_dump ()
19262 {
19263 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19264
19265 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19266
19267 rtx_insn *insn_1 = get_insn_by_uid (1);
19268 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19269
19270 rtx_insn *insn_15 = get_insn_by_uid (15);
19271 ASSERT_EQ (INSN, GET_CODE (insn_15));
19272 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19273
19274 /* Verify crtl->return_rtx. */
19275 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19276 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19277 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19278 }
19279
19280 /* Run all target-specific selftests. */
19281
19282 static void
19283 aarch64_run_selftests (void)
19284 {
19285 aarch64_test_loading_full_dump ();
19286 }
19287
19288 } // namespace selftest
19289
19290 #endif /* #if CHECKING_P */
19291
19292 #undef TARGET_STACK_PROTECT_GUARD
19293 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19294
19295 #undef TARGET_ADDRESS_COST
19296 #define TARGET_ADDRESS_COST aarch64_address_cost
19297
19298 /* This hook determines whether unnamed bitfields affect the alignment
19299 of the containing structure. The hook returns true if the structure
19300 should inherit the alignment requirements of an unnamed bitfield's
19301 type. */
19302 #undef TARGET_ALIGN_ANON_BITFIELD
19303 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19304
19305 #undef TARGET_ASM_ALIGNED_DI_OP
19306 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19307
19308 #undef TARGET_ASM_ALIGNED_HI_OP
19309 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19310
19311 #undef TARGET_ASM_ALIGNED_SI_OP
19312 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19313
19314 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19315 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19316 hook_bool_const_tree_hwi_hwi_const_tree_true
19317
19318 #undef TARGET_ASM_FILE_START
19319 #define TARGET_ASM_FILE_START aarch64_start_file
19320
19321 #undef TARGET_ASM_OUTPUT_MI_THUNK
19322 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19323
19324 #undef TARGET_ASM_SELECT_RTX_SECTION
19325 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19326
19327 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19328 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19329
19330 #undef TARGET_BUILD_BUILTIN_VA_LIST
19331 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19332
19333 #undef TARGET_CALLEE_COPIES
19334 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19335
19336 #undef TARGET_CAN_ELIMINATE
19337 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19338
19339 #undef TARGET_CAN_INLINE_P
19340 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19341
19342 #undef TARGET_CANNOT_FORCE_CONST_MEM
19343 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19344
19345 #undef TARGET_CASE_VALUES_THRESHOLD
19346 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19347
19348 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19349 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19350
19351 /* Only the least significant bit is used for initialization guard
19352 variables. */
19353 #undef TARGET_CXX_GUARD_MASK_BIT
19354 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19355
19356 #undef TARGET_C_MODE_FOR_SUFFIX
19357 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19358
19359 #ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

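/* For illustration (a hypothetical user-level example): given

     struct status { volatile unsigned int ready : 1; };

   returning false means an access to READY is expected to use the declared
   "unsigned int" container width rather than the narrowest machine mode
   that covers the bit-field.  */
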
#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

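/* Sketch of the window chosen above, assuming byte-sized anchor-relative
   accesses (the symbol name is hypothetical): the anchor is materialised
   once and then used as a base register, e.g.

     adrp  x0, anchor
     add   x0, x0, :lo12:anchor
     ldrb  w1, [x0, #4095]      // unsigned scaled 12-bit offset: 0..4095
     ldurb w2, [x0, #-256]      // signed unscaled 9-bit offset: -256..255

   which matches the [-256, 4095] range given by the two hooks above.  */
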
#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive
#undef TARGET_PREFERRED_ELSE_VALUE
#define TARGET_PREFERRED_ELSE_VALUE \
  aarch64_preferred_else_value

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

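/* Roughly speaking, the value is a power of two naming the spare address
   bit used by GCC's generic descriptor support for nested functions, so 4
   selects bit 2; an indirect call can then test that bit at run time to
   distinguish a pointer to a descriptor from an ordinary code address.  */
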
#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
#define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
  aarch64_remove_extra_call_preserved_regs

#undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
#define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
  aarch64_return_call_with_max_clobbers

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
#define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
  aarch64_stack_clash_protection_alloca_probe_range

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#undef TARGET_SPECULATION_SAFE_VALUE
#define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value

#undef TARGET_ESTIMATED_POLY_VALUE
#define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table

#undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
#define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
  aarch64_simd_clone_compute_vecsize_and_simdlen

#undef TARGET_SIMD_CLONE_ADJUST
#define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust

#undef TARGET_SIMD_CLONE_USABLE
#define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable

#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes

#undef TARGET_GET_MULTILIB_ABI_NAME
#define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

#undef TARGET_ASM_POST_CFI_STARTPROC
#define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc

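/* TARGET_INITIALIZER (from target-def.h) is only expanded here, so it picks
   up every TARGET_* override defined above and fills in the documented
   defaults for the hooks this file does not override.  */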
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"