[Aarch64] Fix vec_perm cost for thunderx2t99
[thirdparty/gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
87 {
88 enum insn_type { MOV, MVN, INDEX, PTRUE };
89 enum modifier_type { LSL, MSL };
90
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode, rtx);
93 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
94 insn_type = MOV, modifier_type = LSL,
95 unsigned int = 0);
96 simd_immediate_info (scalar_mode, rtx, rtx);
97 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98
99 /* The mode of the elements. */
100 scalar_mode elt_mode;
101
102 /* The instruction to use to move the immediate into a vector. */
103 insn_type insn;
104
105 union
106 {
107 /* For MOV and MVN. */
108 struct
109 {
110 /* The value of each element. */
111 rtx value;
112
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier;
116 unsigned int shift;
117 } mov;
118
119 /* For INDEX. */
120 struct
121 {
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
124 rtx base, step;
125 } index;
126
127 /* For PTRUE. */
128 aarch64_svpattern pattern;
129 } u;
130 };
131
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
136 : elt_mode (elt_mode_in), insn (MOV)
137 {
138 u.mov.value = value_in;
139 u.mov.modifier = LSL;
140 u.mov.shift = 0;
141 }
142
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
145 fields. */
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in,
148 unsigned HOST_WIDE_INT value_in,
149 insn_type insn_in, modifier_type modifier_in,
150 unsigned int shift_in)
151 : elt_mode (elt_mode_in), insn (insn_in)
152 {
153 u.mov.value = gen_int_mode (value_in, elt_mode_in);
154 u.mov.modifier = modifier_in;
155 u.mov.shift = shift_in;
156 }
157
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
162 : elt_mode (elt_mode_in), insn (INDEX)
163 {
164 u.index.base = base_in;
165 u.index.step = step_in;
166 }
167
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in,
172 aarch64_svpattern pattern_in)
173 : elt_mode (elt_mode_in), insn (PTRUE)
174 {
175 u.pattern = pattern_in;
176 }
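/* For example, a vector whose 16-bit elements all equal 0x2a << 8 could be
   described as
     simd_immediate_info (HImode, 0x2a, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8)
   while an SVE INDEX constant { 0, 2, 4, ... } of 32-bit elements would use
   the (base, step) constructor, roughly
     simd_immediate_info (SImode, const0_rtx, gen_int_mode (2, SImode)).
   The values here are purely illustrative.  */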
177
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel;
180
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg;
183
184 #ifdef HAVE_AS_TLS
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
187 #endif
188
189 static bool aarch64_composite_type_p (const_tree, machine_mode);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
191 const_tree,
192 machine_mode *, int *,
193 bool *);
194 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode);
198 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
200 const_tree type,
201 int misalignment,
202 bool is_packed);
203 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
204 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
205 aarch64_addr_query_type);
206 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version;
210
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune = cortexa53;
213
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags = 0;
216
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads;
219
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer;
222
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string = NULL;
225
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
228
229 /* Support for command line parsing of boolean flags in the tuning
230 structures. */
231 struct aarch64_flag_desc
232 {
233 const char* name;
234 unsigned int flag;
235 };
236
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 {
241 { "none", AARCH64_FUSE_NOTHING },
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL },
244 { NULL, AARCH64_FUSE_NOTHING }
245 };
246
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 {
251 { "none", AARCH64_EXTRA_TUNE_NONE },
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL },
254 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 };
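/* Each entry of the .def files included above expands to one row of these
   tables: AARCH64_FUSION_PAIR ("name", IDENT) becomes
   { "name", AARCH64_FUSE_IDENT }, and likewise for the extra tuning options.
   The name strings are what the fuse=... and tune=... parsers registered
   later in this file (for -moverride) look up.  */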
256
257 /* Tuning parameters. */
258
259 static const struct cpu_addrcost_table generic_addrcost_table =
260 {
261 {
262 1, /* hi */
263 0, /* si */
264 0, /* di */
265 1, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
272 0 /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 {
277 {
278 0, /* hi */
279 0, /* si */
280 0, /* di */
281 2, /* ti */
282 },
283 0, /* pre_modify */
284 0, /* post_modify */
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 {
293 {
294 1, /* hi */
295 0, /* si */
296 0, /* di */
297 1, /* ti */
298 },
299 1, /* pre_modify */
300 1, /* post_modify */
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 {
309 {
310 1, /* hi */
311 1, /* si */
312 1, /* di */
313 2, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 {
325 {
326 1, /* hi */
327 0, /* si */
328 0, /* di */
329 1, /* ti */
330 },
331 0, /* pre_modify */
332 0, /* post_modify */
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
336 0, /* imm_offset */
337 };
338
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 {
341 {
342 1, /* hi */
343 1, /* si */
344 1, /* di */
345 2, /* ti */
346 },
347 1, /* pre_modify */
348 1, /* post_modify */
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
352 2, /* imm_offset */
353 };
354
355 static const struct cpu_regmove_cost generic_regmove_cost =
356 {
357 1, /* GP2GP */
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
360 5, /* GP2FP */
361 5, /* FP2GP */
362 2 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 5, /* GP2FP */
371 5, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 {
377 1, /* GP2GP */
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
380 5, /* GP2FP */
381 5, /* FP2GP */
382 2 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (the actual costs are 4 and 9). */
390 9, /* GP2FP */
391 9, /* FP2GP */
392 1 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 {
397 2, /* GP2GP */
398 2, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 2 /* FP2FP */
411 };
412
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 {
415 2, /* GP2GP */
416 /* Avoid the use of int<->fp moves for spilling. */
417 6, /* GP2FP */
418 6, /* FP2GP */
419 4 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of int<->fp moves for spilling. */
426 8, /* GP2FP */
427 8, /* FP2GP */
428 4 /* FP2FP */
429 };
430
431 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 {
433 1, /* GP2GP */
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
436 2, /* GP2FP */
437 3, /* FP2GP */
438 2 /* FP2FP */
439 };
440
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost =
443 {
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
459 };
460
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 {
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
479 };
480
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost =
483 {
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
499 };
500
501 static const struct cpu_vector_cost tsv110_vector_cost =
502 {
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
518 };
519
520 /* Costs for vector insn classes for Cortex-A57. */
521 static const struct cpu_vector_cost cortexa57_vector_cost =
522 {
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
538 };
539
540 static const struct cpu_vector_cost exynosm1_vector_cost =
541 {
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
557 };
558
559 /* Costs for vector insn classes for X-Gene 1. */
560 static const struct cpu_vector_cost xgene1_vector_cost =
561 {
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
577 };
578
579 /* Costs for vector insn classes for ThunderX2 T99 (formerly Vulcan). */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 {
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 10, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
597 };
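/* These per-core tables are unitless weights consumed by the vectorization
   cost hooks; for instance, in the ThunderX2 T99 table above a vector
   permute (cost 10) is rated at twice a vector integer statement (cost 5),
   which biases the vectorizer against permute-heavy sequences on that
   core.  */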
598
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost =
601 {
602 1, /* Predictable. */
603 3 /* Unpredictable. */
604 };
605
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_NONE /* recip_sqrt */
612 };
613
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes =
616 {
617 AARCH64_APPROX_NONE, /* division */
618 AARCH64_APPROX_ALL, /* sqrt */
619 AARCH64_APPROX_ALL /* recip_sqrt */
620 };
621
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes =
624 {
625 AARCH64_APPROX_NONE, /* division */
626 AARCH64_APPROX_NONE, /* sqrt */
627 AARCH64_APPROX_ALL /* recip_sqrt */
628 };
629
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune =
632 {
633 0, /* num_slots */
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
640 };
641
642 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 {
644 0, /* num_slots */
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
651 };
652
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 {
655 4, /* num_slots */
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
662 };
663
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 {
666 8, /* num_slots */
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
673 };
674
675 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 {
677 8, /* num_slots */
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
684 };
685
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 {
688 8, /* num_slots */
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
695 };
696
697 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 {
699 0, /* num_slots */
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
706 };
707
708 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 {
710 8, /* num_slots */
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
717 };
718
719 static const struct tune_params generic_tunings =
720 {
721 &cortexa57_extra_costs,
722 &generic_addrcost_table,
723 &generic_regmove_cost,
724 &generic_vector_cost,
725 &generic_branch_cost,
726 &generic_approx_modes,
727 SVE_NOT_IMPLEMENTED, /* sve_width */
728 4, /* memmov_cost */
729 2, /* issue_rate */
730 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
742 &generic_prefetch_tune
743 };
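/* The function_align/jump_align/loop_align strings use the same N:M syntax
   as the -falign-* options, so "16:12" above requests 16-byte alignment
   with a maximum skip of 12 padding bytes.  */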
744
745 static const struct tune_params cortexa35_tunings =
746 {
747 &cortexa53_extra_costs,
748 &generic_addrcost_table,
749 &cortexa53_regmove_cost,
750 &generic_vector_cost,
751 &generic_branch_cost,
752 &generic_approx_modes,
753 SVE_NOT_IMPLEMENTED, /* sve_width */
754 4, /* memmov_cost */
755 1, /* issue_rate */
756 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
769 &generic_prefetch_tune
770 };
771
772 static const struct tune_params cortexa53_tunings =
773 {
774 &cortexa53_extra_costs,
775 &generic_addrcost_table,
776 &cortexa53_regmove_cost,
777 &generic_vector_cost,
778 &generic_branch_cost,
779 &generic_approx_modes,
780 SVE_NOT_IMPLEMENTED, /* sve_width */
781 4, /* memmov_cost */
782 2, /* issue_rate */
783 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params cortexa57_tunings =
800 {
801 &cortexa57_extra_costs,
802 &generic_addrcost_table,
803 &cortexa57_regmove_cost,
804 &cortexa57_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 SVE_NOT_IMPLEMENTED, /* sve_width */
808 4, /* memmov_cost */
809 3, /* issue_rate */
810 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
823 &generic_prefetch_tune
824 };
825
826 static const struct tune_params cortexa72_tunings =
827 {
828 &cortexa57_extra_costs,
829 &generic_addrcost_table,
830 &cortexa57_regmove_cost,
831 &cortexa57_vector_cost,
832 &generic_branch_cost,
833 &generic_approx_modes,
834 SVE_NOT_IMPLEMENTED, /* sve_width */
835 4, /* memmov_cost */
836 3, /* issue_rate */
837 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
851 };
852
853 static const struct tune_params cortexa73_tunings =
854 {
855 &cortexa57_extra_costs,
856 &generic_addrcost_table,
857 &cortexa57_regmove_cost,
858 &cortexa57_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 SVE_NOT_IMPLEMENTED, /* sve_width */
862 4, /* memmov_cost. */
863 2, /* issue_rate. */
864 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &generic_prefetch_tune
878 };
879
880
881
882 static const struct tune_params exynosm1_tunings =
883 {
884 &exynosm1_extra_costs,
885 &exynosm1_addrcost_table,
886 &exynosm1_regmove_cost,
887 &exynosm1_vector_cost,
888 &generic_branch_cost,
889 &exynosm1_approx_modes,
890 SVE_NOT_IMPLEMENTED, /* sve_width */
891 4, /* memmov_cost */
892 3, /* issue_rate */
893 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
905 &exynosm1_prefetch_tune
906 };
907
908 static const struct tune_params thunderxt88_tunings =
909 {
910 &thunderx_extra_costs,
911 &generic_addrcost_table,
912 &thunderx_regmove_cost,
913 &thunderx_vector_cost,
914 &generic_branch_cost,
915 &generic_approx_modes,
916 SVE_NOT_IMPLEMENTED, /* sve_width */
917 6, /* memmov_cost */
918 2, /* issue_rate */
919 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
931 &thunderxt88_prefetch_tune
932 };
933
934 static const struct tune_params thunderx_tunings =
935 {
936 &thunderx_extra_costs,
937 &generic_addrcost_table,
938 &thunderx_regmove_cost,
939 &thunderx_vector_cost,
940 &generic_branch_cost,
941 &generic_approx_modes,
942 SVE_NOT_IMPLEMENTED, /* sve_width */
943 6, /* memmov_cost */
944 2, /* issue_rate */
945 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
958 &thunderx_prefetch_tune
959 };
960
961 static const struct tune_params tsv110_tunings =
962 {
963 &tsv110_extra_costs,
964 &tsv110_addrcost_table,
965 &tsv110_regmove_cost,
966 &tsv110_vector_cost,
967 &generic_branch_cost,
968 &generic_approx_modes,
969 SVE_NOT_IMPLEMENTED, /* sve_width */
970 4, /* memmov_cost */
971 4, /* issue_rate */
972 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
985 &tsv110_prefetch_tune
986 };
987
988 static const struct tune_params xgene1_tunings =
989 {
990 &xgene1_extra_costs,
991 &xgene1_addrcost_table,
992 &xgene1_regmove_cost,
993 &xgene1_vector_cost,
994 &generic_branch_cost,
995 &xgene1_approx_modes,
996 SVE_NOT_IMPLEMENTED, /* sve_width */
997 6, /* memmov_cost */
998 4, /* issue_rate */
999 AARCH64_FUSE_NOTHING, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1011 &xgene1_prefetch_tune
1012 };
1013
1014 static const struct tune_params emag_tunings =
1015 {
1016 &xgene1_extra_costs,
1017 &xgene1_addrcost_table,
1018 &xgene1_regmove_cost,
1019 &xgene1_vector_cost,
1020 &generic_branch_cost,
1021 &xgene1_approx_modes,
1022 SVE_NOT_IMPLEMENTED,
1023 6, /* memmov_cost */
1024 4, /* issue_rate */
1025 AARCH64_FUSE_NOTHING, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1037 &xgene1_prefetch_tune
1038 };
1039
1040 static const struct tune_params qdf24xx_tunings =
1041 {
1042 &qdf24xx_extra_costs,
1043 &qdf24xx_addrcost_table,
1044 &qdf24xx_regmove_cost,
1045 &qdf24xx_vector_cost,
1046 &generic_branch_cost,
1047 &generic_approx_modes,
1048 SVE_NOT_IMPLEMENTED, /* sve_width */
1049 4, /* memmov_cost */
1050 4, /* issue_rate */
1051 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1065 };
1066
1067 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 for now. */
1069 static const struct tune_params saphira_tunings =
1070 {
1071 &generic_extra_costs,
1072 &generic_addrcost_table,
1073 &generic_regmove_cost,
1074 &generic_vector_cost,
1075 &generic_branch_cost,
1076 &generic_approx_modes,
1077 SVE_NOT_IMPLEMENTED, /* sve_width */
1078 4, /* memmov_cost */
1079 4, /* issue_rate */
1080 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1093 &generic_prefetch_tune
1094 };
1095
1096 static const struct tune_params thunderx2t99_tunings =
1097 {
1098 &thunderx2t99_extra_costs,
1099 &thunderx2t99_addrcost_table,
1100 &thunderx2t99_regmove_cost,
1101 &thunderx2t99_vector_cost,
1102 &generic_branch_cost,
1103 &generic_approx_modes,
1104 SVE_NOT_IMPLEMENTED, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1121 };
1122
1123 static const struct tune_params neoversen1_tunings =
1124 {
1125 &cortexa57_extra_costs,
1126 &generic_addrcost_table,
1127 &generic_regmove_cost,
1128 &cortexa57_vector_cost,
1129 &generic_branch_cost,
1130 &generic_approx_modes,
1131 SVE_NOT_IMPLEMENTED, /* sve_width */
1132 4, /* memmov_cost */
1133 3, /* issue_rate */
1134 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1146 &generic_prefetch_tune
1147 };
1148
1149 /* Support for fine-grained override of the tuning structures. */
1150 struct aarch64_tuning_override_function
1151 {
1152 const char* name;
1153 void (*parse_override)(const char*, struct tune_params*);
1154 };
1155
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions[] =
1162 {
1163 { "fuse", aarch64_parse_fuse_string },
1164 { "tune", aarch64_parse_tune_string },
1165 { "sve_width", aarch64_parse_sve_width_string },
1166 { NULL, NULL }
1167 };
1168
1169 /* A processor implementing AArch64. */
1170 struct processor
1171 {
1172 const char *const name;
1173 enum aarch64_processor ident;
1174 enum aarch64_processor sched_core;
1175 enum aarch64_arch arch;
1176 unsigned architecture_version;
1177 const uint64_t flags;
1178 const struct tune_params *const tune;
1179 };
1180
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures[] =
1183 {
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 };
1189
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores[] =
1192 {
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1199 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1200 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1201 };
1202
1203
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor *selected_arch;
1207 static const struct processor *selected_cpu;
1208 static const struct processor *selected_tune;
1209
1210 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params = generic_tunings;
1214
1215 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1216
1217 static tree
1218 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1219 int, bool *no_add_attrs)
1220 {
1221 /* Since we set fn_type_req to true, the caller should have checked
1222 this for us. */
1223 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1224 switch ((arm_pcs) fntype_abi (*node).id ())
1225 {
1226 case ARM_PCS_AAPCS64:
1227 case ARM_PCS_SIMD:
1228 return NULL_TREE;
1229
1230 case ARM_PCS_SVE:
1231 error ("the %qE attribute cannot be applied to an SVE function type",
1232 name);
1233 *no_add_attrs = true;
1234 return NULL_TREE;
1235
1236 case ARM_PCS_TLSDESC:
1237 case ARM_PCS_UNKNOWN:
1238 break;
1239 }
1240 gcc_unreachable ();
1241 }
1242
1243 /* Table of machine attributes. */
1244 static const struct attribute_spec aarch64_attribute_table[] =
1245 {
1246 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1247 affects_type_identity, handler, exclude } */
1248 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1249 handle_aarch64_vector_pcs_attribute, NULL },
1250 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1251 };
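/* The attribute is used on function declarations or types in the usual way,
   e.g.
     void f (float *) __attribute__ ((aarch64_vector_pcs));
   which selects the ARM_PCS_SIMD variant handled by the check above.  */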
1252
1253 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1254
1255 /* An ISA extension in the co-processor and main instruction set space. */
1256 struct aarch64_option_extension
1257 {
1258 const char *const name;
1259 const unsigned long flags_on;
1260 const unsigned long flags_off;
1261 };
1262
1263 typedef enum aarch64_cond_code
1264 {
1265 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1266 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1267 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1268 }
1269 aarch64_cc;
1270
1271 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
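/* Flipping the low bit pairs each condition with its inverse in the
   enumeration above, e.g. AARCH64_EQ (0) <-> AARCH64_NE (1) and
   AARCH64_GE (10) <-> AARCH64_LT (11).  */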
1272
1273 struct aarch64_branch_protect_type
1274 {
1275 /* The type's name that the user passes to the branch-protection option
1276 string. */
1277 const char* name;
1278 /* Function to handle the protection type and set global variables.
1279 First argument is the string token corresponding with this type and the
1280 second argument is the next token in the option string.
1281 Return values:
1282 * AARCH64_PARSE_OK: Handling was successful.
1283 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1284 should print an error.
1285 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1286 own error. */
1287 enum aarch64_parse_opt_result (*handler)(char*, char*);
1288 /* A list of types that can follow this type in the option string. */
1289 const aarch64_branch_protect_type* subtypes;
1290 unsigned int num_subtypes;
1291 };
1292
1293 static enum aarch64_parse_opt_result
1294 aarch64_handle_no_branch_protection (char* str, char* rest)
1295 {
1296 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1297 aarch64_enable_bti = 0;
1298 if (rest)
1299 {
1300 error ("unexpected %<%s%> after %<%s%>", rest, str);
1301 return AARCH64_PARSE_INVALID_FEATURE;
1302 }
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_standard_branch_protection (char* str, char* rest)
1308 {
1309 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1310 aarch64_ra_sign_key = AARCH64_KEY_A;
1311 aarch64_enable_bti = 1;
1312 if (rest)
1313 {
1314 error ("unexpected %<%s%> after %<%s%>", rest, str);
1315 return AARCH64_PARSE_INVALID_FEATURE;
1316 }
1317 return AARCH64_PARSE_OK;
1318 }
1319
1320 static enum aarch64_parse_opt_result
1321 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1322 char* rest ATTRIBUTE_UNUSED)
1323 {
1324 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1325 aarch64_ra_sign_key = AARCH64_KEY_A;
1326 return AARCH64_PARSE_OK;
1327 }
1328
1329 static enum aarch64_parse_opt_result
1330 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1331 char* rest ATTRIBUTE_UNUSED)
1332 {
1333 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1334 return AARCH64_PARSE_OK;
1335 }
1336
1337 static enum aarch64_parse_opt_result
1338 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1339 char* rest ATTRIBUTE_UNUSED)
1340 {
1341 aarch64_ra_sign_key = AARCH64_KEY_B;
1342 return AARCH64_PARSE_OK;
1343 }
1344
1345 static enum aarch64_parse_opt_result
1346 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1347 char* rest ATTRIBUTE_UNUSED)
1348 {
1349 aarch64_enable_bti = 1;
1350 return AARCH64_PARSE_OK;
1351 }
1352
1353 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1354 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1355 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1356 { NULL, NULL, NULL, 0 }
1357 };
1358
1359 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1360 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1361 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1362 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1363 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1364 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1365 { NULL, NULL, NULL, 0 }
1366 };
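/* As an example of how these tables drive parsing: for
   -mbranch-protection=pac-ret+leaf+b-key, the "pac-ret" handler sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF with the A key, then
   the "leaf" subtype widens the scope to AARCH64_FUNCTION_ALL and "b-key"
   switches the signing key to AARCH64_KEY_B.  */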
1367
1368 /* The condition codes of the processor, and the inverse function. */
1369 static const char * const aarch64_condition_codes[] =
1370 {
1371 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1372 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1373 };
1374
1375 /* The preferred condition codes for SVE conditions. */
1376 static const char *const aarch64_sve_condition_codes[] =
1377 {
1378 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1379 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1380 };
1381
1382 /* Return the assembly token for svpattern value PATTERN. */
1383
1384 static const char *
1385 svpattern_token (enum aarch64_svpattern pattern)
1386 {
1387 switch (pattern)
1388 {
1389 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1390 AARCH64_FOR_SVPATTERN (CASE)
1391 #undef CASE
1392 case AARCH64_NUM_SVPATTERNS:
1393 break;
1394 }
1395 gcc_unreachable ();
1396 }
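/* Each (UPPER, LOWER, VALUE) entry of AARCH64_FOR_SVPATTERN expands to a
   case returning the lower-case assembly spelling, so e.g. AARCH64_SV_ALL
   prints as "all".  */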
1397
1398 /* Return the descriptor of the SIMD ABI. */
1399
1400 static const predefined_function_abi &
1401 aarch64_simd_abi (void)
1402 {
1403 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1404 if (!simd_abi.initialized_p ())
1405 {
1406 HARD_REG_SET full_reg_clobbers
1407 = default_function_abi.full_reg_clobbers ();
1408 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1409 if (FP_SIMD_SAVED_REGNUM_P (regno))
1410 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1411 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1412 }
1413 return simd_abi;
1414 }
1415
1416 /* Return the descriptor of the SVE PCS. */
1417
1418 static const predefined_function_abi &
1419 aarch64_sve_abi (void)
1420 {
1421 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1422 if (!sve_abi.initialized_p ())
1423 {
1424 HARD_REG_SET full_reg_clobbers
1425 = default_function_abi.full_reg_clobbers ();
1426 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1427 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1428 for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
1429 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1430 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1431 }
1432 return sve_abi;
1433 }
1434
1435 /* Generate code to enable conditional branches in functions over 1 MiB. */
1436 const char *
1437 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1438 const char * branch_format)
1439 {
1440 rtx_code_label * tmp_label = gen_label_rtx ();
1441 char label_buf[256];
1442 char buffer[128];
1443 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1444 CODE_LABEL_NUMBER (tmp_label));
1445 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1446 rtx dest_label = operands[pos_label];
1447 operands[pos_label] = tmp_label;
1448
1449 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1450 output_asm_insn (buffer, operands);
1451
1452 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1453 operands[pos_label] = dest_label;
1454 output_asm_insn (buffer, operands);
1455 return "";
1456 }
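/* The effect is to replace a conditional branch whose target lies outside
   the +/-1 MiB conditional-branch range with a short branch around an
   unconditional "b", which reaches +/-128 MiB; roughly:

       <inverted conditional branch>  .Llocal
       b       <original destination>
   .Llocal:

   The caller supplies the already-inverted condition in BRANCH_FORMAT.  */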
1457
1458 void
1459 aarch64_err_no_fpadvsimd (machine_mode mode)
1460 {
1461 if (TARGET_GENERAL_REGS_ONLY)
1462 if (FLOAT_MODE_P (mode))
1463 error ("%qs is incompatible with the use of floating-point types",
1464 "-mgeneral-regs-only");
1465 else
1466 error ("%qs is incompatible with the use of vector types",
1467 "-mgeneral-regs-only");
1468 else
1469 if (FLOAT_MODE_P (mode))
1470 error ("%qs feature modifier is incompatible with the use of"
1471 " floating-point types", "+nofp");
1472 else
1473 error ("%qs feature modifier is incompatible with the use of"
1474 " vector types", "+nofp");
1475 }
1476
1477 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1478 registers. */
1479 inline bool
1480 pr_or_ffr_regnum_p (unsigned int regno)
1481 {
1482 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1483 }
1484
1485 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1486 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1487 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1488 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1489 and GENERAL_REGS is lower than the memory cost (in this case the best class
1490 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1491 cost results in bad allocations with many redundant int<->FP moves which
1492 are expensive on various cores.
1493 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1494 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1495 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1496 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1497 The result of this is that it is no longer inefficient to have a higher
1498 memory move cost than the register move cost.
1499 */
1500
1501 static reg_class_t
1502 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1503 reg_class_t best_class)
1504 {
1505 machine_mode mode;
1506
1507 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1508 || !reg_class_subset_p (FP_REGS, allocno_class))
1509 return allocno_class;
1510
1511 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1512 || !reg_class_subset_p (FP_REGS, best_class))
1513 return best_class;
1514
1515 mode = PSEUDO_REGNO_MODE (regno);
1516 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1517 }
1518
1519 static unsigned int
1520 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1521 {
1522 if (GET_MODE_UNIT_SIZE (mode) == 4)
1523 return aarch64_tune_params.min_div_recip_mul_sf;
1524 return aarch64_tune_params.min_div_recip_mul_df;
1525 }
1526
1527 /* Return the reassociation width of treeop OPC with mode MODE. */
1528 static int
1529 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1530 {
1531 if (VECTOR_MODE_P (mode))
1532 return aarch64_tune_params.vec_reassoc_width;
1533 if (INTEGRAL_MODE_P (mode))
1534 return aarch64_tune_params.int_reassoc_width;
1535 /* Avoid reassociating floating point addition so we emit more FMAs. */
1536 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1537 return aarch64_tune_params.fp_reassoc_width;
1538 return 1;
1539 }
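/* With generic_tunings this yields a width of 2 for integer modes and 4 for
   floating-point multiplies, while floating-point additions fall through to
   the final return and keep a width of 1, leaving them in a shape the
   FMA-forming passes can use.  */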
1540
1541 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1542 unsigned
1543 aarch64_dbx_register_number (unsigned regno)
1544 {
1545 if (GP_REGNUM_P (regno))
1546 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1547 else if (regno == SP_REGNUM)
1548 return AARCH64_DWARF_SP;
1549 else if (FP_REGNUM_P (regno))
1550 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1551 else if (PR_REGNUM_P (regno))
1552 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1553 else if (regno == VG_REGNUM)
1554 return AARCH64_DWARF_VG;
1555
1556 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1557 equivalent DWARF register. */
1558 return DWARF_FRAME_REGISTERS;
1559 }
1560
1561 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1562 integer, otherwise return X unmodified. */
1563 static rtx
1564 aarch64_bit_representation (rtx x)
1565 {
1566 if (CONST_DOUBLE_P (x))
1567 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1568 return x;
1569 }
1570
1571 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1572 static bool
1573 aarch64_advsimd_struct_mode_p (machine_mode mode)
1574 {
1575 return (TARGET_SIMD
1576 && (mode == OImode || mode == CImode || mode == XImode));
1577 }
1578
1579 /* Return true if MODE is an SVE predicate mode. */
1580 static bool
1581 aarch64_sve_pred_mode_p (machine_mode mode)
1582 {
1583 return (TARGET_SVE
1584 && (mode == VNx16BImode
1585 || mode == VNx8BImode
1586 || mode == VNx4BImode
1587 || mode == VNx2BImode));
1588 }
1589
1590 /* Three mutually-exclusive flags describing a vector or predicate type. */
1591 const unsigned int VEC_ADVSIMD = 1;
1592 const unsigned int VEC_SVE_DATA = 2;
1593 const unsigned int VEC_SVE_PRED = 4;
1594 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1595 a structure of 2, 3 or 4 vectors. */
1596 const unsigned int VEC_STRUCT = 8;
1597 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1598 vector has fewer significant bytes than a full SVE vector. */
1599 const unsigned int VEC_PARTIAL = 16;
1600 /* Useful combinations of the above. */
1601 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1602 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1603
1604 /* Return a set of flags describing the vector properties of mode MODE.
1605 Ignore modes that are not supported by the current target. */
1606 static unsigned int
1607 aarch64_classify_vector_mode (machine_mode mode)
1608 {
1609 if (aarch64_advsimd_struct_mode_p (mode))
1610 return VEC_ADVSIMD | VEC_STRUCT;
1611
1612 if (aarch64_sve_pred_mode_p (mode))
1613 return VEC_SVE_PRED;
1614
1615 /* Make the decision based on the mode's enum value rather than its
1616 properties, so that we keep the correct classification regardless
1617 of -msve-vector-bits. */
1618 switch (mode)
1619 {
1620 /* Partial SVE QI vectors. */
1621 case E_VNx2QImode:
1622 case E_VNx4QImode:
1623 case E_VNx8QImode:
1624 /* Partial SVE HI vectors. */
1625 case E_VNx2HImode:
1626 case E_VNx4HImode:
1627 /* Partial SVE SI vector. */
1628 case E_VNx2SImode:
1629 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1630
1631 case E_VNx16QImode:
1632 case E_VNx8HImode:
1633 case E_VNx4SImode:
1634 case E_VNx2DImode:
1635 case E_VNx8HFmode:
1636 case E_VNx4SFmode:
1637 case E_VNx2DFmode:
1638 return TARGET_SVE ? VEC_SVE_DATA : 0;
1639
1640 /* x2 SVE vectors. */
1641 case E_VNx32QImode:
1642 case E_VNx16HImode:
1643 case E_VNx8SImode:
1644 case E_VNx4DImode:
1645 case E_VNx16HFmode:
1646 case E_VNx8SFmode:
1647 case E_VNx4DFmode:
1648 /* x3 SVE vectors. */
1649 case E_VNx48QImode:
1650 case E_VNx24HImode:
1651 case E_VNx12SImode:
1652 case E_VNx6DImode:
1653 case E_VNx24HFmode:
1654 case E_VNx12SFmode:
1655 case E_VNx6DFmode:
1656 /* x4 SVE vectors. */
1657 case E_VNx64QImode:
1658 case E_VNx32HImode:
1659 case E_VNx16SImode:
1660 case E_VNx8DImode:
1661 case E_VNx32HFmode:
1662 case E_VNx16SFmode:
1663 case E_VNx8DFmode:
1664 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1665
1666 /* 64-bit Advanced SIMD vectors. */
1667 case E_V8QImode:
1668 case E_V4HImode:
1669 case E_V2SImode:
1670 /* ...E_V1DImode doesn't exist. */
1671 case E_V4HFmode:
1672 case E_V2SFmode:
1673 case E_V1DFmode:
1674 /* 128-bit Advanced SIMD vectors. */
1675 case E_V16QImode:
1676 case E_V8HImode:
1677 case E_V4SImode:
1678 case E_V2DImode:
1679 case E_V8HFmode:
1680 case E_V4SFmode:
1681 case E_V2DFmode:
1682 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1683
1684 default:
1685 return 0;
1686 }
1687 }
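/* For example, assuming the relevant target features are enabled:
   V4SImode classifies as VEC_ADVSIMD, VNx4SImode as VEC_SVE_DATA,
   VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL, and the x2 tuple mode
   VNx8SImode as VEC_SVE_DATA | VEC_STRUCT.  */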
1688
1689 /* Return true if MODE is any of the data vector modes, including
1690 structure modes. */
1691 static bool
1692 aarch64_vector_data_mode_p (machine_mode mode)
1693 {
1694 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1695 }
1696
1697 /* Return true if MODE is any form of SVE mode, including predicates,
1698 vectors and structures. */
1699 bool
1700 aarch64_sve_mode_p (machine_mode mode)
1701 {
1702 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1703 }
1704
1705 /* Return true if MODE is an SVE data vector mode; either a single vector
1706 or a structure of vectors. */
1707 static bool
1708 aarch64_sve_data_mode_p (machine_mode mode)
1709 {
1710 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1711 }
1712
1713 /* Return the number of defined bytes in one constituent vector of
1714 SVE mode MODE, which has vector flags VEC_FLAGS. */
1715 static poly_int64
1716 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1717 {
1718 if (vec_flags & VEC_PARTIAL)
1719 /* A single partial vector. */
1720 return GET_MODE_SIZE (mode);
1721
1722 if (vec_flags & VEC_SVE_DATA)
1723 /* A single vector or a tuple. */
1724 return BYTES_PER_SVE_VECTOR;
1725
1726 /* A single predicate. */
1727 gcc_assert (vec_flags & VEC_SVE_PRED);
1728 return BYTES_PER_SVE_PRED;
1729 }
1730
1731 /* Implement target hook TARGET_ARRAY_MODE. */
1732 static opt_machine_mode
1733 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1734 {
1735 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1736 && IN_RANGE (nelems, 2, 4))
1737 return mode_for_vector (GET_MODE_INNER (mode),
1738 GET_MODE_NUNITS (mode) * nelems);
1739
1740 return opt_machine_mode ();
1741 }
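/* E.g. an array of three VNx4SImode vectors is given mode VNx12SImode,
   one of the SVE structure modes listed in aarch64_classify_vector_mode.  */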
1742
1743 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1744 static bool
1745 aarch64_array_mode_supported_p (machine_mode mode,
1746 unsigned HOST_WIDE_INT nelems)
1747 {
1748 if (TARGET_SIMD
1749 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1750 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1751 && (nelems >= 2 && nelems <= 4))
1752 return true;
1753
1754 return false;
1755 }
1756
1757 /* Return the SVE predicate mode to use for elements that have
1758 ELEM_NBYTES bytes, if such a mode exists. */
1759
1760 opt_machine_mode
1761 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1762 {
1763 if (TARGET_SVE)
1764 {
1765 if (elem_nbytes == 1)
1766 return VNx16BImode;
1767 if (elem_nbytes == 2)
1768 return VNx8BImode;
1769 if (elem_nbytes == 4)
1770 return VNx4BImode;
1771 if (elem_nbytes == 8)
1772 return VNx2BImode;
1773 }
1774 return opt_machine_mode ();
1775 }
1776
1777 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1778
1779 static opt_machine_mode
1780 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1781 {
1782 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1783 {
1784 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1785 machine_mode pred_mode;
1786 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1787 return pred_mode;
1788 }
1789
1790 return default_get_mask_mode (nunits, nbytes);
1791 }
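/* So a full SVE vector of 32-bit elements (VNx4SImode) gets VNx4BImode as
   its mask mode, while fixed-length Advanced SIMD vectors fall back to the
   generic vector-of-booleans handling.  */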
1792
1793 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1794
1795 opt_machine_mode
1796 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1797 {
1798 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1799 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1800 machine_mode mode;
1801 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1802 if (inner_mode == GET_MODE_INNER (mode)
1803 && known_eq (nunits, GET_MODE_NUNITS (mode))
1804 && aarch64_sve_data_mode_p (mode))
1805 return mode;
1806 return opt_machine_mode ();
1807 }
1808
1809 /* Return the integer element mode associated with SVE mode MODE. */
1810
1811 static scalar_int_mode
1812 aarch64_sve_element_int_mode (machine_mode mode)
1813 {
1814 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1815 GET_MODE_NUNITS (mode));
1816 return int_mode_for_size (elt_bits, 0).require ();
1817 }
1818
1819 /* Return the integer vector mode associated with SVE mode MODE.
1820 Unlike mode_for_int_vector, this can handle the case in which
1821 MODE is a predicate (and thus has a different total size). */
1822
1823 machine_mode
1824 aarch64_sve_int_mode (machine_mode mode)
1825 {
1826 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1827 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1828 }
1829
1830 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1831 prefer to use the first arithmetic operand as the else value if
1832 the else value doesn't matter, since that exactly matches the SVE
1833 destructive merging form. For ternary operations we could either
1834 pick the first operand and use FMAD-like instructions or the last
1835 operand and use FMLA-like instructions; the latter seems more
1836 natural. */
1837
1838 static tree
1839 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1840 {
1841 return nops == 3 ? ops[2] : ops[0];
1842 }
1843
1844 /* Implement TARGET_HARD_REGNO_NREGS. */
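/* For example, an Advanced SIMD V4SImode value occupies a single FP
register, while an SVE two-vector tuple such as VNx8SImode occupies
two consecutive FP registers. */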
1845
1846 static unsigned int
1847 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1848 {
1849 /* ??? Logically we should only need to provide a value when
1850 HARD_REGNO_MODE_OK says that the combination is valid,
1851 but at the moment we need to handle all modes. Just ignore
1852 any runtime parts for registers that can't store them. */
1853 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1854 switch (aarch64_regno_regclass (regno))
1855 {
1856 case FP_REGS:
1857 case FP_LO_REGS:
1858 case FP_LO8_REGS:
1859 {
1860 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1861 if (vec_flags & VEC_SVE_DATA)
1862 return exact_div (GET_MODE_SIZE (mode),
1863 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1864 return CEIL (lowest_size, UNITS_PER_VREG);
1865 }
1866 case PR_REGS:
1867 case PR_LO_REGS:
1868 case PR_HI_REGS:
1869 case FFR_REGS:
1870 case PR_AND_FFR_REGS:
1871 return 1;
1872 default:
1873 return CEIL (lowest_size, UNITS_PER_WORD);
1874 }
1875 gcc_unreachable ();
1876 }
1877
1878 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1879
1880 static bool
1881 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1882 {
1883 if (GET_MODE_CLASS (mode) == MODE_CC)
1884 return regno == CC_REGNUM;
1885
1886 if (regno == VG_REGNUM)
1887 /* This must have the same size as _Unwind_Word. */
1888 return mode == DImode;
1889
1890 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1891 /* At the moment, partial vector modes are only useful for memory
1892 references, but that could change in future. */
1893 if (vec_flags & VEC_PARTIAL)
1894 return false;
1895
1896 if (vec_flags & VEC_SVE_PRED)
1897 return pr_or_ffr_regnum_p (regno);
1898
1899 if (pr_or_ffr_regnum_p (regno))
1900 return false;
1901
1902 if (regno == SP_REGNUM)
1903 /* The purpose of comparing with ptr_mode is to support the
1904 global register variable associated with the stack pointer
1905 register via the syntax of asm ("wsp") in ILP32. */
1906 return mode == Pmode || mode == ptr_mode;
1907
1908 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1909 return mode == Pmode;
1910
1911 if (GP_REGNUM_P (regno))
1912 {
1913 if (known_le (GET_MODE_SIZE (mode), 8))
1914 return true;
1915 else if (known_le (GET_MODE_SIZE (mode), 16))
1916 return (regno & 1) == 0;
1917 }
1918 else if (FP_REGNUM_P (regno))
1919 {
1920 if (vec_flags & VEC_STRUCT)
1921 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1922 else
1923 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1924 }
1925
1926 return false;
1927 }
1928
1929 /* Return true if TYPE is a type that should be passed or returned in
1930 SVE registers, assuming enough registers are available. When returning
1931 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
1932 respectively. */
1933
1934 static bool
1935 aarch64_sve_argument_p (const_tree type, unsigned int *num_zr,
1936 unsigned int *num_pr)
1937 {
1938 if (aarch64_sve::svbool_type_p (type))
1939 {
1940 *num_pr = 1;
1941 *num_zr = 0;
1942 return true;
1943 }
1944
1945 if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type))
1946 {
1947 *num_pr = 0;
1948 *num_zr = nvectors;
1949 return true;
1950 }
1951
1952 return false;
1953 }
1954
1955 /* Return true if a function with type FNTYPE returns its value in
1956 SVE vector or predicate registers. */
1957
1958 static bool
1959 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
1960 {
1961 unsigned int num_zr, num_pr;
1962 tree return_type = TREE_TYPE (fntype);
1963 return (return_type != error_mark_node
1964 && aarch64_sve_argument_p (return_type, &num_zr, &num_pr));
1965 }
1966
1967 /* Return true if a function with type FNTYPE takes arguments in
1968 SVE vector or predicate registers. */
1969
1970 static bool
1971 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
1972 {
1973 CUMULATIVE_ARGS args_so_far_v;
1974 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
1975 NULL_TREE, 0, true);
1976 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
1977
1978 for (tree chain = TYPE_ARG_TYPES (fntype);
1979 chain && chain != void_list_node;
1980 chain = TREE_CHAIN (chain))
1981 {
1982 tree arg_type = TREE_VALUE (chain);
1983 if (arg_type == error_mark_node)
1984 return false;
1985
1986 function_arg_info arg (arg_type, /*named=*/true);
1987 apply_pass_by_reference_rules (&args_so_far_v, arg);
1988 unsigned int num_zr, num_pr;
1989 if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
1990 return true;
1991
1992 targetm.calls.function_arg_advance (args_so_far, arg);
1993 }
1994 return false;
1995 }
1996
1997 /* Implement TARGET_FNTYPE_ABI. */
1998
1999 static const predefined_function_abi &
2000 aarch64_fntype_abi (const_tree fntype)
2001 {
2002 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2003 return aarch64_simd_abi ();
2004
2005 if (aarch64_returns_value_in_sve_regs_p (fntype)
2006 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2007 return aarch64_sve_abi ();
2008
2009 return default_function_abi;
2010 }
2011
2012 /* Return true if we should emit CFI for register REGNO. */
2013
2014 static bool
2015 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2016 {
2017 return (GP_REGNUM_P (regno)
2018 || !default_function_abi.clobbers_full_reg_p (regno));
2019 }
2020
2021 /* Return the mode we should use to save and restore register REGNO. */
2022
2023 static machine_mode
2024 aarch64_reg_save_mode (unsigned int regno)
2025 {
2026 if (GP_REGNUM_P (regno))
2027 return DImode;
2028
2029 if (FP_REGNUM_P (regno))
2030 switch (crtl->abi->id ())
2031 {
2032 case ARM_PCS_AAPCS64:
2033 /* Only the low 64 bits are saved by the base PCS. */
2034 return DFmode;
2035
2036 case ARM_PCS_SIMD:
2037 /* The vector PCS saves the low 128 bits (which is the full
2038 register on non-SVE targets). */
2039 return TFmode;
2040
2041 case ARM_PCS_SVE:
2042 /* Use vectors of DImode for registers that need frame
2043 information, so that the first 64 bits of the save slot
2044 are always the equivalent of what storing D<n> would give. */
2045 if (aarch64_emit_cfi_for_reg_p (regno))
2046 return VNx2DImode;
2047
2048 /* Use vectors of bytes otherwise, so that the layout is
2049 endian-agnostic, and so that we can use LDR and STR for
2050 big-endian targets. */
2051 return VNx16QImode;
2052
2053 case ARM_PCS_TLSDESC:
2054 case ARM_PCS_UNKNOWN:
2055 break;
2056 }
2057
2058 if (PR_REGNUM_P (regno))
2059 /* Save the full predicate register. */
2060 return VNx16BImode;
2061
2062 gcc_unreachable ();
2063 }
2064
2065 /* Implement TARGET_INSN_CALLEE_ABI. */
2066
2067 const predefined_function_abi &
2068 aarch64_insn_callee_abi (const rtx_insn *insn)
2069 {
2070 rtx pat = PATTERN (insn);
2071 gcc_assert (GET_CODE (pat) == PARALLEL);
2072 rtx unspec = XVECEXP (pat, 0, 1);
2073 gcc_assert (GET_CODE (unspec) == UNSPEC
2074 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2075 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2076 }
2077
2078 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2079 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2080 clobbers the top 64 bits when restoring the bottom 64 bits. */
2081
2082 static bool
2083 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2084 unsigned int regno,
2085 machine_mode mode)
2086 {
2087 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2088 {
2089 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2090 unsigned int nregs = hard_regno_nregs (regno, mode);
2091 if (nregs > 1)
2092 per_register_size = exact_div (per_register_size, nregs);
2093 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2094 return maybe_gt (per_register_size, 16);
2095 return maybe_gt (per_register_size, 8);
2096 }
2097 return false;
2098 }
2099
2100 /* Implement REGMODE_NATURAL_SIZE. */
2101 poly_uint64
2102 aarch64_regmode_natural_size (machine_mode mode)
2103 {
2104 /* The natural size for SVE data modes is one SVE data vector,
2105 and similarly for predicates. We can't independently modify
2106 anything smaller than that. */
2107 /* ??? For now, only do this for variable-width SVE registers.
2108 Doing it for constant-sized registers breaks lower-subreg.c. */
2109 /* ??? And once that's fixed, we should probably have similar
2110 code for Advanced SIMD. */
2111 if (!aarch64_sve_vg.is_constant ())
2112 {
2113 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2114 if (vec_flags & VEC_SVE_PRED)
2115 return BYTES_PER_SVE_PRED;
2116 if (vec_flags & VEC_SVE_DATA)
2117 return BYTES_PER_SVE_VECTOR;
2118 }
2119 return UNITS_PER_WORD;
2120 }
2121
2122 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2123 machine_mode
2124 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2125 machine_mode mode)
2126 {
2127 /* The predicate mode determines which bits are significant and
2128 which are "don't care". Decreasing the number of lanes would
2129 lose data while increasing the number of lanes would make bits
2130 unnecessarily significant. */
2131 if (PR_REGNUM_P (regno))
2132 return mode;
2133 if (known_ge (GET_MODE_SIZE (mode), 4))
2134 return mode;
2135 else
2136 return SImode;
2137 }
2138
2139 /* Return true if I's bits are consecutive ones from the MSB. */
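/* For example, 0xffff000000000000 qualifies (its negation is a power
of two), whereas 0xff00ff0000000000 does not. */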
2140 bool
2141 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2142 {
2143 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2144 }
2145
2146 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2147 that strcpy from constants will be faster. */
2148
2149 static HOST_WIDE_INT
2150 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2151 {
2152 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2153 return MAX (align, BITS_PER_WORD);
2154 return align;
2155 }
2156
2157 /* Return true if calls to DECL should be treated as
2158 long-calls (i.e. called via a register). */
2159 static bool
2160 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2161 {
2162 return false;
2163 }
2164
2165 /* Return true if calls to symbol-ref SYM should be treated as
2166 long-calls (i.e. called via a register). */
2167 bool
2168 aarch64_is_long_call_p (rtx sym)
2169 {
2170 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2171 }
2172
2173 /* Return true if calls to symbol-ref SYM should not go through
2174 plt stubs. */
2175
2176 bool
2177 aarch64_is_noplt_call_p (rtx sym)
2178 {
2179 const_tree decl = SYMBOL_REF_DECL (sym);
2180
2181 if (flag_pic
2182 && decl
2183 && (!flag_plt
2184 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2185 && !targetm.binds_local_p (decl))
2186 return true;
2187
2188 return false;
2189 }
2190
2191 /* Return true if the offsets to a zero/sign-extract operation
2192 represent an expression that matches an extend operation. The
2193 operands represent the parameters from
2194
2195 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
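/* For example, with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34,
the extract selects the low 34 bits of (reg * 4), which is the same
value as the 32-bit zero-extension of reg multiplied by 4, so the
function returns true. */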
2196 bool
2197 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2198 rtx extract_imm)
2199 {
2200 HOST_WIDE_INT mult_val, extract_val;
2201
2202 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2203 return false;
2204
2205 mult_val = INTVAL (mult_imm);
2206 extract_val = INTVAL (extract_imm);
2207
2208 if (extract_val > 8
2209 && extract_val < GET_MODE_BITSIZE (mode)
2210 && exact_log2 (extract_val & ~7) > 0
2211 && (extract_val & 7) <= 4
2212 && mult_val == (1 << (extract_val & 7)))
2213 return true;
2214
2215 return false;
2216 }
2217
2218 /* Emit an insn that's a simple single-set. Both the operands must be
2219 known to be valid. */
2220 inline static rtx_insn *
2221 emit_set_insn (rtx x, rtx y)
2222 {
2223 return emit_insn (gen_rtx_SET (x, y));
2224 }
2225
2226 /* X and Y are two things to compare using CODE. Emit the compare insn and
2227 return the rtx for register 0 in the proper mode. */
2228 rtx
2229 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2230 {
2231 machine_mode cmp_mode = GET_MODE (x);
2232 machine_mode cc_mode;
2233 rtx cc_reg;
2234
2235 if (cmp_mode == TImode)
2236 {
2237 gcc_assert (code == NE);
2238
2239 cc_mode = CCmode;
2240 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2241
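/* Compare the low halves first and then use CCMP to fold in a
comparison of the high halves, so that the final condition flags
are EQ iff both halves of X and Y are equal. */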
2242 rtx x_lo = operand_subword (x, 0, 0, TImode);
2243 rtx y_lo = operand_subword (y, 0, 0, TImode);
2244 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2245
2246 rtx x_hi = operand_subword (x, 1, 0, TImode);
2247 rtx y_hi = operand_subword (y, 1, 0, TImode);
2248 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2249 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2250 GEN_INT (AARCH64_EQ)));
2251 }
2252 else
2253 {
2254 cc_mode = SELECT_CC_MODE (code, x, y);
2255 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2256 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2257 }
2258 return cc_reg;
2259 }
2260
2261 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2262
2263 static rtx
2264 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2265 machine_mode y_mode)
2266 {
2267 if (y_mode == E_QImode || y_mode == E_HImode)
2268 {
2269 if (CONST_INT_P (y))
2270 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2271 else
2272 {
2273 rtx t, cc_reg;
2274 machine_mode cc_mode;
2275
2276 t = gen_rtx_ZERO_EXTEND (SImode, y);
2277 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2278 cc_mode = CC_SWPmode;
2279 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2280 emit_set_insn (cc_reg, t);
2281 return cc_reg;
2282 }
2283 }
2284
2285 if (!aarch64_plus_operand (y, y_mode))
2286 y = force_reg (y_mode, y);
2287
2288 return aarch64_gen_compare_reg (code, x, y);
2289 }
2290
2291 /* Build the SYMBOL_REF for __tls_get_addr. */
2292
2293 static GTY(()) rtx tls_get_addr_libfunc;
2294
2295 rtx
2296 aarch64_tls_get_addr (void)
2297 {
2298 if (!tls_get_addr_libfunc)
2299 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2300 return tls_get_addr_libfunc;
2301 }
2302
2303 /* Return the TLS model to use for ADDR. */
2304
2305 static enum tls_model
2306 tls_symbolic_operand_type (rtx addr)
2307 {
2308 enum tls_model tls_kind = TLS_MODEL_NONE;
2309 if (GET_CODE (addr) == CONST)
2310 {
2311 poly_int64 addend;
2312 rtx sym = strip_offset (addr, &addend);
2313 if (GET_CODE (sym) == SYMBOL_REF)
2314 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2315 }
2316 else if (GET_CODE (addr) == SYMBOL_REF)
2317 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2318
2319 return tls_kind;
2320 }
2321
2322 /* We allow LO_SUMs in addresses as legitimate addresses, so that
2323 combine can take care of combining addresses where necessary;
2324 for generation purposes, however, we generate the address
2325 as:
2326 RTL Absolute
2327 tmp = hi (symbol_ref); adrp x1, foo
2328 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2329 nop
2330
2331 PIC TLS
2332 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2333 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2334 bl __tls_get_addr
2335 nop
2336
2337 Load TLS symbol, depending on TLS mechanism and TLS access model.
2338
2339 Global Dynamic - Traditional TLS:
2340 adrp tmp, :tlsgd:imm
2341 add dest, tmp, #:tlsgd_lo12:imm
2342 bl __tls_get_addr
2343
2344 Global Dynamic - TLS Descriptors:
2345 adrp dest, :tlsdesc:imm
2346 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2347 add dest, dest, #:tlsdesc_lo12:imm
2348 blr tmp
2349 mrs tp, tpidr_el0
2350 add dest, dest, tp
2351
2352 Initial Exec:
2353 mrs tp, tpidr_el0
2354 adrp tmp, :gottprel:imm
2355 ldr dest, [tmp, #:gottprel_lo12:imm]
2356 add dest, dest, tp
2357
2358 Local Exec:
2359 mrs tp, tpidr_el0
2360 add t0, tp, #:tprel_hi12:imm, lsl #12
2361 add t0, t0, #:tprel_lo12_nc:imm
2362 */
2363
2364 static void
2365 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2366 enum aarch64_symbol_type type)
2367 {
2368 switch (type)
2369 {
2370 case SYMBOL_SMALL_ABSOLUTE:
2371 {
2372 /* In ILP32, the mode of dest can be either SImode or DImode. */
2373 rtx tmp_reg = dest;
2374 machine_mode mode = GET_MODE (dest);
2375
2376 gcc_assert (mode == Pmode || mode == ptr_mode);
2377
2378 if (can_create_pseudo_p ())
2379 tmp_reg = gen_reg_rtx (mode);
2380
2381 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2382 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2383 return;
2384 }
2385
2386 case SYMBOL_TINY_ABSOLUTE:
2387 emit_insn (gen_rtx_SET (dest, imm));
2388 return;
2389
2390 case SYMBOL_SMALL_GOT_28K:
2391 {
2392 machine_mode mode = GET_MODE (dest);
2393 rtx gp_rtx = pic_offset_table_rtx;
2394 rtx insn;
2395 rtx mem;
2396
2397 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2398 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2399 estimate rtx costs, in which case pic_offset_table_rtx is not
2400 initialized. In that case there is no need to generate the first
2401 adrp instruction, as the final cost for a global variable access
2402 is one instruction. */
2403 if (gp_rtx != NULL)
2404 {
2405 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
2406 we use the page base as the GOT base, the first page may be
2407 wasted; in the worst case only 28K of GOT space remains).
2408
2409 The generated instruction sequence for accessing a global
2410 variable is:
2411
2412 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2413
2414 Only one instruction is needed, but we must initialize
2415 pic_offset_table_rtx properly. We emit the initialization insn
2416 for every global access and rely on CSE to remove the redundant ones.
2417
2418 The final instruction sequence will look like the following
2419 for multiple global variable accesses:
2420
2421 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2422
2423 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2424 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2425 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2426 ... */
2427
2428 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2429 crtl->uses_pic_offset_table = 1;
2430 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2431
2432 if (mode != GET_MODE (gp_rtx))
2433 gp_rtx = gen_lowpart (mode, gp_rtx);
2434
2435 }
2436
2437 if (mode == ptr_mode)
2438 {
2439 if (mode == DImode)
2440 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2441 else
2442 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2443
2444 mem = XVECEXP (SET_SRC (insn), 0, 0);
2445 }
2446 else
2447 {
2448 gcc_assert (mode == Pmode);
2449
2450 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2451 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2452 }
2453
2454 /* The operand is expected to be a MEM. Whenever the related insn
2455 pattern changes, the code above that extracts MEM must be
2456 updated accordingly. */
2457 gcc_assert (GET_CODE (mem) == MEM);
2458 MEM_READONLY_P (mem) = 1;
2459 MEM_NOTRAP_P (mem) = 1;
2460 emit_insn (insn);
2461 return;
2462 }
2463
2464 case SYMBOL_SMALL_GOT_4G:
2465 {
2466 /* In ILP32, the mode of dest can be either SImode or DImode,
2467 while the got entry is always of SImode size. The mode of
2468 dest depends on how dest is used: if dest is assigned to a
2469 pointer (e.g. in the memory), it has SImode; it may have
2470 DImode if dest is dereferenced to access the memory.
2471 This is why we have to handle three different ldr_got_small
2472 patterns here (two patterns for ILP32). */
2473
2474 rtx insn;
2475 rtx mem;
2476 rtx tmp_reg = dest;
2477 machine_mode mode = GET_MODE (dest);
2478
2479 if (can_create_pseudo_p ())
2480 tmp_reg = gen_reg_rtx (mode);
2481
2482 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2483 if (mode == ptr_mode)
2484 {
2485 if (mode == DImode)
2486 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2487 else
2488 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2489
2490 mem = XVECEXP (SET_SRC (insn), 0, 0);
2491 }
2492 else
2493 {
2494 gcc_assert (mode == Pmode);
2495
2496 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2497 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2498 }
2499
2500 gcc_assert (GET_CODE (mem) == MEM);
2501 MEM_READONLY_P (mem) = 1;
2502 MEM_NOTRAP_P (mem) = 1;
2503 emit_insn (insn);
2504 return;
2505 }
2506
2507 case SYMBOL_SMALL_TLSGD:
2508 {
2509 rtx_insn *insns;
2510 machine_mode mode = GET_MODE (dest);
2511 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2512
2513 start_sequence ();
2514 if (TARGET_ILP32)
2515 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2516 else
2517 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2518 insns = get_insns ();
2519 end_sequence ();
2520
2521 RTL_CONST_CALL_P (insns) = 1;
2522 emit_libcall_block (insns, dest, result, imm);
2523 return;
2524 }
2525
2526 case SYMBOL_SMALL_TLSDESC:
2527 {
2528 machine_mode mode = GET_MODE (dest);
2529 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2530 rtx tp;
2531
2532 gcc_assert (mode == Pmode || mode == ptr_mode);
2533
2534 /* In ILP32, the got entry is always of SImode size. Unlike
2535 small GOT, the dest is fixed at reg 0. */
2536 if (TARGET_ILP32)
2537 emit_insn (gen_tlsdesc_small_si (imm));
2538 else
2539 emit_insn (gen_tlsdesc_small_di (imm));
2540 tp = aarch64_load_tp (NULL);
2541
2542 if (mode != Pmode)
2543 tp = gen_lowpart (mode, tp);
2544
2545 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2546 if (REG_P (dest))
2547 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2548 return;
2549 }
2550
2551 case SYMBOL_SMALL_TLSIE:
2552 {
2553 /* In ILP32, the mode of dest can be either SImode or DImode,
2554 while the got entry is always of SImode size. The mode of
2555 dest depends on how dest is used: if dest is assigned to a
2556 pointer (e.g. in the memory), it has SImode; it may have
2557 DImode if dest is dereferenced to access the memory.
2558 This is why we have to handle three different tlsie_small
2559 patterns here (two patterns for ILP32). */
2560 machine_mode mode = GET_MODE (dest);
2561 rtx tmp_reg = gen_reg_rtx (mode);
2562 rtx tp = aarch64_load_tp (NULL);
2563
2564 if (mode == ptr_mode)
2565 {
2566 if (mode == DImode)
2567 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2568 else
2569 {
2570 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2571 tp = gen_lowpart (mode, tp);
2572 }
2573 }
2574 else
2575 {
2576 gcc_assert (mode == Pmode);
2577 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2578 }
2579
2580 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2581 if (REG_P (dest))
2582 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2583 return;
2584 }
2585
2586 case SYMBOL_TLSLE12:
2587 case SYMBOL_TLSLE24:
2588 case SYMBOL_TLSLE32:
2589 case SYMBOL_TLSLE48:
2590 {
2591 machine_mode mode = GET_MODE (dest);
2592 rtx tp = aarch64_load_tp (NULL);
2593
2594 if (mode != Pmode)
2595 tp = gen_lowpart (mode, tp);
2596
2597 switch (type)
2598 {
2599 case SYMBOL_TLSLE12:
2600 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2601 (dest, tp, imm));
2602 break;
2603 case SYMBOL_TLSLE24:
2604 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2605 (dest, tp, imm));
2606 break;
2607 case SYMBOL_TLSLE32:
2608 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2609 (dest, imm));
2610 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2611 (dest, dest, tp));
2612 break;
2613 case SYMBOL_TLSLE48:
2614 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2615 (dest, imm));
2616 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2617 (dest, dest, tp));
2618 break;
2619 default:
2620 gcc_unreachable ();
2621 }
2622
2623 if (REG_P (dest))
2624 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2625 return;
2626 }
2627
2628 case SYMBOL_TINY_GOT:
2629 emit_insn (gen_ldr_got_tiny (dest, imm));
2630 return;
2631
2632 case SYMBOL_TINY_TLSIE:
2633 {
2634 machine_mode mode = GET_MODE (dest);
2635 rtx tp = aarch64_load_tp (NULL);
2636
2637 if (mode == ptr_mode)
2638 {
2639 if (mode == DImode)
2640 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2641 else
2642 {
2643 tp = gen_lowpart (mode, tp);
2644 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2645 }
2646 }
2647 else
2648 {
2649 gcc_assert (mode == Pmode);
2650 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2651 }
2652
2653 if (REG_P (dest))
2654 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2655 return;
2656 }
2657
2658 default:
2659 gcc_unreachable ();
2660 }
2661 }
2662
2663 /* Emit a move from SRC to DEST. Assume that the move expanders can
2664 handle all moves if !can_create_pseudo_p (). The distinction is
2665 important because, unlike emit_move_insn, the move expanders know
2666 how to force Pmode objects into the constant pool even when the
2667 constant pool address is not itself legitimate. */
2668 static rtx
2669 aarch64_emit_move (rtx dest, rtx src)
2670 {
2671 return (can_create_pseudo_p ()
2672 ? emit_move_insn (dest, src)
2673 : emit_move_insn_1 (dest, src));
2674 }
2675
2676 /* Apply UNOPTAB to OP and store the result in DEST. */
2677
2678 static void
2679 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2680 {
2681 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2682 if (dest != tmp)
2683 emit_move_insn (dest, tmp);
2684 }
2685
2686 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2687
2688 static void
2689 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2690 {
2691 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2692 OPTAB_DIRECT);
2693 if (dest != tmp)
2694 emit_move_insn (dest, tmp);
2695 }
2696
2697 /* Split a 128-bit move operation into two 64-bit move operations,
2698 taking care to handle partial overlap of register to register
2699 copies. Special cases are needed when moving between GP regs and
2700 FP regs. SRC can be a register, constant or memory; DST a register
2701 or memory. If either operand is memory it must not have any side
2702 effects. */
2703 void
2704 aarch64_split_128bit_move (rtx dst, rtx src)
2705 {
2706 rtx dst_lo, dst_hi;
2707 rtx src_lo, src_hi;
2708
2709 machine_mode mode = GET_MODE (dst);
2710
2711 gcc_assert (mode == TImode || mode == TFmode);
2712 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2713 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2714
2715 if (REG_P (dst) && REG_P (src))
2716 {
2717 int src_regno = REGNO (src);
2718 int dst_regno = REGNO (dst);
2719
2720 /* Handle FP <-> GP regs. */
2721 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2722 {
2723 src_lo = gen_lowpart (word_mode, src);
2724 src_hi = gen_highpart (word_mode, src);
2725
2726 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2727 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2728 return;
2729 }
2730 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2731 {
2732 dst_lo = gen_lowpart (word_mode, dst);
2733 dst_hi = gen_highpart (word_mode, dst);
2734
2735 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2736 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2737 return;
2738 }
2739 }
2740
2741 dst_lo = gen_lowpart (word_mode, dst);
2742 dst_hi = gen_highpart (word_mode, dst);
2743 src_lo = gen_lowpart (word_mode, src);
2744 src_hi = gen_highpart_mode (word_mode, mode, src);
2745
2746 /* At most one pairing may overlap. */
2747 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2748 {
2749 aarch64_emit_move (dst_hi, src_hi);
2750 aarch64_emit_move (dst_lo, src_lo);
2751 }
2752 else
2753 {
2754 aarch64_emit_move (dst_lo, src_lo);
2755 aarch64_emit_move (dst_hi, src_hi);
2756 }
2757 }
2758
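/* Return true if a 128-bit move from SRC to DST needs to be split into
two 64-bit moves; a move between two FP registers can instead be
handled by a single instruction. */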
2759 bool
2760 aarch64_split_128bit_move_p (rtx dst, rtx src)
2761 {
2762 return (! REG_P (src)
2763 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2764 }
2765
2766 /* Split a complex SIMD combine. */
2767
2768 void
2769 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2770 {
2771 machine_mode src_mode = GET_MODE (src1);
2772 machine_mode dst_mode = GET_MODE (dst);
2773
2774 gcc_assert (VECTOR_MODE_P (dst_mode));
2775 gcc_assert (register_operand (dst, dst_mode)
2776 && register_operand (src1, src_mode)
2777 && register_operand (src2, src_mode));
2778
2779 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2780 return;
2781 }
2782
2783 /* Split a complex SIMD move. */
2784
2785 void
2786 aarch64_split_simd_move (rtx dst, rtx src)
2787 {
2788 machine_mode src_mode = GET_MODE (src);
2789 machine_mode dst_mode = GET_MODE (dst);
2790
2791 gcc_assert (VECTOR_MODE_P (dst_mode));
2792
2793 if (REG_P (dst) && REG_P (src))
2794 {
2795 gcc_assert (VECTOR_MODE_P (src_mode));
2796 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2797 }
2798 }
2799
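/* Return true if the constant X, of mode XMODE, is equal to the
constant Y, of mode YMODE, zero-extended to XMODE. */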
2800 bool
2801 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2802 machine_mode ymode, rtx y)
2803 {
2804 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2805 gcc_assert (r != NULL);
2806 return rtx_equal_p (x, r);
2807 }
2808
2809 /* Return TARGET if it is nonnull and a register of mode MODE.
2810 Otherwise, return a fresh register of mode MODE if we can,
2811 or TARGET reinterpreted as MODE if we can't. */
2812
2813 static rtx
2814 aarch64_target_reg (rtx target, machine_mode mode)
2815 {
2816 if (target && REG_P (target) && GET_MODE (target) == mode)
2817 return target;
2818 if (!can_create_pseudo_p ())
2819 {
2820 gcc_assert (target);
2821 return gen_lowpart (mode, target);
2822 }
2823 return gen_reg_rtx (mode);
2824 }
2825
2826 /* Return a register that contains the constant in BUILDER, given that
2827 the constant is a legitimate move operand. Use TARGET as the register
2828 if it is nonnull and convenient. */
2829
2830 static rtx
2831 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2832 {
2833 rtx src = builder.build ();
2834 target = aarch64_target_reg (target, GET_MODE (src));
2835 emit_insn (gen_rtx_SET (target, src));
2836 return target;
2837 }
2838
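/* Return a register of mode MODE that holds VALUE: if possible, put
VALUE in a (possibly fresh) pseudo register; otherwise move it into
X, which must be nonnull. */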
2839 static rtx
2840 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2841 {
2842 if (can_create_pseudo_p ())
2843 return force_reg (mode, value);
2844 else
2845 {
2846 gcc_assert (x);
2847 aarch64_emit_move (x, value);
2848 return x;
2849 }
2850 }
2851
2852 /* Return true if predicate value X is a constant in which every element
2853 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2854 value, i.e. as a predicate in which all bits are significant. */
2855
2856 static bool
2857 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2858 {
2859 if (GET_CODE (x) != CONST_VECTOR)
2860 return false;
2861
2862 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2863 GET_MODE_NUNITS (GET_MODE (x)));
2864 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2865 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2866 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2867
2868 unsigned int nelts = const_vector_encoded_nelts (x);
2869 for (unsigned int i = 0; i < nelts; ++i)
2870 {
2871 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2872 if (!CONST_INT_P (elt))
2873 return false;
2874
2875 builder.quick_push (elt);
2876 for (unsigned int j = 1; j < factor; ++j)
2877 builder.quick_push (const0_rtx);
2878 }
2879 builder.finalize ();
2880 return true;
2881 }
2882
2883 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2884 widest predicate element size it can have (that is, the largest size
2885 for which each element would still be 0 or 1). */
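/* For example, a VNx16BI constant whose only set bits are at indices
0, 4, 8, ... can be interpreted as a predicate for 4-byte elements,
so the function below returns 4 for it. */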
2886
2887 unsigned int
2888 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2889 {
2890 /* Start with the most optimistic assumption: that we only need
2891 one bit per pattern. This is what we will use if only the first
2892 bit in each pattern is ever set. */
2893 unsigned int mask = GET_MODE_SIZE (DImode);
2894 mask |= builder.npatterns ();
2895
2896 /* Look for set bits. */
2897 unsigned int nelts = builder.encoded_nelts ();
2898 for (unsigned int i = 1; i < nelts; ++i)
2899 if (INTVAL (builder.elt (i)) != 0)
2900 {
2901 if (i & 1)
2902 return 1;
2903 mask |= i;
2904 }
2905 return mask & -mask;
2906 }
2907
2908 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
2909 return that predicate mode, otherwise return opt_machine_mode (). */
2910
2911 opt_machine_mode
2912 aarch64_ptrue_all_mode (rtx x)
2913 {
2914 gcc_assert (GET_MODE (x) == VNx16BImode);
2915 if (GET_CODE (x) != CONST_VECTOR
2916 || !CONST_VECTOR_DUPLICATE_P (x)
2917 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
2918 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
2919 return opt_machine_mode ();
2920
2921 unsigned int nelts = const_vector_encoded_nelts (x);
2922 for (unsigned int i = 1; i < nelts; ++i)
2923 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
2924 return opt_machine_mode ();
2925
2926 return aarch64_sve_pred_mode (nelts);
2927 }
2928
2929 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2930 that the constant would have with predicate element size ELT_SIZE
2931 (ignoring the upper bits in each element) and return:
2932
2933 * -1 if all bits are set
2934 * N if the predicate has N leading set bits followed by all clear bits
2935 * 0 if the predicate does not have any of these forms. */
2936
2937 int
2938 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2939 unsigned int elt_size)
2940 {
2941 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2942 followed by set bits. */
2943 if (builder.nelts_per_pattern () == 3)
2944 return 0;
2945
2946 /* Skip over leading set bits. */
2947 unsigned int nelts = builder.encoded_nelts ();
2948 unsigned int i = 0;
2949 for (; i < nelts; i += elt_size)
2950 if (INTVAL (builder.elt (i)) == 0)
2951 break;
2952 unsigned int vl = i / elt_size;
2953
2954 /* Check for the all-true case. */
2955 if (i == nelts)
2956 return -1;
2957
2958 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2959 repeating pattern of set bits followed by clear bits. */
2960 if (builder.nelts_per_pattern () != 2)
2961 return 0;
2962
2963 /* We have a "foreground" value and a duplicated "background" value.
2964 If the background might repeat and the last set bit belongs to it,
2965 we might have set bits followed by clear bits followed by set bits. */
2966 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2967 return 0;
2968
2969 /* Make sure that the rest are all clear. */
2970 for (; i < nelts; i += elt_size)
2971 if (INTVAL (builder.elt (i)) != 0)
2972 return 0;
2973
2974 return vl;
2975 }
2976
2977 /* See if there is an svpattern that encodes an SVE predicate of mode
2978 PRED_MODE in which the first VL bits are set and the rest are clear.
2979 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2980 A VL of -1 indicates an all-true vector. */
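/* For example, VL == 3 maps to AARCH64_SV_VL3, VL == 64 maps to
AARCH64_SV_VL64 and VL == -1 maps to AARCH64_SV_ALL. */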
2981
2982 aarch64_svpattern
2983 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2984 {
2985 if (vl < 0)
2986 return AARCH64_SV_ALL;
2987
2988 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2989 return AARCH64_NUM_SVPATTERNS;
2990
2991 if (vl >= 1 && vl <= 8)
2992 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2993
2994 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2995 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2996
2997 int max_vl;
2998 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2999 {
3000 if (vl == (max_vl / 3) * 3)
3001 return AARCH64_SV_MUL3;
3002 /* These would only trigger for non-power-of-2 lengths. */
3003 if (vl == (max_vl & -4))
3004 return AARCH64_SV_MUL4;
3005 if (vl == (1 << floor_log2 (max_vl)))
3006 return AARCH64_SV_POW2;
3007 if (vl == max_vl)
3008 return AARCH64_SV_ALL;
3009 }
3010 return AARCH64_NUM_SVPATTERNS;
3011 }
3012
3013 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3014 bits has the lowest bit set and the upper bits clear. This is the
3015 VNx16BImode equivalent of a PTRUE for controlling elements of
3016 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3017 all bits are significant, even the upper zeros. */
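/* For example, aarch64_ptrue_all (4) builds the VNx16BImode constant
{ 1, 0, 0, 0, 1, 0, 0, 0, ... }, which acts as a PTRUE for .S
(4-byte) elements. */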
3018
3019 rtx
3020 aarch64_ptrue_all (unsigned int elt_size)
3021 {
3022 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3023 builder.quick_push (const1_rtx);
3024 for (unsigned int i = 1; i < elt_size; ++i)
3025 builder.quick_push (const0_rtx);
3026 return builder.build ();
3027 }
3028
3029 /* Return an all-true predicate register of mode MODE. */
3030
3031 rtx
3032 aarch64_ptrue_reg (machine_mode mode)
3033 {
3034 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3035 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3036 return gen_lowpart (mode, reg);
3037 }
3038
3039 /* Return an all-false predicate register of mode MODE. */
3040
3041 rtx
3042 aarch64_pfalse_reg (machine_mode mode)
3043 {
3044 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3045 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3046 return gen_lowpart (mode, reg);
3047 }
3048
3049 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3050 true, or alternatively if we know that the operation predicated by
3051 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
3052 aarch64_sve_gp_strictness operand that describes the operation
3053 predicated by PRED1[0]. */
3054
3055 bool
3056 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3057 {
3058 machine_mode mode = GET_MODE (pred2);
3059 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3060 && mode == GET_MODE (pred1[0])
3061 && aarch64_sve_gp_strictness (pred1[1], SImode));
3062 return (pred1[0] == CONSTM1_RTX (mode)
3063 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3064 || rtx_equal_p (pred1[0], pred2));
3065 }
3066
3067 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3068 for it. PRED2[0] is the predicate for the instruction whose result
3069 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3070 for it. Return true if we can prove that the two predicates are
3071 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3072 with PRED1[0] without changing behavior. */
3073
3074 bool
3075 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3076 {
3077 machine_mode mode = GET_MODE (pred1[0]);
3078 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3079 && mode == GET_MODE (pred2[0])
3080 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3081 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3082
3083 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3084 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3085 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3086 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3087 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3088 }
3089
3090 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3091 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3092 Use TARGET as the target register if nonnull and convenient. */
3093
3094 static rtx
3095 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3096 machine_mode data_mode, rtx op1, rtx op2)
3097 {
3098 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3099 expand_operand ops[5];
3100 create_output_operand (&ops[0], target, pred_mode);
3101 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3102 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3103 create_input_operand (&ops[3], op1, data_mode);
3104 create_input_operand (&ops[4], op2, data_mode);
3105 expand_insn (icode, 5, ops);
3106 return ops[0].value;
3107 }
3108
3109 /* Use a comparison to convert integer vector SRC into MODE, which is
3110 the corresponding SVE predicate mode. Use TARGET for the result
3111 if it's nonnull and convenient. */
3112
3113 rtx
3114 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3115 {
3116 machine_mode src_mode = GET_MODE (src);
3117 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3118 src, CONST0_RTX (src_mode));
3119 }
3120
3121 /* Return the assembly token for svprfop value PRFOP. */
3122
3123 static const char *
3124 svprfop_token (enum aarch64_svprfop prfop)
3125 {
3126 switch (prfop)
3127 {
3128 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3129 AARCH64_FOR_SVPRFOP (CASE)
3130 #undef CASE
3131 case AARCH64_NUM_SVPRFOPS:
3132 break;
3133 }
3134 gcc_unreachable ();
3135 }
3136
3137 /* Return the assembly string for an SVE prefetch operation with
3138 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3139 and that SUFFIX is the format for the remaining operands. */
3140
3141 char *
3142 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3143 const char *suffix)
3144 {
3145 static char buffer[128];
3146 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3147 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3148 mnemonic, svprfop_token (prfop), suffix);
3149 gcc_assert (written < sizeof (buffer));
3150 return buffer;
3151 }
3152
3153 /* Check whether we can calculate the number of elements in PATTERN
3154 at compile time, given that there are NELTS_PER_VQ elements per
3155 128-bit block. Return the value if so, otherwise return -1. */
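/* For example, assuming a fixed 256-bit vector length (two 128-bit
blocks) and NELTS_PER_VQ == 4, there are 8 elements in total:
AARCH64_SV_MUL3 then folds to 6, AARCH64_SV_ALL folds to 8, and
AARCH64_SV_VL16 folds to 0 because more elements are requested than
are available. */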
3156
3157 HOST_WIDE_INT
3158 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3159 {
3160 unsigned int vl, const_vg;
3161 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3162 vl = 1 + (pattern - AARCH64_SV_VL1);
3163 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3164 vl = 16 << (pattern - AARCH64_SV_VL16);
3165 else if (aarch64_sve_vg.is_constant (&const_vg))
3166 {
3167 /* There are two vector granules per quadword. */
3168 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3169 switch (pattern)
3170 {
3171 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3172 case AARCH64_SV_MUL4: return nelts & -4;
3173 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3174 case AARCH64_SV_ALL: return nelts;
3175 default: gcc_unreachable ();
3176 }
3177 }
3178 else
3179 return -1;
3180
3181 /* There are two vector granules per quadword. */
3182 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3183 if (known_le (vl, nelts_all))
3184 return vl;
3185
3186 /* Requesting more elements than are available results in a PFALSE. */
3187 if (known_gt (vl, nelts_all))
3188 return 0;
3189
3190 return -1;
3191 }
3192
3193 /* Return true if we can move VALUE into a register using a single
3194 CNT[BHWD] instruction. */
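/* For example, the value 12 * VQ (where VQ is the number of 128-bit
blocks in a vector) is represented as the poly_int64 (12, 12) and can
be loaded with CNTW MUL #3: the factor 12 is even, lies in [2, 256]
and is at most 16 times its lowest set bit (4). */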
3195
3196 static bool
3197 aarch64_sve_cnt_immediate_p (poly_int64 value)
3198 {
3199 HOST_WIDE_INT factor = value.coeffs[0];
3200 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3201 return (value.coeffs[1] == factor
3202 && IN_RANGE (factor, 2, 16 * 16)
3203 && (factor & 1) == 0
3204 && factor <= 16 * (factor & -factor));
3205 }
3206
3207 /* Likewise for rtx X. */
3208
3209 bool
3210 aarch64_sve_cnt_immediate_p (rtx x)
3211 {
3212 poly_int64 value;
3213 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3214 }
3215
3216 /* Return the asm string for an instruction with a CNT-like vector size
3217 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3218 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3219 first part of the operands template (the part that comes before the
3220 vector size itself). PATTERN is the pattern to use. FACTOR is the
3221 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3222 in each quadword. If it is zero, we can use any element size. */
3223
3224 static char *
3225 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3226 aarch64_svpattern pattern,
3227 unsigned int factor,
3228 unsigned int nelts_per_vq)
3229 {
3230 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3231
3232 if (nelts_per_vq == 0)
3233 /* There is some overlap in the ranges of the four CNT instructions.
3234 Here we always use the smallest possible element size, so that the
3235 multiplier is 1 wherever possible. */
3236 nelts_per_vq = factor & -factor;
3237 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3238 gcc_assert (IN_RANGE (shift, 1, 4));
3239 char suffix = "dwhb"[shift - 1];
3240
3241 factor >>= shift;
3242 unsigned int written;
3243 if (pattern == AARCH64_SV_ALL && factor == 1)
3244 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3245 prefix, suffix, operands);
3246 else if (factor == 1)
3247 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3248 prefix, suffix, operands, svpattern_token (pattern));
3249 else
3250 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3251 prefix, suffix, operands, svpattern_token (pattern),
3252 factor);
3253 gcc_assert (written < sizeof (buffer));
3254 return buffer;
3255 }
3256
3257 /* Return the asm string for an instruction with a CNT-like vector size
3258 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3259 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3260 first part of the operands template (the part that comes before the
3261 vector size itself). X is the value of the vector size operand,
3262 as a polynomial integer rtx; we need to convert this into an "all"
3263 pattern with a multiplier. */
3264
3265 char *
3266 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3267 rtx x)
3268 {
3269 poly_int64 value = rtx_to_poly_int64 (x);
3270 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3271 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3272 value.coeffs[1], 0);
3273 }
3274
3275 /* Return the asm string for an instruction with a CNT-like vector size
3276 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3277 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3278 first part of the operands template (the part that comes before the
3279 vector size itself). CNT_PAT[0..2] are the operands of the
3280 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3281
3282 char *
3283 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3284 const char *operands, rtx *cnt_pat)
3285 {
3286 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3287 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3288 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3289 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3290 factor, nelts_per_vq);
3291 }
3292
3293 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3294
3295 bool
3296 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3297 {
3298 poly_int64 value;
3299 return (poly_int_rtx_p (x, &value)
3300 && (aarch64_sve_cnt_immediate_p (value)
3301 || aarch64_sve_cnt_immediate_p (-value)));
3302 }
3303
3304 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3305 operand 0. */
3306
3307 char *
3308 aarch64_output_sve_scalar_inc_dec (rtx offset)
3309 {
3310 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3311 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3312 if (offset_value.coeffs[1] > 0)
3313 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3314 offset_value.coeffs[1], 0);
3315 else
3316 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3317 -offset_value.coeffs[1], 0);
3318 }
3319
3320 /* Return true if we can add VALUE to a register using a single ADDVL
3321 or ADDPL instruction. */
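/* For example, an offset of five SVE vectors corresponds to a factor
of 80 and is handled by ADDVL #5, while an offset of three SVE
predicates corresponds to a factor of 6 and is handled by ADDPL #3. */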
3322
3323 static bool
3324 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3325 {
3326 HOST_WIDE_INT factor = value.coeffs[0];
3327 if (factor == 0 || value.coeffs[1] != factor)
3328 return false;
3329 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3330 and a value of 16 is one vector width. */
3331 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3332 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3333 }
3334
3335 /* Likewise for rtx X. */
3336
3337 bool
3338 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3339 {
3340 poly_int64 value;
3341 return (poly_int_rtx_p (x, &value)
3342 && aarch64_sve_addvl_addpl_immediate_p (value));
3343 }
3344
3345 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3346 to operand 1 and storing the result in operand 0. */
3347
3348 char *
3349 aarch64_output_sve_addvl_addpl (rtx offset)
3350 {
3351 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3352 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3353 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3354
3355 int factor = offset_value.coeffs[1];
3356 if ((factor & 15) == 0)
3357 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3358 else
3359 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3360 return buffer;
3361 }
3362
3363 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3364 instruction. If it is, store the number of elements in each vector
3365 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3366 factor in *FACTOR_OUT (if nonnull). */
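/* For example, a VNx8HImode vector in which every element is the
poly_int64 (24, 24) has NELTS_PER_VQ == 8 and FACTOR == 24, which is
a multiple of 8 within [8, 128], so it can be handled by INCH with
MUL #3. */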
3367
3368 bool
3369 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3370 unsigned int *nelts_per_vq_out)
3371 {
3372 rtx elt;
3373 poly_int64 value;
3374
3375 if (!const_vec_duplicate_p (x, &elt)
3376 || !poly_int_rtx_p (elt, &value))
3377 return false;
3378
3379 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3380 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3381 /* There's no vector INCB. */
3382 return false;
3383
3384 HOST_WIDE_INT factor = value.coeffs[0];
3385 if (value.coeffs[1] != factor)
3386 return false;
3387
3388 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3389 if ((factor % nelts_per_vq) != 0
3390 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3391 return false;
3392
3393 if (factor_out)
3394 *factor_out = factor;
3395 if (nelts_per_vq_out)
3396 *nelts_per_vq_out = nelts_per_vq;
3397 return true;
3398 }
3399
3400 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3401 instruction. */
3402
3403 bool
3404 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3405 {
3406 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3407 }
3408
3409 /* Return the asm template for an SVE vector INC or DEC instruction.
3410 OPERANDS gives the operands before the vector count and X is the
3411 value of the vector count operand itself. */
3412
3413 char *
3414 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3415 {
3416 int factor;
3417 unsigned int nelts_per_vq;
3418 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3419 gcc_unreachable ();
3420 if (factor < 0)
3421 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3422 -factor, nelts_per_vq);
3423 else
3424 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3425 factor, nelts_per_vq);
3426 }
3427
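/* Return the number of instructions needed to move the CONST_INT IMM
of mode MODE into DEST, emitting them if GENERATE is true. For
example, 0xffffffffffff1234 is a single MOVN, while 0x12345678 needs
a MOVZ followed by a MOVK. */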
3428 static int
3429 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3430 scalar_int_mode mode)
3431 {
3432 int i;
3433 unsigned HOST_WIDE_INT val, val2, mask;
3434 int one_match, zero_match;
3435 int num_insns;
3436
3437 val = INTVAL (imm);
3438
3439 if (aarch64_move_imm (val, mode))
3440 {
3441 if (generate)
3442 emit_insn (gen_rtx_SET (dest, imm));
3443 return 1;
3444 }
3445
3446 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3447 (with XXXX non-zero). In that case check to see if the move can be done in
3448 a smaller mode. */
3449 val2 = val & 0xffffffff;
3450 if (mode == DImode
3451 && aarch64_move_imm (val2, SImode)
3452 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3453 {
3454 if (generate)
3455 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3456
3457 /* Check whether we have to emit a second instruction by seeing
3458 whether any of the upper 32 bits of the original DImode value are set. */
3459 if (val == val2)
3460 return 1;
3461
3462 i = (val >> 48) ? 48 : 32;
3463
3464 if (generate)
3465 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3466 GEN_INT ((val >> i) & 0xffff)));
3467
3468 return 2;
3469 }
3470
3471 if ((val >> 32) == 0 || mode == SImode)
3472 {
3473 if (generate)
3474 {
3475 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3476 if (mode == SImode)
3477 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3478 GEN_INT ((val >> 16) & 0xffff)));
3479 else
3480 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3481 GEN_INT ((val >> 16) & 0xffff)));
3482 }
3483 return 2;
3484 }
3485
3486 /* Remaining cases are all for DImode. */
3487
3488 mask = 0xffff;
3489 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3490 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3491 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3492 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3493
3494 if (zero_match != 2 && one_match != 2)
3495 {
3496 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3497 For a 64-bit bitmask try whether changing 16 bits to all ones or
3498 zeroes creates a valid bitmask. To check any repeated bitmask,
3499 try using 16 bits from the other 32-bit half of val. */
3500
3501 for (i = 0; i < 64; i += 16, mask <<= 16)
3502 {
3503 val2 = val & ~mask;
3504 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3505 break;
3506 val2 = val | mask;
3507 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3508 break;
3509 val2 = val2 & ~mask;
3510 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3511 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3512 break;
3513 }
3514 if (i != 64)
3515 {
3516 if (generate)
3517 {
3518 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3519 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3520 GEN_INT ((val >> i) & 0xffff)));
3521 }
3522 return 2;
3523 }
3524 }
3525
3526 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3527 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3528 otherwise skip zero bits. */
3529
3530 num_insns = 1;
3531 mask = 0xffff;
3532 val2 = one_match > zero_match ? ~val : val;
3533 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3534
3535 if (generate)
3536 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3537 ? (val | ~(mask << i))
3538 : (val & (mask << i)))));
3539 for (i += 16; i < 64; i += 16)
3540 {
3541 if ((val2 & (mask << i)) == 0)
3542 continue;
3543 if (generate)
3544 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3545 GEN_INT ((val >> i) & 0xffff)));
3546 num_insns ++;
3547 }
3548
3549 return num_insns;
3550 }
3551
3552 /* Return whether imm is a 128-bit immediate which is simple enough to
3553 expand inline. */
3554 bool
3555 aarch64_mov128_immediate (rtx imm)
3556 {
3557 if (GET_CODE (imm) == CONST_INT)
3558 return true;
3559
3560 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3561
3562 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3563 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3564
3565 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3566 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3567 }
3568
3569
3570 /* Return the number of temporary registers that aarch64_add_offset_1
3571 would need to add OFFSET to a register. */
3572
3573 static unsigned int
3574 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3575 {
3576 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3577 }
3578
3579 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3580 a non-polynomial OFFSET. MODE is the mode of the addition.
3581 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3582 be set and CFA adjustments added to the generated instructions.
3583
3584 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3585 temporary if register allocation is already complete. This temporary
3586 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3587 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3588 the immediate again.
3589
3590 Since this function may be used to adjust the stack pointer, we must
3591 ensure that it cannot cause transient stack deallocation (for example
3592 by first incrementing SP and then decrementing when adjusting by a
3593 large immediate). */
3594
3595 static void
3596 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3597 rtx src, HOST_WIDE_INT offset, rtx temp1,
3598 bool frame_related_p, bool emit_move_imm)
3599 {
3600 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3601 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3602
3603 HOST_WIDE_INT moffset = abs_hwi (offset);
3604 rtx_insn *insn;
3605
3606 if (!moffset)
3607 {
3608 if (!rtx_equal_p (dest, src))
3609 {
3610 insn = emit_insn (gen_rtx_SET (dest, src));
3611 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3612 }
3613 return;
3614 }
3615
3616 /* Single instruction adjustment. */
3617 if (aarch64_uimm12_shift (moffset))
3618 {
3619 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3620 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3621 return;
3622 }
3623
3624 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3625 and either:
3626
3627 a) the offset cannot be loaded by a 16-bit move or
3628 b) there is no spare register into which we can move it. */
3629 if (moffset < 0x1000000
3630 && ((!temp1 && !can_create_pseudo_p ())
3631 || !aarch64_move_imm (moffset, mode)))
3632 {
3633 HOST_WIDE_INT low_off = moffset & 0xfff;
3634
3635 low_off = offset < 0 ? -low_off : low_off;
3636 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3637 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3638 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3639 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3640 return;
3641 }
3642
3643 /* Emit a move immediate if required and an addition/subtraction. */
3644 if (emit_move_imm)
3645 {
3646 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3647 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3648 }
3649 insn = emit_insn (offset < 0
3650 ? gen_sub3_insn (dest, src, temp1)
3651 : gen_add3_insn (dest, src, temp1));
3652 if (frame_related_p)
3653 {
3654 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3655 rtx adj = plus_constant (mode, src, offset);
3656 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3657 }
3658 }
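
/* Editorial note: an illustrative sketch of the paths above (offsets are
   assumptions, not from this file).  OFFSET = 0x123456 is below 0x1000000
   and not a valid move immediate, so it is split into two additions:

       add  dest, src, #0x456
       add  dest, dest, #0x123000

   A larger offset such as 0x12345678 instead takes the final path: the
   absolute value is moved into TEMP1 and a single ADD or SUB follows.  */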
3659
3660 /* Return the number of temporary registers that aarch64_add_offset
3661 would need to move OFFSET into a register or add OFFSET to a register;
3662 ADD_P is true if we want the latter rather than the former. */
3663
3664 static unsigned int
3665 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3666 {
3667 /* This follows the same structure as aarch64_add_offset. */
3668 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3669 return 0;
3670
3671 unsigned int count = 0;
3672 HOST_WIDE_INT factor = offset.coeffs[1];
3673 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3674 poly_int64 poly_offset (factor, factor);
3675 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3676 /* Need one register for the ADDVL/ADDPL result. */
3677 count += 1;
3678 else if (factor != 0)
3679 {
3680 factor = abs (factor);
3681 if (factor > 16 * (factor & -factor))
3682 /* Need one register for the CNT result and one for the multiplication
3683 factor. If necessary, the second temporary can be reused for the
3684 constant part of the offset. */
3685 return 2;
3686 /* Need one register for the CNT result (which might then
3687 be shifted). */
3688 count += 1;
3689 }
3690 return count + aarch64_add_offset_1_temporaries (constant);
3691 }
3692
3693 /* If X can be represented as a poly_int64, return the number
3694 of temporaries that are required to add it to a register.
3695 Return -1 otherwise. */
3696
3697 int
3698 aarch64_add_offset_temporaries (rtx x)
3699 {
3700 poly_int64 offset;
3701 if (!poly_int_rtx_p (x, &offset))
3702 return -1;
3703 return aarch64_offset_temporaries (true, offset);
3704 }
3705
3706 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3707 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3708 be set and CFA adjustments added to the generated instructions.
3709
3710 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3711 temporary if register allocation is already complete. This temporary
3712 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3713 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3714 false to avoid emitting the immediate again.
3715
3716 TEMP2, if nonnull, is a second temporary register that doesn't
3717 overlap either DEST or SRC.
3718
3719 Since this function may be used to adjust the stack pointer, we must
3720 ensure that it cannot cause transient stack deallocation (for example
3721 by first incrementing SP and then decrementing when adjusting by a
3722 large immediate). */
3723
3724 static void
3725 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3726 poly_int64 offset, rtx temp1, rtx temp2,
3727 bool frame_related_p, bool emit_move_imm = true)
3728 {
3729 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3730 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3731 gcc_assert (temp1 == NULL_RTX
3732 || !frame_related_p
3733 || !reg_overlap_mentioned_p (temp1, dest));
3734 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3735
3736 /* Try using ADDVL or ADDPL to add the whole value. */
3737 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3738 {
3739 rtx offset_rtx = gen_int_mode (offset, mode);
3740 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3741 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3742 return;
3743 }
3744
3745 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3746 SVE vector register, over and above the minimum size of 128 bits.
3747 This is equivalent to half the value returned by CNTD with a
3748 vector shape of ALL. */
3749 HOST_WIDE_INT factor = offset.coeffs[1];
3750 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3751
3752 /* Try using ADDVL or ADDPL to add the VG-based part. */
3753 poly_int64 poly_offset (factor, factor);
3754 if (src != const0_rtx
3755 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3756 {
3757 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3758 if (frame_related_p)
3759 {
3760 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3761 RTX_FRAME_RELATED_P (insn) = true;
3762 src = dest;
3763 }
3764 else
3765 {
3766 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3767 src = aarch64_force_temporary (mode, temp1, addr);
3768 temp1 = temp2;
3769 temp2 = NULL_RTX;
3770 }
3771 }
3772 /* Otherwise use a CNT-based sequence. */
3773 else if (factor != 0)
3774 {
3775 /* Use a subtraction if we have a negative factor. */
3776 rtx_code code = PLUS;
3777 if (factor < 0)
3778 {
3779 factor = -factor;
3780 code = MINUS;
3781 }
3782
3783 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3784 into the multiplication. */
3785 rtx val;
3786 int shift = 0;
3787 if (factor & 1)
3788 /* Use a right shift by 1. */
3789 shift = -1;
3790 else
3791 factor /= 2;
3792 HOST_WIDE_INT low_bit = factor & -factor;
3793 if (factor <= 16 * low_bit)
3794 {
3795 if (factor > 16 * 8)
3796 {
3797 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3798 the value with the minimum multiplier and shift it into
3799 position. */
3800 int extra_shift = exact_log2 (low_bit);
3801 shift += extra_shift;
3802 factor >>= extra_shift;
3803 }
3804 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3805 }
3806 else
3807 {
3808 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3809 directly, since that should increase the chances of being
3810 able to use a shift and add sequence. If LOW_BIT itself
3811 is out of range, just use CNTD. */
3812 if (low_bit <= 16 * 8)
3813 factor /= low_bit;
3814 else
3815 low_bit = 1;
3816
3817 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3818 val = aarch64_force_temporary (mode, temp1, val);
3819
3820 if (can_create_pseudo_p ())
3821 {
3822 rtx coeff1 = gen_int_mode (factor, mode);
3823 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3824 }
3825 else
3826 {
3827 /* Go back to using a negative multiplication factor if we have
3828 no register from which to subtract. */
3829 if (code == MINUS && src == const0_rtx)
3830 {
3831 factor = -factor;
3832 code = PLUS;
3833 }
3834 rtx coeff1 = gen_int_mode (factor, mode);
3835 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3836 val = gen_rtx_MULT (mode, val, coeff1);
3837 }
3838 }
3839
3840 if (shift > 0)
3841 {
3842 /* Multiply by 1 << SHIFT. */
3843 val = aarch64_force_temporary (mode, temp1, val);
3844 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3845 }
3846 else if (shift == -1)
3847 {
3848 /* Divide by 2. */
3849 val = aarch64_force_temporary (mode, temp1, val);
3850 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3851 }
3852
3853 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3854 if (src != const0_rtx)
3855 {
3856 val = aarch64_force_temporary (mode, temp1, val);
3857 val = gen_rtx_fmt_ee (code, mode, src, val);
3858 }
3859 else if (code == MINUS)
3860 {
3861 val = aarch64_force_temporary (mode, temp1, val);
3862 val = gen_rtx_NEG (mode, val);
3863 }
3864
3865 if (constant == 0 || frame_related_p)
3866 {
3867 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3868 if (frame_related_p)
3869 {
3870 RTX_FRAME_RELATED_P (insn) = true;
3871 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3872 gen_rtx_SET (dest, plus_constant (Pmode, src,
3873 poly_offset)));
3874 }
3875 src = dest;
3876 if (constant == 0)
3877 return;
3878 }
3879 else
3880 {
3881 src = aarch64_force_temporary (mode, temp1, val);
3882 temp1 = temp2;
3883 temp2 = NULL_RTX;
3884 }
3885
3886 emit_move_imm = true;
3887 }
3888
3889 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3890 frame_related_p, emit_move_imm);
3891 }
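
/* Editorial note: a hedged illustration of the poly_int handling above.
   An offset of one whole SVE vector, poly_int64 (16, 16) bytes, satisfies
   aarch64_sve_addvl_addpl_immediate_p and so becomes a single

       addvl sp, sp, #1

   when adjusting the stack pointer, whereas factors that cannot be
   expressed as ADDVL/ADDPL immediates fall through to the CNT-based
   multiply/shift sequence above.  */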
3892
3893 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3894 than a poly_int64. */
3895
3896 void
3897 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3898 rtx offset_rtx, rtx temp1, rtx temp2)
3899 {
3900 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3901 temp1, temp2, false);
3902 }
3903
3904 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3905 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3906 if TEMP1 already contains abs (DELTA). */
3907
3908 static inline void
3909 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3910 {
3911 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3912 temp1, temp2, true, emit_move_imm);
3913 }
3914
3915 /* Subtract DELTA from the stack pointer, marking the instructions
3916 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3917 if nonnull. */
3918
3919 static inline void
3920 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3921 bool emit_move_imm = true)
3922 {
3923 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3924 temp1, temp2, frame_related_p, emit_move_imm);
3925 }
3926
3927 /* Set DEST to (vec_series BASE STEP). */
3928
3929 static void
3930 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3931 {
3932 machine_mode mode = GET_MODE (dest);
3933 scalar_mode inner = GET_MODE_INNER (mode);
3934
3935 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3936 if (!aarch64_sve_index_immediate_p (base))
3937 base = force_reg (inner, base);
3938 if (!aarch64_sve_index_immediate_p (step))
3939 step = force_reg (inner, step);
3940
3941 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3942 }
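
/* Editorial note: a brief illustration (operands are assumptions).  With
   BASE = 0 and STEP = 1 in VNx4SImode the VEC_SERIES above maps onto a
   single "index z0.s, #0, #1"; operands outside the [-16, 15] range are
   first forced into scalar registers, giving the register forms of
   INDEX.  */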
3943
3944 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3945 register of mode MODE. Use TARGET for the result if it's nonnull
3946 and convenient.
3947
3948 The two vector modes must have the same element mode. The behavior
3949 is to duplicate architectural lane N of SRC into architectural lanes
3950 N + I * STEP of the result. On big-endian targets, architectural
3951 lane 0 of an Advanced SIMD vector is the last element of the vector
3952 in memory layout, so for big-endian targets this operation has the
3953 effect of reversing SRC before duplicating it. Callers need to
3954 account for this. */
3955
3956 rtx
3957 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3958 {
3959 machine_mode src_mode = GET_MODE (src);
3960 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3961 insn_code icode = (BYTES_BIG_ENDIAN
3962 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3963 : code_for_aarch64_vec_duplicate_vq_le (mode));
3964
3965 unsigned int i = 0;
3966 expand_operand ops[3];
3967 create_output_operand (&ops[i++], target, mode);
3968 create_output_operand (&ops[i++], src, src_mode);
3969 if (BYTES_BIG_ENDIAN)
3970 {
3971 /* Create a PARALLEL describing the reversal of SRC. */
3972 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3973 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3974 nelts_per_vq - 1, -1);
3975 create_fixed_operand (&ops[i++], sel);
3976 }
3977 expand_insn (icode, i, ops);
3978 return ops[0].value;
3979 }
3980
3981 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3982 the memory image into DEST. Return true on success. */
3983
3984 static bool
3985 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3986 {
3987 src = force_const_mem (GET_MODE (src), src);
3988 if (!src)
3989 return false;
3990
3991 /* Make sure that the address is legitimate. */
3992 if (!aarch64_sve_ld1rq_operand_p (src))
3993 {
3994 rtx addr = force_reg (Pmode, XEXP (src, 0));
3995 src = replace_equiv_address (src, addr);
3996 }
3997
3998 machine_mode mode = GET_MODE (dest);
3999 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
4000 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
4001 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4002 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4003 return true;
4004 }
4005
4006 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4007 SVE data mode and isn't a legitimate constant. Use TARGET for the
4008 result if convenient.
4009
4010 The returned register can have whatever mode seems most natural
4011 given the contents of SRC. */
4012
4013 static rtx
4014 aarch64_expand_sve_const_vector (rtx target, rtx src)
4015 {
4016 machine_mode mode = GET_MODE (src);
4017 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4018 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4019 scalar_mode elt_mode = GET_MODE_INNER (mode);
4020 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4021 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
4022
4023 if (nelts_per_pattern == 1 && encoded_bits == 128)
4024 {
4025 /* The constant is a duplicated quadword but can't be narrowed
4026 beyond a quadword. Get the memory image of the first quadword
4027 as a 128-bit vector and try using LD1RQ to load it from memory.
4028
4029 The effect for both endiannesses is to load memory lane N into
4030 architectural lanes N + I * STEP of the result. On big-endian
4031 targets, the layout of the 128-bit vector in an Advanced SIMD
4032 register would be different from its layout in an SVE register,
4033 but this 128-bit vector is a memory value only. */
4034 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4035 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4036 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4037 return target;
4038 }
4039
4040 if (nelts_per_pattern == 1 && encoded_bits < 128)
4041 {
4042 /* The vector is a repeating sequence of 64 bits or fewer.
4043 See if we can load them using an Advanced SIMD move and then
4044 duplicate it to fill a vector. This is better than using a GPR
4045 move because it keeps everything in the same register file. */
4046 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4047 rtx_vector_builder builder (vq_mode, npatterns, 1);
4048 for (unsigned int i = 0; i < npatterns; ++i)
4049 {
4050 /* We want memory lane N to go into architectural lane N,
4051 so reverse for big-endian targets. The DUP .Q pattern
4052 has a compensating reverse built-in. */
4053 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4054 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4055 }
4056 rtx vq_src = builder.build ();
4057 if (aarch64_simd_valid_immediate (vq_src, NULL))
4058 {
4059 vq_src = force_reg (vq_mode, vq_src);
4060 return aarch64_expand_sve_dupq (target, mode, vq_src);
4061 }
4062
4063 /* Get an integer representation of the repeating part of Advanced
4064 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4065 which for big-endian targets is lane-swapped wrt a normal
4066 Advanced SIMD vector. This means that for both endiannesses,
4067 memory lane N of SVE vector SRC corresponds to architectural
4068 lane N of a register holding VQ_SRC. This in turn means that
4069 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4070 as a single 128-bit value) and thus that memory lane 0 of SRC is
4071 in the lsb of the integer. Duplicating the integer therefore
4072 ensures that memory lane N of SRC goes into architectural lane
4073 N + I * STEP of the SVE register. */
4074 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4075 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4076 if (elt_value)
4077 {
4078 /* Pretend that we had a vector of INT_MODE to start with. */
4079 elt_mode = int_mode;
4080 mode = aarch64_full_sve_mode (int_mode).require ();
4081
4082 /* If the integer can be moved into a general register by a
4083 single instruction, do that and duplicate the result. */
4084 if (CONST_INT_P (elt_value)
4085 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4086 {
4087 elt_value = force_reg (elt_mode, elt_value);
4088 return expand_vector_broadcast (mode, elt_value);
4089 }
4090 }
4091 else if (npatterns == 1)
4092 /* We're duplicating a single value, but can't do better than
4093 force it to memory and load from there. This handles things
4094 like symbolic constants. */
4095 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4096
4097 if (elt_value)
4098 {
4099 /* Load the element from memory if we can, otherwise move it into
4100 a register and use a DUP. */
4101 rtx op = force_const_mem (elt_mode, elt_value);
4102 if (!op)
4103 op = force_reg (elt_mode, elt_value);
4104 return expand_vector_broadcast (mode, op);
4105 }
4106 }
4107
4108 /* Try using INDEX. */
4109 rtx base, step;
4110 if (const_vec_series_p (src, &base, &step))
4111 {
4112 aarch64_expand_vec_series (target, base, step);
4113 return target;
4114 }
4115
4116 /* From here on, it's better to force the whole constant to memory
4117 if we can. */
4118 if (GET_MODE_NUNITS (mode).is_constant ())
4119 return NULL_RTX;
4120
4121 /* Expand each pattern individually. */
4122 gcc_assert (npatterns > 1);
4123 rtx_vector_builder builder;
4124 auto_vec<rtx, 16> vectors (npatterns);
4125 for (unsigned int i = 0; i < npatterns; ++i)
4126 {
4127 builder.new_vector (mode, 1, nelts_per_pattern);
4128 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4129 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4130 vectors.quick_push (force_reg (mode, builder.build ()));
4131 }
4132
4133 /* Use permutes to interleave the separate vectors. */
4134 while (npatterns > 1)
4135 {
4136 npatterns /= 2;
4137 for (unsigned int i = 0; i < npatterns; ++i)
4138 {
4139 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4140 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4141 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4142 vectors[i] = tmp;
4143 }
4144 }
4145 gcc_assert (vectors[0] == target);
4146 return target;
4147 }
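
/* Editorial note: a hedged sketch of the final permute step above, for
   the case where none of the cheaper strategies apply.  A constant with
   four patterns such as { 1, 2, 3, 4, 1, 2, 3, 4, ... } is first built
   as four duplicated vectors and then interleaved pairwise:

       zip1  t0, v1, v3      // { 1, 3, 1, 3, ... }
       zip1  t1, v2, v4      // { 2, 4, 2, 4, ... }
       zip1  dest, t0, t1    // { 1, 2, 3, 4, ... }

   where vN holds the duplicated value N.  */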
4148
4149 /* Use WHILE to set a predicate register of mode MODE in which the first
4150 VL bits are set and the rest are clear. Use TARGET for the register
4151 if it's nonnull and convenient. */
4152
4153 static rtx
4154 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4155 unsigned int vl)
4156 {
4157 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4158 target = aarch64_target_reg (target, mode);
4159 emit_insn (gen_while (UNSPEC_WHILE_LO, DImode, mode,
4160 target, const0_rtx, limit));
4161 return target;
4162 }
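
/* Editorial note: an illustrative sketch of the helper above.  For
   MODE = VNx4BImode and VL = 3 it emits roughly

       mov      x0, 3
       whilelo  p0.s, xzr, x0

   which sets the first three .S predicate elements and clears the rest
   (register numbers are illustrative only).  */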
4163
4164 static rtx
4165 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4166
4167 /* BUILDER is a constant predicate in which the index of every set bit
4168 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4169 by inverting every element at a multiple of ELT_SIZE and EORing the
4170 result with an ELT_SIZE PTRUE.
4171
4172 Return a register that contains the constant on success, otherwise
4173 return null. Use TARGET as the register if it is nonnull and
4174 convenient. */
4175
4176 static rtx
4177 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4178 unsigned int elt_size)
4179 {
4180 /* Invert every element at a multiple of ELT_SIZE, keeping the
4181 other bits zero. */
4182 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4183 builder.nelts_per_pattern ());
4184 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4185 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4186 inv_builder.quick_push (const1_rtx);
4187 else
4188 inv_builder.quick_push (const0_rtx);
4189 inv_builder.finalize ();
4190
4191 /* See if we can load the constant cheaply. */
4192 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4193 if (!inv)
4194 return NULL_RTX;
4195
4196 /* EOR the result with an ELT_SIZE PTRUE. */
4197 rtx mask = aarch64_ptrue_all (elt_size);
4198 mask = force_reg (VNx16BImode, mask);
4199 target = aarch64_target_reg (target, VNx16BImode);
4200 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4201 return target;
4202 }
4203
4204 /* BUILDER is a constant predicate in which the index of every set bit
4205 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4206 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4207 register on success, otherwise return null. Use TARGET as the register
4208 if nonnull and convenient. */
4209
4210 static rtx
4211 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4212 unsigned int elt_size,
4213 unsigned int permute_size)
4214 {
4215 /* We're going to split the constant into two new constants A and B,
4216 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4217 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4218
4219 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4220 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4221
4222 where _ indicates elements that will be discarded by the permute.
4223
4224 First calculate the ELT_SIZEs for A and B. */
4225 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4226 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4227 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4228 if (INTVAL (builder.elt (i)) != 0)
4229 {
4230 if (i & permute_size)
4231 b_elt_size |= i - permute_size;
4232 else
4233 a_elt_size |= i;
4234 }
4235 a_elt_size &= -a_elt_size;
4236 b_elt_size &= -b_elt_size;
4237
4238 /* Now construct the vectors themselves. */
4239 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4240 builder.nelts_per_pattern ());
4241 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4242 builder.nelts_per_pattern ());
4243 unsigned int nelts = builder.encoded_nelts ();
4244 for (unsigned int i = 0; i < nelts; ++i)
4245 if (i & (elt_size - 1))
4246 {
4247 a_builder.quick_push (const0_rtx);
4248 b_builder.quick_push (const0_rtx);
4249 }
4250 else if ((i & permute_size) == 0)
4251 {
4252 /* The A and B elements are significant. */
4253 a_builder.quick_push (builder.elt (i));
4254 b_builder.quick_push (builder.elt (i + permute_size));
4255 }
4256 else
4257 {
4258 /* The A and B elements are going to be discarded, so pick whatever
4259 is likely to give a nice constant. We are targeting element
4260 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4261 with the aim of each being a sequence of ones followed by
4262 a sequence of zeros. So:
4263
4264 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4265 duplicate the last X_ELT_SIZE element, to extend the
4266 current sequence of ones or zeros.
4267
4268 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4269 zero, so that the constant really does have X_ELT_SIZE and
4270 not a smaller size. */
4271 if (a_elt_size > permute_size)
4272 a_builder.quick_push (const0_rtx);
4273 else
4274 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4275 if (b_elt_size > permute_size)
4276 b_builder.quick_push (const0_rtx);
4277 else
4278 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4279 }
4280 a_builder.finalize ();
4281 b_builder.finalize ();
4282
4283 /* Try loading A into a register. */
4284 rtx_insn *last = get_last_insn ();
4285 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4286 if (!a)
4287 return NULL_RTX;
4288
4289 /* Try loading B into a register. */
4290 rtx b = a;
4291 if (a_builder != b_builder)
4292 {
4293 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4294 if (!b)
4295 {
4296 delete_insns_since (last);
4297 return NULL_RTX;
4298 }
4299 }
4300
4301 /* Emit the TRN1 itself. */
4302 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4303 target = aarch64_target_reg (target, mode);
4304 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4305 gen_lowpart (mode, a),
4306 gen_lowpart (mode, b)));
4307 return target;
4308 }
4309
4310 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4311 constant in BUILDER into an SVE predicate register. Return the register
4312 on success, otherwise return null. Use TARGET for the register if
4313 nonnull and convenient.
4314
4315 ALLOW_RECURSE_P is true if we can use methods that would call this
4316 function recursively. */
4317
4318 static rtx
4319 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4320 bool allow_recurse_p)
4321 {
4322 if (builder.encoded_nelts () == 1)
4323 /* A PFALSE or a PTRUE .B ALL. */
4324 return aarch64_emit_set_immediate (target, builder);
4325
4326 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4327 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4328 {
4329 /* If we can load the constant using PTRUE, use it as-is. */
4330 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4331 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4332 return aarch64_emit_set_immediate (target, builder);
4333
4334 /* Otherwise use WHILE to set the first VL bits. */
4335 return aarch64_sve_move_pred_via_while (target, mode, vl);
4336 }
4337
4338 if (!allow_recurse_p)
4339 return NULL_RTX;
4340
4341 /* Try inverting the vector in element size ELT_SIZE and then EORing
4342 the result with an ELT_SIZE PTRUE. */
4343 if (INTVAL (builder.elt (0)) == 0)
4344 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4345 elt_size))
4346 return res;
4347
4348 /* Try using TRN1 to permute two simpler constants. */
4349 for (unsigned int i = elt_size; i <= 8; i *= 2)
4350 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4351 elt_size, i))
4352 return res;
4353
4354 return NULL_RTX;
4355 }
4356
4357 /* Return an SVE predicate register that contains the VNx16BImode
4358 constant in BUILDER, without going through the move expanders.
4359
4360 The returned register can have whatever mode seems most natural
4361 given the contents of BUILDER. Use TARGET for the result if
4362 convenient. */
4363
4364 static rtx
4365 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4366 {
4367 /* Try loading the constant using pure predicate operations. */
4368 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4369 return res;
4370
4371 /* Try forcing the constant to memory. */
4372 if (builder.full_nelts ().is_constant ())
4373 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4374 {
4375 target = aarch64_target_reg (target, VNx16BImode);
4376 emit_move_insn (target, mem);
4377 return target;
4378 }
4379
4380 /* The last resort is to load the constant as an integer and then
4381 compare it against zero. Use -1 for set bits in order to increase
4382 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4383 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4384 builder.nelts_per_pattern ());
4385 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4386 int_builder.quick_push (INTVAL (builder.elt (i))
4387 ? constm1_rtx : const0_rtx);
4388 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4389 int_builder.build ());
4390 }
4391
4392 /* Set DEST to immediate IMM. */
4393
4394 void
4395 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4396 {
4397 machine_mode mode = GET_MODE (dest);
4398
4399 /* Check on what type of symbol it is. */
4400 scalar_int_mode int_mode;
4401 if ((GET_CODE (imm) == SYMBOL_REF
4402 || GET_CODE (imm) == LABEL_REF
4403 || GET_CODE (imm) == CONST
4404 || GET_CODE (imm) == CONST_POLY_INT)
4405 && is_a <scalar_int_mode> (mode, &int_mode))
4406 {
4407 rtx mem;
4408 poly_int64 offset;
4409 HOST_WIDE_INT const_offset;
4410 enum aarch64_symbol_type sty;
4411
4412 /* If we have (const (plus symbol offset)), separate out the offset
4413 before we start classifying the symbol. */
4414 rtx base = strip_offset (imm, &offset);
4415
4416 /* We must always add an offset involving VL separately, rather than
4417 folding it into the relocation. */
4418 if (!offset.is_constant (&const_offset))
4419 {
4420 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4421 emit_insn (gen_rtx_SET (dest, imm));
4422 else
4423 {
4424 /* Do arithmetic on 32-bit values if the result is smaller
4425 than that. */
4426 if (partial_subreg_p (int_mode, SImode))
4427 {
4428 /* It is invalid to do symbol calculations in modes
4429 narrower than SImode. */
4430 gcc_assert (base == const0_rtx);
4431 dest = gen_lowpart (SImode, dest);
4432 int_mode = SImode;
4433 }
4434 if (base != const0_rtx)
4435 {
4436 base = aarch64_force_temporary (int_mode, dest, base);
4437 aarch64_add_offset (int_mode, dest, base, offset,
4438 NULL_RTX, NULL_RTX, false);
4439 }
4440 else
4441 aarch64_add_offset (int_mode, dest, base, offset,
4442 dest, NULL_RTX, false);
4443 }
4444 return;
4445 }
4446
4447 sty = aarch64_classify_symbol (base, const_offset);
4448 switch (sty)
4449 {
4450 case SYMBOL_FORCE_TO_MEM:
4451 if (const_offset != 0
4452 && targetm.cannot_force_const_mem (int_mode, imm))
4453 {
4454 gcc_assert (can_create_pseudo_p ());
4455 base = aarch64_force_temporary (int_mode, dest, base);
4456 aarch64_add_offset (int_mode, dest, base, const_offset,
4457 NULL_RTX, NULL_RTX, false);
4458 return;
4459 }
4460
4461 mem = force_const_mem (ptr_mode, imm);
4462 gcc_assert (mem);
4463
4464 /* If we aren't generating PC relative literals, then
4465 we need to expand the literal pool access carefully.
4466 This is something that needs to be done in a number
4467 of places, so could well live as a separate function. */
4468 if (!aarch64_pcrelative_literal_loads)
4469 {
4470 gcc_assert (can_create_pseudo_p ());
4471 base = gen_reg_rtx (ptr_mode);
4472 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4473 if (ptr_mode != Pmode)
4474 base = convert_memory_address (Pmode, base);
4475 mem = gen_rtx_MEM (ptr_mode, base);
4476 }
4477
4478 if (int_mode != ptr_mode)
4479 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4480
4481 emit_insn (gen_rtx_SET (dest, mem));
4482
4483 return;
4484
4485 case SYMBOL_SMALL_TLSGD:
4486 case SYMBOL_SMALL_TLSDESC:
4487 case SYMBOL_SMALL_TLSIE:
4488 case SYMBOL_SMALL_GOT_28K:
4489 case SYMBOL_SMALL_GOT_4G:
4490 case SYMBOL_TINY_GOT:
4491 case SYMBOL_TINY_TLSIE:
4492 if (const_offset != 0)
4493 {
4494 gcc_assert (can_create_pseudo_p ());
4495 base = aarch64_force_temporary (int_mode, dest, base);
4496 aarch64_add_offset (int_mode, dest, base, const_offset,
4497 NULL_RTX, NULL_RTX, false);
4498 return;
4499 }
4500 /* FALLTHRU */
4501
4502 case SYMBOL_SMALL_ABSOLUTE:
4503 case SYMBOL_TINY_ABSOLUTE:
4504 case SYMBOL_TLSLE12:
4505 case SYMBOL_TLSLE24:
4506 case SYMBOL_TLSLE32:
4507 case SYMBOL_TLSLE48:
4508 aarch64_load_symref_appropriately (dest, imm, sty);
4509 return;
4510
4511 default:
4512 gcc_unreachable ();
4513 }
4514 }
4515
4516 if (!CONST_INT_P (imm))
4517 {
4518 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4519 {
4520 /* Only the low bit of each .H, .S and .D element is defined,
4521 so we can set the upper bits to whatever we like. If the
4522 predicate is all-true in MODE, prefer to set all the undefined
4523 bits as well, so that we can share a single .B predicate for
4524 all modes. */
4525 if (imm == CONSTM1_RTX (mode))
4526 imm = CONSTM1_RTX (VNx16BImode);
4527
4528 /* All methods for constructing predicate modes wider than VNx16BI
4529 will set the upper bits of each element to zero. Expose this
4530 by moving such constants as a VNx16BI, so that all bits are
4531 significant and so that constants for different modes can be
4532 shared. The wider constant will still be available as a
4533 REG_EQUAL note. */
4534 rtx_vector_builder builder;
4535 if (aarch64_get_sve_pred_bits (builder, imm))
4536 {
4537 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4538 if (dest != res)
4539 emit_move_insn (dest, gen_lowpart (mode, res));
4540 return;
4541 }
4542 }
4543
4544 if (GET_CODE (imm) == HIGH
4545 || aarch64_simd_valid_immediate (imm, NULL))
4546 {
4547 emit_insn (gen_rtx_SET (dest, imm));
4548 return;
4549 }
4550
4551 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4552 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4553 {
4554 if (dest != res)
4555 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4556 return;
4557 }
4558
4559 rtx mem = force_const_mem (mode, imm);
4560 gcc_assert (mem);
4561 emit_move_insn (dest, mem);
4562 return;
4563 }
4564
4565 aarch64_internal_mov_immediate (dest, imm, true,
4566 as_a <scalar_int_mode> (mode));
4567 }
4568
4569 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4570 that is known to contain PTRUE. */
4571
4572 void
4573 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4574 {
4575 expand_operand ops[3];
4576 machine_mode mode = GET_MODE (dest);
4577 create_output_operand (&ops[0], dest, mode);
4578 create_input_operand (&ops[1], pred, GET_MODE (pred));
4579 create_input_operand (&ops[2], src, mode);
4580 temporary_volatile_ok v (true);
4581 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4582 }
4583
4584 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4585 operand is in memory. In this case we need to use the predicated LD1
4586 and ST1 instead of LDR and STR, both for correctness on big-endian
4587 targets and because LD1 and ST1 support a wider range of addressing modes.
4588 PRED_MODE is the mode of the predicate.
4589
4590 See the comment at the head of aarch64-sve.md for details about the
4591 big-endian handling. */
4592
4593 void
4594 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4595 {
4596 machine_mode mode = GET_MODE (dest);
4597 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4598 if (!register_operand (src, mode)
4599 && !register_operand (dest, mode))
4600 {
4601 rtx tmp = gen_reg_rtx (mode);
4602 if (MEM_P (src))
4603 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4604 else
4605 emit_move_insn (tmp, src);
4606 src = tmp;
4607 }
4608 aarch64_emit_sve_pred_move (dest, ptrue, src);
4609 }
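
/* Editorial note: a hedged example of the memory-to-memory case above.
   For byte elements the copy is split into a predicated load into a
   fresh register followed by a predicated store, roughly:

       ptrue p0.b
       ld1b  z0.b, p0/z, [x0]
       st1b  z0.b, p0, [x1]

   (register numbers are illustrative only).  */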
4610
4611 /* Called only on big-endian targets. See whether an SVE vector move
4612 from SRC to DEST is effectively a REV[BHW] instruction, because at
4613 least one operand is a subreg of an SVE vector that has wider or
4614 narrower elements. Return true and emit the instruction if so.
4615
4616 For example:
4617
4618 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4619
4620 represents a VIEW_CONVERT between the following vectors, viewed
4621 in memory order:
4622
4623 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4624 R1: { [0], [1], [2], [3], ... }
4625
4626 The high part of lane X in R2 should therefore correspond to lane X*2
4627 of R1, but the register representations are:
4628
4629 msb lsb
4630 R2: ...... [1].high [1].low [0].high [0].low
4631 R1: ...... [3] [2] [1] [0]
4632
4633 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4634 We therefore need a reverse operation to swap the high and low values
4635 around.
4636
4637 This is purely an optimization. Without it we would spill the
4638 subreg operand to the stack in one mode and reload it in the
4639 other mode, which has the same effect as the REV. */
4640
4641 bool
4642 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4643 {
4644 gcc_assert (BYTES_BIG_ENDIAN);
4645 if (GET_CODE (dest) == SUBREG)
4646 dest = SUBREG_REG (dest);
4647 if (GET_CODE (src) == SUBREG)
4648 src = SUBREG_REG (src);
4649
4650 /* The optimization handles two single SVE REGs with different element
4651 sizes. */
4652 if (!REG_P (dest)
4653 || !REG_P (src)
4654 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4655 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4656 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4657 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4658 return false;
4659
4660 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4661 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4662 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4663 UNSPEC_REV_SUBREG);
4664 emit_insn (gen_rtx_SET (dest, unspec));
4665 return true;
4666 }
4667
4668 /* Return a copy of X with mode MODE, without changing its other
4669 attributes. Unlike gen_lowpart, this doesn't care whether the
4670 mode change is valid. */
4671
4672 rtx
4673 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4674 {
4675 if (GET_MODE (x) == mode)
4676 return x;
4677
4678 x = shallow_copy_rtx (x);
4679 set_mode_and_regno (x, mode, REGNO (x));
4680 return x;
4681 }
4682
4683 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4684 stored in wider integer containers. */
4685
4686 static unsigned int
4687 aarch64_sve_rev_unspec (machine_mode mode)
4688 {
4689 switch (GET_MODE_UNIT_SIZE (mode))
4690 {
4691 case 1: return UNSPEC_REVB;
4692 case 2: return UNSPEC_REVH;
4693 case 4: return UNSPEC_REVW;
4694 }
4695 gcc_unreachable ();
4696 }
4697
4698 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4699 operands. */
4700
4701 void
4702 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4703 {
4704 /* Decide which REV operation we need. The mode with wider elements
4705 determines the mode of the operands and the mode with the narrower
4706 elements determines the reverse width. */
4707 machine_mode mode_with_wider_elts = GET_MODE (dest);
4708 machine_mode mode_with_narrower_elts = GET_MODE (src);
4709 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4710 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4711 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4712
4713 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4714 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4715 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4716
4717 /* Get the operands in the appropriate modes and emit the instruction. */
4718 ptrue = gen_lowpart (pred_mode, ptrue);
4719 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4720 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4721 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4722 dest, ptrue, src));
4723 }
4724
4725 static bool
4726 aarch64_function_ok_for_sibcall (tree, tree exp)
4727 {
4728 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4729 return false;
4730
4731 return true;
4732 }
4733
4734 /* Implement TARGET_PASS_BY_REFERENCE. */
4735
4736 static bool
4737 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4738 const function_arg_info &arg)
4739 {
4740 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4741 HOST_WIDE_INT size;
4742 machine_mode dummymode;
4743 int nregs;
4744
4745 unsigned int num_zr, num_pr;
4746 if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
4747 {
4748 if (pcum && !pcum->silent_p && !TARGET_SVE)
4749 /* We can't gracefully recover at this point, so make this a
4750 fatal error. */
4751 fatal_error (input_location, "arguments of type %qT require"
4752 " the SVE ISA extension", arg.type);
4753
4754 /* Variadic SVE types are passed by reference. Normal non-variadic
4755 arguments are too if we've run out of registers. */
4756 return (!arg.named
4757 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4758 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4759 }
4760
4761 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4762 if (arg.mode == BLKmode && arg.type)
4763 size = int_size_in_bytes (arg.type);
4764 else
4765 /* No frontends can create types with variable-sized modes, so we
4766 shouldn't be asked to pass or return them. */
4767 size = GET_MODE_SIZE (arg.mode).to_constant ();
4768
4769 /* Aggregates are passed by reference based on their size. */
4770 if (arg.aggregate_type_p ())
4771 size = int_size_in_bytes (arg.type);
4772
4773 /* Variable sized arguments are always passed by reference. */
4774 if (size < 0)
4775 return true;
4776
4777 /* Can this be a candidate to be passed in fp/simd register(s)? */
4778 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4779 &dummymode, &nregs,
4780 NULL))
4781 return false;
4782
4783 /* Arguments which are variable sized or larger than 2 registers are
4784 passed by reference unless they are a homogeneous floating-point
4785 aggregate. */
4786 return size > 2 * UNITS_PER_WORD;
4787 }
4788
4789 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4790 static bool
4791 aarch64_return_in_msb (const_tree valtype)
4792 {
4793 machine_mode dummy_mode;
4794 int dummy_int;
4795
4796 /* Never happens in little-endian mode. */
4797 if (!BYTES_BIG_ENDIAN)
4798 return false;
4799
4800 /* Only composite types smaller than or equal to 16 bytes can
4801 be potentially returned in registers. */
4802 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4803 || int_size_in_bytes (valtype) <= 0
4804 || int_size_in_bytes (valtype) > 16)
4805 return false;
4806
4807 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4808 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4809 is always passed/returned in the least significant bits of fp/simd
4810 register(s). */
4811 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4812 &dummy_mode, &dummy_int, NULL))
4813 return false;
4814
4815 return true;
4816 }
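
/* Editorial note: a hedged illustration (the type is an assumption).  On
   a big-endian target a 12-byte structure such as

       struct s { int a, b, c; };

   passes the checks above (composite, 1..16 bytes, not an HFA/HVA) and is
   therefore returned in the most significant bits; aarch64_function_value
   below rounds its size up to 16 bytes and uses an integer mode of that
   width.  */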
4817
4818 /* Implement TARGET_FUNCTION_VALUE.
4819 Define how to find the value returned by a function. */
4820
4821 static rtx
4822 aarch64_function_value (const_tree type, const_tree func,
4823 bool outgoing ATTRIBUTE_UNUSED)
4824 {
4825 machine_mode mode;
4826 int unsignedp;
4827 int count;
4828 machine_mode ag_mode;
4829
4830 mode = TYPE_MODE (type);
4831 if (INTEGRAL_TYPE_P (type))
4832 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4833
4834 unsigned int num_zr, num_pr;
4835 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
4836 {
4837 /* Don't raise an error here if we're called when SVE is disabled,
4838 since this is really just a query function. Other code must
4839 do that where appropriate. */
4840 mode = TYPE_MODE_RAW (type);
4841 gcc_assert (VECTOR_MODE_P (mode)
4842 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4843
4844 if (num_zr > 0 && num_pr == 0)
4845 return gen_rtx_REG (mode, V0_REGNUM);
4846
4847 if (num_zr == 0 && num_pr == 1)
4848 return gen_rtx_REG (mode, P0_REGNUM);
4849
4850 gcc_unreachable ();
4851 }
4852
4853 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4854 returned in memory, not by value. */
4855 gcc_assert (!aarch64_sve_mode_p (mode));
4856
4857 if (aarch64_return_in_msb (type))
4858 {
4859 HOST_WIDE_INT size = int_size_in_bytes (type);
4860
4861 if (size % UNITS_PER_WORD != 0)
4862 {
4863 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4864 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4865 }
4866 }
4867
4868 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4869 &ag_mode, &count, NULL))
4870 {
4871 if (!aarch64_composite_type_p (type, mode))
4872 {
4873 gcc_assert (count == 1 && mode == ag_mode);
4874 return gen_rtx_REG (mode, V0_REGNUM);
4875 }
4876 else
4877 {
4878 int i;
4879 rtx par;
4880
4881 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4882 for (i = 0; i < count; i++)
4883 {
4884 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4885 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4886 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4887 XVECEXP (par, 0, i) = tmp;
4888 }
4889 return par;
4890 }
4891 }
4892 else
4893 return gen_rtx_REG (mode, R0_REGNUM);
4894 }
4895
4896 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4897 Return true if REGNO is the number of a hard register in which the values
4898 of a called function may come back. */
4899
4900 static bool
4901 aarch64_function_value_regno_p (const unsigned int regno)
4902 {
4903 /* A maximum of 16 bytes can be returned in the general registers. Examples
4904 of 16-byte return values are: 128-bit integers and 16-byte small
4905 structures (excluding homogeneous floating-point aggregates). */
4906 if (regno == R0_REGNUM || regno == R1_REGNUM)
4907 return true;
4908
4909 /* Up to four fp/simd registers can return a function value, e.g. a
4910 homogeneous floating-point aggregate having four members. */
4911 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4912 return TARGET_FLOAT;
4913
4914 return false;
4915 }
4916
4917 /* Implement TARGET_RETURN_IN_MEMORY.
4918
4919 If the type T of the result of a function is such that
4920 void func (T arg)
4921 would require that arg be passed as a value in a register (or set of
4922 registers) according to the parameter passing rules, then the result
4923 is returned in the same registers as would be used for such an
4924 argument. */
4925
4926 static bool
4927 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4928 {
4929 HOST_WIDE_INT size;
4930 machine_mode ag_mode;
4931 int count;
4932
4933 if (!AGGREGATE_TYPE_P (type)
4934 && TREE_CODE (type) != COMPLEX_TYPE
4935 && TREE_CODE (type) != VECTOR_TYPE)
4936 /* Simple scalar types are always returned in registers. */
4937 return false;
4938
4939 unsigned int num_zr, num_pr;
4940 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
4941 {
4942 /* All SVE types we support fit in registers. For example, it isn't
4943 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
4944 predicates. */
4945 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
4946 return false;
4947 }
4948
4949 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4950 type,
4951 &ag_mode,
4952 &count,
4953 NULL))
4954 return false;
4955
4956 /* Types larger than 2 registers are returned in memory. */
4957 size = int_size_in_bytes (type);
4958 return (size < 0 || size > 2 * UNITS_PER_WORD);
4959 }
4960
4961 static bool
4962 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4963 const_tree type, int *nregs)
4964 {
4965 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4966 return aarch64_vfp_is_call_or_return_candidate (mode,
4967 type,
4968 &pcum->aapcs_vfp_rmode,
4969 nregs,
4970 NULL);
4971 }
4972
4973 /* Given MODE and TYPE of a function argument, return the alignment in
4974 bits. The idea is to suppress any stronger alignment requested by
4975 the user and opt for the natural alignment (specified in AAPCS64 \S
4976 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4977 calculated in versions of GCC prior to GCC-9. This is a helper
4978 function for local use only. */
4979
4980 static unsigned int
4981 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4982 bool *abi_break)
4983 {
4984 *abi_break = false;
4985 if (!type)
4986 return GET_MODE_ALIGNMENT (mode);
4987
4988 if (integer_zerop (TYPE_SIZE (type)))
4989 return 0;
4990
4991 gcc_assert (TYPE_MODE (type) == mode);
4992
4993 if (!AGGREGATE_TYPE_P (type))
4994 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4995
4996 if (TREE_CODE (type) == ARRAY_TYPE)
4997 return TYPE_ALIGN (TREE_TYPE (type));
4998
4999 unsigned int alignment = 0;
5000 unsigned int bitfield_alignment = 0;
5001 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5002 if (TREE_CODE (field) == FIELD_DECL)
5003 {
5004 alignment = std::max (alignment, DECL_ALIGN (field));
5005 if (DECL_BIT_FIELD_TYPE (field))
5006 bitfield_alignment
5007 = std::max (bitfield_alignment,
5008 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5009 }
5010
5011 if (bitfield_alignment > alignment)
5012 {
5013 *abi_break = true;
5014 return bitfield_alignment;
5015 }
5016
5017 return alignment;
5018 }
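
/* Editorial note: a hedged example of the bit-field rule above (the type
   is an assumption, not taken from this file).  For

       struct s { char c; long long b : 1; };

   the declared bit-field type (long long) is 64-bit aligned; if, as is
   usual, that exceeds the maximum DECL_ALIGN of the fields, the function
   returns 64 bits and sets *ABI_BREAK, since releases before GCC 9.1
   used the smaller value.  */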
5019
5020 /* Lay out a function argument according to the AAPCS64 rules. The rule
5021 numbers below refer to the corresponding rules in the AAPCS64. */
5022
5023 static void
5024 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5025 {
5026 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5027 tree type = arg.type;
5028 machine_mode mode = arg.mode;
5029 int ncrn, nvrn, nregs;
5030 bool allocate_ncrn, allocate_nvrn;
5031 HOST_WIDE_INT size;
5032 bool abi_break;
5033
5034 /* We need to do this once per argument. */
5035 if (pcum->aapcs_arg_processed)
5036 return;
5037
5038 pcum->aapcs_arg_processed = true;
5039
5040 unsigned int num_zr, num_pr;
5041 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
5042 {
5043 /* The PCS says that it is invalid to pass an SVE value to an
5044 unprototyped function. There is no ABI-defined location we
5045 can return in this case, so we have no real choice but to raise
5046 an error immediately, even though this is only a query function. */
5047 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5048 {
5049 gcc_assert (!pcum->silent_p);
5050 error ("SVE type %qT cannot be passed to an unprototyped function",
5051 arg.type);
5052 /* Avoid repeating the message, and avoid tripping the assert
5053 below. */
5054 pcum->pcs_variant = ARM_PCS_SVE;
5055 }
5056
5057 /* We would have converted the argument into pass-by-reference
5058 form if it didn't fit in registers. */
5059 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5060 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5061 gcc_assert (arg.named
5062 && pcum->pcs_variant == ARM_PCS_SVE
5063 && aarch64_sve_mode_p (mode)
5064 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5065 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5066
5067 if (num_zr > 0 && num_pr == 0)
5068 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5069 else if (num_zr == 0 && num_pr == 1)
5070 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5071 else
5072 gcc_unreachable ();
5073 return;
5074 }
5075
5076 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5077 passed by reference, not by value. */
5078 gcc_assert (!aarch64_sve_mode_p (mode));
5079
5080 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5081 if (type)
5082 size = int_size_in_bytes (type);
5083 else
5084 /* No frontends can create types with variable-sized modes, so we
5085 shouldn't be asked to pass or return them. */
5086 size = GET_MODE_SIZE (mode).to_constant ();
5087 size = ROUND_UP (size, UNITS_PER_WORD);
5088
5089 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5090 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5091 mode,
5092 type,
5093 &nregs);
5094
5095 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
5096 The following code thus handles passing by SIMD/FP registers first. */
5097
5098 nvrn = pcum->aapcs_nvrn;
5099
5100 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5101 and homogeneous short-vector aggregates (HVA). */
5102 if (allocate_nvrn)
5103 {
5104 if (!pcum->silent_p && !TARGET_FLOAT)
5105 aarch64_err_no_fpadvsimd (mode);
5106
5107 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5108 {
5109 pcum->aapcs_nextnvrn = nvrn + nregs;
5110 if (!aarch64_composite_type_p (type, mode))
5111 {
5112 gcc_assert (nregs == 1);
5113 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5114 }
5115 else
5116 {
5117 rtx par;
5118 int i;
5119 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5120 for (i = 0; i < nregs; i++)
5121 {
5122 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5123 V0_REGNUM + nvrn + i);
5124 rtx offset = gen_int_mode
5125 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5126 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5127 XVECEXP (par, 0, i) = tmp;
5128 }
5129 pcum->aapcs_reg = par;
5130 }
5131 return;
5132 }
5133 else
5134 {
5135 /* C.3 NSRN is set to 8. */
5136 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5137 goto on_stack;
5138 }
5139 }
5140
5141 ncrn = pcum->aapcs_ncrn;
5142 nregs = size / UNITS_PER_WORD;
5143
5144 /* C6 - C9, though the sign and zero extension semantics are
5145 handled elsewhere. This is the case where the argument fits
5146 entirely in general registers. */
5147 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5148 {
5149 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5150
5151 /* C.8 if the argument has an alignment of 16 then the NGRN is
5152 rounded up to the next even number. */
5153 if (nregs == 2
5154 && ncrn % 2
5155 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5156 comparison is there because for > 16 * BITS_PER_UNIT
5157 alignment nregs should be > 2 and therefore it should be
5158 passed by reference rather than by value. */
5159 && (aarch64_function_arg_alignment (mode, type, &abi_break)
5160 == 16 * BITS_PER_UNIT))
5161 {
5162 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5163 inform (input_location, "parameter passing for argument of type "
5164 "%qT changed in GCC 9.1", type);
5165 ++ncrn;
5166 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5167 }
5168
5169 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5170 A reg is still generated for it, but the caller should be smart
5171 enough not to use it. */
5172 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5173 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5174 else
5175 {
5176 rtx par;
5177 int i;
5178
5179 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5180 for (i = 0; i < nregs; i++)
5181 {
5182 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5183 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5184 GEN_INT (i * UNITS_PER_WORD));
5185 XVECEXP (par, 0, i) = tmp;
5186 }
5187 pcum->aapcs_reg = par;
5188 }
5189
5190 pcum->aapcs_nextncrn = ncrn + nregs;
5191 return;
5192 }
5193
5194 /* C.11 */
5195 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5196
5197 /* The argument is passed on the stack; record the needed number of words for
5198 this argument and align the total size if necessary. */
5199 on_stack:
5200 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5201
5202 if (aarch64_function_arg_alignment (mode, type, &abi_break)
5203 == 16 * BITS_PER_UNIT)
5204 {
5205 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5206 if (pcum->aapcs_stack_size != new_size)
5207 {
5208 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5209 inform (input_location, "parameter passing for argument of type "
5210 "%qT changed in GCC 9.1", type);
5211 pcum->aapcs_stack_size = new_size;
5212 }
5213 }
5214 return;
5215 }
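
/* Editorial note: a hedged example of rule C.8 above (the types are
   assumptions).  Given

       struct s { __int128 x; };
       void f (int a, struct s b);

   argument A occupies W0, leaving NGRN = 1; B needs two registers and has
   16-byte alignment, so NGRN is rounded up to 2 and B is passed in X2/X3
   rather than X1/X2.  */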
5216
5217 /* Implement TARGET_FUNCTION_ARG. */
5218
5219 static rtx
5220 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5221 {
5222 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5223 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5224 || pcum->pcs_variant == ARM_PCS_SIMD
5225 || pcum->pcs_variant == ARM_PCS_SVE);
5226
5227 if (arg.end_marker_p ())
5228 return gen_int_mode (pcum->pcs_variant, DImode);
5229
5230 aarch64_layout_arg (pcum_v, arg);
5231 return pcum->aapcs_reg;
5232 }
5233
5234 void
5235 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5236 const_tree fntype,
5237 rtx libname ATTRIBUTE_UNUSED,
5238 const_tree fndecl ATTRIBUTE_UNUSED,
5239 unsigned n_named ATTRIBUTE_UNUSED,
5240 bool silent_p)
5241 {
5242 pcum->aapcs_ncrn = 0;
5243 pcum->aapcs_nvrn = 0;
5244 pcum->aapcs_nprn = 0;
5245 pcum->aapcs_nextncrn = 0;
5246 pcum->aapcs_nextnvrn = 0;
5247 pcum->aapcs_nextnprn = 0;
5248 if (fntype)
5249 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5250 else
5251 pcum->pcs_variant = ARM_PCS_AAPCS64;
5252 pcum->aapcs_reg = NULL_RTX;
5253 pcum->aapcs_arg_processed = false;
5254 pcum->aapcs_stack_words = 0;
5255 pcum->aapcs_stack_size = 0;
5256 pcum->silent_p = silent_p;
5257
5258 if (!silent_p
5259 && !TARGET_FLOAT
5260 && fndecl && TREE_PUBLIC (fndecl)
5261 && fntype && fntype != error_mark_node)
5262 {
5263 const_tree type = TREE_TYPE (fntype);
5264 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5265 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5266 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5267 &mode, &nregs, NULL))
5268 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5269 }
5270
5271 if (!silent_p
5272 && !TARGET_SVE
5273 && pcum->pcs_variant == ARM_PCS_SVE)
5274 {
5275 /* We can't gracefully recover at this point, so make this a
5276 fatal error. */
5277 if (fndecl)
5278 fatal_error (input_location, "%qE requires the SVE ISA extension",
5279 fndecl);
5280 else
5281 fatal_error (input_location, "calls to functions of type %qT require"
5282 " the SVE ISA extension", fntype);
5283 }
5284 }
5285
5286 static void
5287 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5288 const function_arg_info &arg)
5289 {
5290 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5291 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5292 || pcum->pcs_variant == ARM_PCS_SIMD
5293 || pcum->pcs_variant == ARM_PCS_SVE)
5294 {
5295 aarch64_layout_arg (pcum_v, arg);
5296 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5297 != (pcum->aapcs_stack_words != 0));
5298 pcum->aapcs_arg_processed = false;
5299 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5300 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5301 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5302 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5303 pcum->aapcs_stack_words = 0;
5304 pcum->aapcs_reg = NULL_RTX;
5305 }
5306 }
5307
5308 bool
5309 aarch64_function_arg_regno_p (unsigned regno)
5310 {
5311 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5312 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5313 }
5314
5315 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5316 PARM_BOUNDARY bits of alignment, but will be given anything up
5317 to STACK_BOUNDARY bits if the type requires it. This makes sure
5318 that both before and after the layout of each argument, the Next
5319 Stacked Argument Address (NSAA) will have a minimum alignment of
5320 8 bytes. */
5321
5322 static unsigned int
5323 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5324 {
5325 bool abi_break;
5326 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5327 &abi_break);
5328 if (abi_break && warn_psabi)
5329 inform (input_location, "parameter passing for argument of type "
5330 "%qT changed in GCC 9.1", type);
5331
5332 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5333 }
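/* To illustrate the clamp above (the boundary values are assumptions
   based on the usual AArch64 definitions of PARM_BOUNDARY == 64 and
   STACK_BOUNDARY == 128): a char argument still reports a 64-bit
   boundary, while a type demanding 256-bit alignment is clamped down to
   128 bits, preserving the 8-byte NSAA invariant described above.  */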
5334
5335 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5336
5337 static fixed_size_mode
5338 aarch64_get_reg_raw_mode (int regno)
5339 {
5340 if (TARGET_SVE && FP_REGNUM_P (regno))
5341 /* Don't use the SVE part of the register for __builtin_apply and
5342 __builtin_return. The SVE registers aren't used by the normal PCS,
5343 so using them there would be a waste of time. The PCS extensions
5344 for SVE types are fundamentally incompatible with the
5345 __builtin_return/__builtin_apply interface. */
5346 return as_a <fixed_size_mode> (V16QImode);
5347 return default_get_reg_raw_mode (regno);
5348 }
5349
5350 /* Implement TARGET_FUNCTION_ARG_PADDING.
5351
5352 Small aggregate types are placed at the lowest memory address.
5353
5354 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
5355
5356 static pad_direction
5357 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5358 {
5359 /* On little-endian targets, the least significant byte of every stack
5360 argument is passed at the lowest byte address of the stack slot. */
5361 if (!BYTES_BIG_ENDIAN)
5362 return PAD_UPWARD;
5363
5364 /* Otherwise, integral, floating-point and pointer types are padded downward:
5365 the least significant byte of a stack argument is passed at the highest
5366 byte address of the stack slot. */
5367 if (type
5368 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5369 || POINTER_TYPE_P (type))
5370 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5371 return PAD_DOWNWARD;
5372
5373 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5374 return PAD_UPWARD;
5375 }
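/* For instance, on a big-endian target a 4-byte int passed in an 8-byte
   stack slot is padded downward: the value occupies the highest-addressed
   4 bytes of the slot, so its least significant byte ends up at the
   highest byte address, as described above.  */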
5376
5377 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5378
5379 It specifies the padding for the last (and possibly the only)
5380 element of a block move between registers and memory. Assuming
5381 the block is in memory, padding upward means that the last
5382 element is padded after its most significant byte, while with
5383 downward padding the last element is padded at its least
5384 significant byte side.
5385
5386 Small aggregates and small complex types are always padded
5387 upwards.
5388
5389 We don't need to worry about homogeneous floating-point or
5390 short-vector aggregates; their move is not affected by the
5391 padding direction determined here. Regardless of endianness,
5392 each element of such an aggregate is put in the least
5393 significant bits of a fp/simd register.
5394
5395 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5396 register has useful data, and return the opposite if the most
5397 significant byte does. */
5398
5399 bool
5400 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5401 bool first ATTRIBUTE_UNUSED)
5402 {
5403
5404 /* Small composite types are always padded upward. */
5405 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5406 {
5407 HOST_WIDE_INT size;
5408 if (type)
5409 size = int_size_in_bytes (type);
5410 else
5411 /* No frontends can create types with variable-sized modes, so we
5412 shouldn't be asked to pass or return them. */
5413 size = GET_MODE_SIZE (mode).to_constant ();
5414 if (size < 2 * UNITS_PER_WORD)
5415 return true;
5416 }
5417
5418 /* Otherwise, use the default padding. */
5419 return !BYTES_BIG_ENDIAN;
5420 }
5421
5422 static scalar_int_mode
5423 aarch64_libgcc_cmp_return_mode (void)
5424 {
5425 return SImode;
5426 }
5427
5428 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5429
5430 /* We use the 12-bit shifted immediate arithmetic instructions so values
5431 must be a multiple of (1 << 12), i.e. 4096. */
5432 #define ARITH_FACTOR 4096
5433
5434 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5435 #error Cannot use simple address calculation for stack probing
5436 #endif
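/* For instance, with the default STACK_CHECK_PROBE_INTERVAL_EXP of 12 (an
   assumption; a target can override it), PROBE_INTERVAL is 4096 and is an
   exact multiple of ARITH_FACTOR, so the check above passes and every
   probe offset can be formed with a single 12-bit shifted immediate.  */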
5437
5438 /* The pair of scratch registers used for stack probing. */
5439 #define PROBE_STACK_FIRST_REG R9_REGNUM
5440 #define PROBE_STACK_SECOND_REG R10_REGNUM
5441
5442 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5443 inclusive. These are offsets from the current stack pointer. */
5444
5445 static void
5446 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5447 {
5448 HOST_WIDE_INT size;
5449 if (!poly_size.is_constant (&size))
5450 {
5451 sorry ("stack probes for SVE frames");
5452 return;
5453 }
5454
5455 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5456
5457 /* See the same assertion on PROBE_INTERVAL above. */
5458 gcc_assert ((first % ARITH_FACTOR) == 0);
5459
5460 /* See if we have a constant small number of probes to generate. If so,
5461 that's the easy case. */
5462 if (size <= PROBE_INTERVAL)
5463 {
5464 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5465
5466 emit_set_insn (reg1,
5467 plus_constant (Pmode,
5468 stack_pointer_rtx, -(first + base)));
5469 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5470 }
5471
5472 /* The run-time loop is made up of 8 insns in the generic case while the
5473 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
5474 else if (size <= 4 * PROBE_INTERVAL)
5475 {
5476 HOST_WIDE_INT i, rem;
5477
5478 emit_set_insn (reg1,
5479 plus_constant (Pmode,
5480 stack_pointer_rtx,
5481 -(first + PROBE_INTERVAL)));
5482 emit_stack_probe (reg1);
5483
5484 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5485 it exceeds SIZE. If only two probes are needed, this will not
5486 generate any code. Then probe at FIRST + SIZE. */
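/* For instance, with PROBE_INTERVAL == 4096 and SIZE == 3 * 4096 + 100
   (an illustrative value), the loop below emits probes at FIRST + 8192
   and FIRST + 12288, and the residual handling that follows probes at
   FIRST + SIZE.  */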
5487 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5488 {
5489 emit_set_insn (reg1,
5490 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5491 emit_stack_probe (reg1);
5492 }
5493
5494 rem = size - (i - PROBE_INTERVAL);
5495 if (rem > 256)
5496 {
5497 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5498
5499 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5500 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5501 }
5502 else
5503 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5504 }
5505
5506 /* Otherwise, do the same as above, but in a loop. Note that we must be
5507 extra careful with variables wrapping around because we might be at
5508 the very top (or the very bottom) of the address space and we have
5509 to be able to handle this case properly; in particular, we use an
5510 equality test for the loop condition. */
5511 else
5512 {
5513 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5514
5515 /* Step 1: round SIZE to the previous multiple of the interval. */
5516
5517 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5518
5519
5520 /* Step 2: compute initial and final value of the loop counter. */
5521
5522 /* TEST_ADDR = SP + FIRST. */
5523 emit_set_insn (reg1,
5524 plus_constant (Pmode, stack_pointer_rtx, -first));
5525
5526 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5527 HOST_WIDE_INT adjustment = - (first + rounded_size);
5528 if (! aarch64_uimm12_shift (adjustment))
5529 {
5530 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5531 true, Pmode);
5532 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5533 }
5534 else
5535 emit_set_insn (reg2,
5536 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5537
5538 /* Step 3: the loop
5539
5540 do
5541 {
5542 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5543 probe at TEST_ADDR
5544 }
5545 while (TEST_ADDR != LAST_ADDR)
5546
5547 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5548 until it is equal to ROUNDED_SIZE. */
5549
5550 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5551
5552
5553 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5554 that SIZE is equal to ROUNDED_SIZE. */
5555
5556 if (size != rounded_size)
5557 {
5558 HOST_WIDE_INT rem = size - rounded_size;
5559
5560 if (rem > 256)
5561 {
5562 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5563
5564 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5565 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5566 }
5567 else
5568 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5569 }
5570 }
5571
5572 /* Make sure nothing is scheduled before we are done. */
5573 emit_insn (gen_blockage ());
5574 }
5575
5576 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5577 absolute addresses. */
5578
5579 const char *
5580 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5581 {
5582 static int labelno = 0;
5583 char loop_lab[32];
5584 rtx xops[2];
5585
5586 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5587
5588 /* Loop. */
5589 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5590
5591 HOST_WIDE_INT stack_clash_probe_interval
5592 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5593
5594 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5595 xops[0] = reg1;
5596 HOST_WIDE_INT interval;
5597 if (flag_stack_clash_protection)
5598 interval = stack_clash_probe_interval;
5599 else
5600 interval = PROBE_INTERVAL;
5601
5602 gcc_assert (aarch64_uimm12_shift (interval));
5603 xops[1] = GEN_INT (interval);
5604
5605 output_asm_insn ("sub\t%0, %0, %1", xops);
5606
5607 /* If doing stack clash protection then we probe up by the ABI specified
5608 amount. We do this because we're dropping full pages at a time in the
5609 loop. But if we're doing non-stack-clash probing, probe at SP + 0. */
5610 if (flag_stack_clash_protection)
5611 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5612 else
5613 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5614
5615 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5616 by this amount for each iteration. */
5617 output_asm_insn ("str\txzr, [%0, %1]", xops);
5618
5619 /* Test if TEST_ADDR == LAST_ADDR. */
5620 xops[1] = reg2;
5621 output_asm_insn ("cmp\t%0, %1", xops);
5622
5623 /* Branch. */
5624 fputs ("\tb.ne\t", asm_out_file);
5625 assemble_name_raw (asm_out_file, loop_lab);
5626 fputc ('\n', asm_out_file);
5627
5628 return "";
5629 }
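/* A minimal sketch of the sequence emitted above for the non-stack-clash
   case, assuming the default 4096-byte PROBE_INTERVAL; the x9/x10 choice
   follows PROBE_STACK_FIRST_REG/PROBE_STACK_SECOND_REG and the exact
   label suffix is assembler-internal:

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9, 0]
		cmp	x9, x10
		b.ne	.LPSRL0  */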
5630
5631 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5632 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5633 of GUARD_SIZE. When a probe is emitted it is done at most
5634 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5635 at most MIN_PROBE_THRESHOLD. By the end of this function
5636 BASE = BASE - ADJUSTMENT. */
5637
5638 const char *
5639 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5640 rtx min_probe_threshold, rtx guard_size)
5641 {
5642 /* This function is not allowed to use any instruction generation function
5643 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5644 so instead emit the code you want using output_asm_insn. */
5645 gcc_assert (flag_stack_clash_protection);
5646 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5647 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5648
5649 /* The minimum required allocation before the residual requires probing. */
5650 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5651
5652 /* Clamp the value down to the nearest value that can be used with a cmp. */
5653 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5654 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5655
5656 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5657 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5658
5659 static int labelno = 0;
5660 char loop_start_lab[32];
5661 char loop_end_lab[32];
5662 rtx xops[2];
5663
5664 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5665 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5666
5667 /* Emit loop start label. */
5668 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5669
5670 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5671 xops[0] = adjustment;
5672 xops[1] = probe_offset_value_rtx;
5673 output_asm_insn ("cmp\t%0, %1", xops);
5674
5675 /* Branch to end if not enough adjustment to probe. */
5676 fputs ("\tb.lt\t", asm_out_file);
5677 assemble_name_raw (asm_out_file, loop_end_lab);
5678 fputc ('\n', asm_out_file);
5679
5680 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5681 xops[0] = base;
5682 xops[1] = probe_offset_value_rtx;
5683 output_asm_insn ("sub\t%0, %0, %1", xops);
5684
5685 /* Probe at BASE. */
5686 xops[1] = const0_rtx;
5687 output_asm_insn ("str\txzr, [%0, %1]", xops);
5688
5689 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5690 xops[0] = adjustment;
5691 xops[1] = probe_offset_value_rtx;
5692 output_asm_insn ("sub\t%0, %0, %1", xops);
5693
5694 /* Branch to start if still more bytes to allocate. */
5695 fputs ("\tb\t", asm_out_file);
5696 assemble_name_raw (asm_out_file, loop_start_lab);
5697 fputc ('\n', asm_out_file);
5698
5699 /* No probe needed for the remaining adjustment; leave the loop. */
5700 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5701
5702 /* BASE = BASE - ADJUSTMENT. */
5703 xops[0] = base;
5704 xops[1] = adjustment;
5705 output_asm_insn ("sub\t%0, %0, %1", xops);
5706 return "";
5707 }
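/* A rough sketch of the loop emitted above (the register names are
   illustrative; <guard> stands for the clamped RESIDUAL_PROBE_GUARD
   value):

	.SVLPSPL0:
		cmp	x11, <guard>
		b.lt	.SVLPEND0
		sub	x12, x12, <guard>
		str	xzr, [x12, 0]
		sub	x11, x11, <guard>
		b	.SVLPSPL0
	.SVLPEND0:
		sub	x12, x12, x11  */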
5708
5709 /* Determine whether a frame chain needs to be generated. */
5710 static bool
5711 aarch64_needs_frame_chain (void)
5712 {
5713 /* Force a frame chain for EH returns so the return address is at FP+8. */
5714 if (frame_pointer_needed || crtl->calls_eh_return)
5715 return true;
5716
5717 /* A leaf function cannot have calls or write LR. */
5718 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5719
5720 /* Don't use a frame chain in leaf functions if leaf frame pointers
5721 are disabled. */
5722 if (flag_omit_leaf_frame_pointer && is_leaf)
5723 return false;
5724
5725 return aarch64_use_frame_pointer;
5726 }
5727
5728 /* Mark the registers that need to be saved by the callee and calculate
5729 the size of the callee-saved registers area and frame record (both FP
5730 and LR may be omitted). */
5731 static void
5732 aarch64_layout_frame (void)
5733 {
5734 poly_int64 offset = 0;
5735 int regno, last_fp_reg = INVALID_REGNUM;
5736 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5737 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5738 bool frame_related_fp_reg_p = false;
5739 aarch64_frame &frame = cfun->machine->frame;
5740
5741 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5742
5743 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5744 the mid-end is doing. */
5745 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5746
5747 #define SLOT_NOT_REQUIRED (-2)
5748 #define SLOT_REQUIRED (-1)
5749
5750 frame.wb_candidate1 = INVALID_REGNUM;
5751 frame.wb_candidate2 = INVALID_REGNUM;
5752 frame.spare_pred_reg = INVALID_REGNUM;
5753
5754 /* First mark all the registers that really need to be saved... */
5755 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5756 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5757
5758 /* ... that includes the eh data registers (if needed)... */
5759 if (crtl->calls_eh_return)
5760 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5761 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5762
5763 /* ... and any callee saved register that dataflow says is live. */
5764 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5765 if (df_regs_ever_live_p (regno)
5766 && !fixed_regs[regno]
5767 && (regno == R30_REGNUM
5768 || !crtl->abi->clobbers_full_reg_p (regno)))
5769 frame.reg_offset[regno] = SLOT_REQUIRED;
5770
5771 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5772 if (df_regs_ever_live_p (regno)
5773 && !fixed_regs[regno]
5774 && !crtl->abi->clobbers_full_reg_p (regno))
5775 {
5776 frame.reg_offset[regno] = SLOT_REQUIRED;
5777 last_fp_reg = regno;
5778 if (aarch64_emit_cfi_for_reg_p (regno))
5779 frame_related_fp_reg_p = true;
5780 }
5781
5782 /* Big-endian SVE frames need a spare predicate register in order
5783 to save Z8-Z15. Decide which register they should use. Prefer
5784 an unused argument register if possible, so that we don't force P4
5785 to be saved unnecessarily. */
5786 if (frame_related_fp_reg_p
5787 && crtl->abi->id () == ARM_PCS_SVE
5788 && BYTES_BIG_ENDIAN)
5789 {
5790 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5791 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5792 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5793 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5794 break;
5795 gcc_assert (regno <= P7_REGNUM);
5796 frame.spare_pred_reg = regno;
5797 df_set_regs_ever_live (regno, true);
5798 }
5799
5800 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5801 if (df_regs_ever_live_p (regno)
5802 && !fixed_regs[regno]
5803 && !crtl->abi->clobbers_full_reg_p (regno))
5804 frame.reg_offset[regno] = SLOT_REQUIRED;
5805
5806 /* With stack-clash, LR must be saved in non-leaf functions. */
5807 gcc_assert (crtl->is_leaf
5808 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
5809
5810 /* Now assign stack slots for the registers. Start with the predicate
5811 registers, since predicate LDR and STR have a relatively small
5812 offset range. These saves happen below the hard frame pointer. */
5813 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5814 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5815 {
5816 frame.reg_offset[regno] = offset;
5817 offset += BYTES_PER_SVE_PRED;
5818 }
5819
5820 /* We save a maximum of 8 predicate registers, and since vector
5821 registers are 8 times the size of a predicate register, all the
5822 saved predicates fit within a single vector. Doing this also
5823 rounds the offset to a 128-bit boundary. */
5824 if (maybe_ne (offset, 0))
5825 {
5826 gcc_assert (known_le (offset, vector_save_size));
5827 offset = vector_save_size;
5828 }
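/* As a concrete example of the sizing argument above (the vector length
   is an assumption): with 256-bit SVE vectors, BYTES_PER_SVE_PRED is 4,
   so the at most 8 saved predicates occupy at most 32 bytes, which is
   exactly the 32-byte vector_save_size that OFFSET is rounded up to.  */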
5829
5830 /* If we need to save any SVE vector registers, add them next. */
5831 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
5832 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5833 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5834 {
5835 frame.reg_offset[regno] = offset;
5836 offset += vector_save_size;
5837 }
5838
5839 /* OFFSET is now the offset of the hard frame pointer from the bottom
5840 of the callee save area. */
5841 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
5842 frame.below_hard_fp_saved_regs_size = offset;
5843 if (frame.emit_frame_chain)
5844 {
5845 /* FP and LR are placed in the linkage record. */
5846 frame.reg_offset[R29_REGNUM] = offset;
5847 frame.wb_candidate1 = R29_REGNUM;
5848 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
5849 frame.wb_candidate2 = R30_REGNUM;
5850 offset += 2 * UNITS_PER_WORD;
5851 }
5852
5853 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5854 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5855 {
5856 frame.reg_offset[regno] = offset;
5857 if (frame.wb_candidate1 == INVALID_REGNUM)
5858 frame.wb_candidate1 = regno;
5859 else if (frame.wb_candidate2 == INVALID_REGNUM)
5860 frame.wb_candidate2 = regno;
5861 offset += UNITS_PER_WORD;
5862 }
5863
5864 poly_int64 max_int_offset = offset;
5865 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5866 bool has_align_gap = maybe_ne (offset, max_int_offset);
5867
5868 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5869 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
5870 {
5871 /* If there is an alignment gap between integer and fp callee-saves,
5872 allocate the last fp register to it if possible. */
5873 if (regno == last_fp_reg
5874 && has_align_gap
5875 && known_eq (vector_save_size, 8)
5876 && multiple_p (offset, 16))
5877 {
5878 frame.reg_offset[regno] = max_int_offset;
5879 break;
5880 }
5881
5882 frame.reg_offset[regno] = offset;
5883 if (frame.wb_candidate1 == INVALID_REGNUM)
5884 frame.wb_candidate1 = regno;
5885 else if (frame.wb_candidate2 == INVALID_REGNUM
5886 && frame.wb_candidate1 >= V0_REGNUM)
5887 frame.wb_candidate2 = regno;
5888 offset += vector_save_size;
5889 }
5890
5891 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5892
5893 frame.saved_regs_size = offset;
5894
5895 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
5896
5897 poly_int64 above_outgoing_args
5898 = aligned_upper_bound (varargs_and_saved_regs_size
5899 + get_frame_size (),
5900 STACK_BOUNDARY / BITS_PER_UNIT);
5901
5902 frame.hard_fp_offset
5903 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
5904
5905 /* Both these values are already aligned. */
5906 gcc_assert (multiple_p (crtl->outgoing_args_size,
5907 STACK_BOUNDARY / BITS_PER_UNIT));
5908 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
5909
5910 frame.locals_offset = frame.saved_varargs_size;
5911
5912 frame.initial_adjust = 0;
5913 frame.final_adjust = 0;
5914 frame.callee_adjust = 0;
5915 frame.sve_callee_adjust = 0;
5916 frame.callee_offset = 0;
5917
5918 HOST_WIDE_INT max_push_offset = 0;
5919 if (frame.wb_candidate2 != INVALID_REGNUM)
5920 max_push_offset = 512;
5921 else if (frame.wb_candidate1 != INVALID_REGNUM)
5922 max_push_offset = 256;
5923
5924 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
5925 HOST_WIDE_INT const_saved_regs_size;
5926 if (frame.frame_size.is_constant (&const_size)
5927 && const_size < max_push_offset
5928 && known_eq (frame.hard_fp_offset, const_size))
5929 {
5930 /* Simple, small frame with no outgoing arguments:
5931
5932 stp reg1, reg2, [sp, -frame_size]!
5933 stp reg3, reg4, [sp, 16] */
5934 frame.callee_adjust = const_size;
5935 }
5936 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
5937 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
5938 && const_outgoing_args_size + const_saved_regs_size < 512
5939 /* We could handle this case even with outgoing args, provided
5940 that the number of args left us with valid offsets for all
5941 predicate and vector save slots. It's such a rare case that
5942 it hardly seems worth the effort though. */
5943 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
5944 && !(cfun->calls_alloca
5945 && frame.hard_fp_offset.is_constant (&const_fp_offset)
5946 && const_fp_offset < max_push_offset))
5947 {
5948 /* Frame with small outgoing arguments:
5949
5950 sub sp, sp, frame_size
5951 stp reg1, reg2, [sp, outgoing_args_size]
5952 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5953 frame.initial_adjust = frame.frame_size;
5954 frame.callee_offset = const_outgoing_args_size;
5955 }
5956 else if (saves_below_hard_fp_p
5957 && known_eq (frame.saved_regs_size,
5958 frame.below_hard_fp_saved_regs_size))
5959 {
5960 /* Frame in which all saves are SVE saves:
5961
5962 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
5963 save SVE registers relative to SP
5964 sub sp, sp, outgoing_args_size */
5965 frame.initial_adjust = (frame.hard_fp_offset
5966 + frame.below_hard_fp_saved_regs_size);
5967 frame.final_adjust = crtl->outgoing_args_size;
5968 }
5969 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
5970 && const_fp_offset < max_push_offset)
5971 {
5972 /* Frame with large outgoing arguments or SVE saves, but with
5973 a small local area:
5974
5975 stp reg1, reg2, [sp, -hard_fp_offset]!
5976 stp reg3, reg4, [sp, 16]
5977 [sub sp, sp, below_hard_fp_saved_regs_size]
5978 [save SVE registers relative to SP]
5979 sub sp, sp, outgoing_args_size */
5980 frame.callee_adjust = const_fp_offset;
5981 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
5982 frame.final_adjust = crtl->outgoing_args_size;
5983 }
5984 else
5985 {
5986 /* Frame with large local area and outgoing arguments or SVE saves,
5987 using frame pointer:
5988
5989 sub sp, sp, hard_fp_offset
5990 stp x29, x30, [sp, 0]
5991 add x29, sp, 0
5992 stp reg3, reg4, [sp, 16]
5993 [sub sp, sp, below_hard_fp_saved_regs_size]
5994 [save SVE registers relative to SP]
5995 sub sp, sp, outgoing_args_size */
5996 frame.initial_adjust = frame.hard_fp_offset;
5997 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
5998 frame.final_adjust = crtl->outgoing_args_size;
5999 }
6000
6001 /* Make sure the individual adjustments add up to the full frame size. */
6002 gcc_assert (known_eq (frame.initial_adjust
6003 + frame.callee_adjust
6004 + frame.sve_callee_adjust
6005 + frame.final_adjust, frame.frame_size));
6006
6007 frame.laid_out = true;
6008 }
6009
6010 /* Return true if the register REGNO is saved on entry to
6011 the current function. */
6012
6013 static bool
6014 aarch64_register_saved_on_entry (int regno)
6015 {
6016 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6017 }
6018
6019 /* Return the next register at or after REGNO, up to and including LIMIT,
6020 that the callee needs to save. */
6021
6022 static unsigned
6023 aarch64_next_callee_save (unsigned regno, unsigned limit)
6024 {
6025 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6026 regno++;
6027 return regno;
6028 }
6029
6030 /* Push the register numbered REGNO of mode MODE to the stack with write-back
6031 adjusting the stack by ADJUSTMENT. */
6032
6033 static void
6034 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6035 HOST_WIDE_INT adjustment)
6036 {
6037 rtx base_rtx = stack_pointer_rtx;
6038 rtx insn, reg, mem;
6039
6040 reg = gen_rtx_REG (mode, regno);
6041 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6042 plus_constant (Pmode, base_rtx, -adjustment));
6043 mem = gen_frame_mem (mode, mem);
6044
6045 insn = emit_move_insn (mem, reg);
6046 RTX_FRAME_RELATED_P (insn) = 1;
6047 }
6048
6049 /* Generate and return an instruction to store the pair of registers
6050 REG and REG2 of mode MODE to location BASE, with write-back adjusting
6051 BASE by ADJUSTMENT. */
6052
6053 static rtx
6054 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6055 HOST_WIDE_INT adjustment)
6056 {
6057 switch (mode)
6058 {
6059 case E_DImode:
6060 return gen_storewb_pairdi_di (base, base, reg, reg2,
6061 GEN_INT (-adjustment),
6062 GEN_INT (UNITS_PER_WORD - adjustment));
6063 case E_DFmode:
6064 return gen_storewb_pairdf_di (base, base, reg, reg2,
6065 GEN_INT (-adjustment),
6066 GEN_INT (UNITS_PER_WORD - adjustment));
6067 case E_TFmode:
6068 return gen_storewb_pairtf_di (base, base, reg, reg2,
6069 GEN_INT (-adjustment),
6070 GEN_INT (UNITS_PER_VREG - adjustment));
6071 default:
6072 gcc_unreachable ();
6073 }
6074 }
6075
6076 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6077 stack pointer by ADJUSTMENT. */
6078
6079 static void
6080 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6081 {
6082 rtx_insn *insn;
6083 machine_mode mode = aarch64_reg_save_mode (regno1);
6084
6085 if (regno2 == INVALID_REGNUM)
6086 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6087
6088 rtx reg1 = gen_rtx_REG (mode, regno1);
6089 rtx reg2 = gen_rtx_REG (mode, regno2);
6090
6091 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6092 reg2, adjustment));
6093 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6094 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6095 RTX_FRAME_RELATED_P (insn) = 1;
6096 }
6097
6098 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
6099 adjusting it by ADJUSTMENT afterwards. */
6100
6101 static rtx
6102 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6103 HOST_WIDE_INT adjustment)
6104 {
6105 switch (mode)
6106 {
6107 case E_DImode:
6108 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6109 GEN_INT (UNITS_PER_WORD));
6110 case E_DFmode:
6111 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6112 GEN_INT (UNITS_PER_WORD));
6113 case E_TFmode:
6114 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6115 GEN_INT (UNITS_PER_VREG));
6116 default:
6117 gcc_unreachable ();
6118 }
6119 }
6120
6121 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6122 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6123 into CFI_OPS. */
6124
6125 static void
6126 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6127 rtx *cfi_ops)
6128 {
6129 machine_mode mode = aarch64_reg_save_mode (regno1);
6130 rtx reg1 = gen_rtx_REG (mode, regno1);
6131
6132 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6133
6134 if (regno2 == INVALID_REGNUM)
6135 {
6136 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6137 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6138 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6139 }
6140 else
6141 {
6142 rtx reg2 = gen_rtx_REG (mode, regno2);
6143 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6144 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6145 reg2, adjustment));
6146 }
6147 }
6148
6149 /* Generate and return a store pair instruction of mode MODE to store
6150 register REG1 to MEM1 and register REG2 to MEM2. */
6151
6152 static rtx
6153 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6154 rtx reg2)
6155 {
6156 switch (mode)
6157 {
6158 case E_DImode:
6159 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6160
6161 case E_DFmode:
6162 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6163
6164 case E_TFmode:
6165 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6166
6167 default:
6168 gcc_unreachable ();
6169 }
6170 }
6171
6172 /* Generate and return a load pair instruction of mode MODE to load register
6173 REG1 from MEM1 and register REG2 from MEM2. */
6174
6175 static rtx
6176 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6177 rtx mem2)
6178 {
6179 switch (mode)
6180 {
6181 case E_DImode:
6182 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6183
6184 case E_DFmode:
6185 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6186
6187 case E_TFmode:
6188 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6189
6190 default:
6191 gcc_unreachable ();
6192 }
6193 }
6194
6195 /* Return TRUE if return address signing should be enabled for the current
6196 function, otherwise return FALSE. */
6197
6198 bool
6199 aarch64_return_address_signing_enabled (void)
6200 {
6201 /* This function should only be called after the frame has been laid out. */
6202 gcc_assert (cfun->machine->frame.laid_out);
6203
6204 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6205 if its LR is pushed onto the stack. */
6206 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6207 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6208 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6209 }
6210
6211 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6212 bool
6213 aarch64_bti_enabled (void)
6214 {
6215 return (aarch64_enable_bti == 1);
6216 }
6217
6218 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6219 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6220 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6221
6222 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6223 or LD1D address
6224
6225 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
6226 if the variable isn't already nonnull
6227
6228 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6229 Handle this case using a temporary base register that is suitable for
6230 all offsets in that range. Use ANCHOR_REG as this base register if it
6231 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
6232
6233 static inline void
6234 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6235 rtx &anchor_reg, poly_int64 &offset,
6236 rtx &ptrue)
6237 {
6238 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6239 {
6240 /* This is the maximum valid offset of the anchor from the base.
6241 Lower values would be valid too. */
6242 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6243 if (!anchor_reg)
6244 {
6245 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6246 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6247 gen_int_mode (anchor_offset, Pmode)));
6248 }
6249 base_rtx = anchor_reg;
6250 offset -= anchor_offset;
6251 }
6252 if (!ptrue)
6253 {
6254 int pred_reg = cfun->machine->frame.spare_pred_reg;
6255 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6256 CONSTM1_RTX (VNx16BImode));
6257 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6258 }
6259 }
6260
6261 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6262 is saved at BASE + OFFSET. */
6263
6264 static void
6265 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6266 rtx base, poly_int64 offset)
6267 {
6268 rtx mem = gen_frame_mem (GET_MODE (reg),
6269 plus_constant (Pmode, base, offset));
6270 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6271 }
6272
6273 /* Emit code to save the callee-saved registers from register number START
6274 to LIMIT to the stack at the location starting at offset START_OFFSET,
6275 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6276 is true if the hard frame pointer has been set up. */
6277
6278 static void
6279 aarch64_save_callee_saves (poly_int64 start_offset,
6280 unsigned start, unsigned limit, bool skip_wb,
6281 bool hard_fp_valid_p)
6282 {
6283 rtx_insn *insn;
6284 unsigned regno;
6285 unsigned regno2;
6286 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6287
6288 for (regno = aarch64_next_callee_save (start, limit);
6289 regno <= limit;
6290 regno = aarch64_next_callee_save (regno + 1, limit))
6291 {
6292 rtx reg, mem;
6293 poly_int64 offset;
6294 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6295
6296 if (skip_wb
6297 && (regno == cfun->machine->frame.wb_candidate1
6298 || regno == cfun->machine->frame.wb_candidate2))
6299 continue;
6300
6301 if (cfun->machine->reg_is_wrapped_separately[regno])
6302 continue;
6303
6304 machine_mode mode = aarch64_reg_save_mode (regno);
6305 reg = gen_rtx_REG (mode, regno);
6306 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6307 rtx base_rtx = stack_pointer_rtx;
6308 poly_int64 sp_offset = offset;
6309
6310 HOST_WIDE_INT const_offset;
6311 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6312 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6313 offset, ptrue);
6314 else if (GP_REGNUM_P (regno)
6315 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6316 {
6317 gcc_assert (known_eq (start_offset, 0));
6318 poly_int64 fp_offset
6319 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6320 if (hard_fp_valid_p)
6321 base_rtx = hard_frame_pointer_rtx;
6322 else
6323 {
6324 if (!anchor_reg)
6325 {
6326 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6327 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6328 gen_int_mode (fp_offset, Pmode)));
6329 }
6330 base_rtx = anchor_reg;
6331 }
6332 offset -= fp_offset;
6333 }
6334 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6335 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6336
6337 if (!aarch64_sve_mode_p (mode)
6338 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6339 && !cfun->machine->reg_is_wrapped_separately[regno2]
6340 && known_eq (GET_MODE_SIZE (mode),
6341 cfun->machine->frame.reg_offset[regno2]
6342 - cfun->machine->frame.reg_offset[regno]))
6343 {
6344 rtx reg2 = gen_rtx_REG (mode, regno2);
6345 rtx mem2;
6346
6347 offset += GET_MODE_SIZE (mode);
6348 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6349 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6350 reg2));
6351
6352 /* The first part of a frame-related parallel insn is
6353 always assumed to be relevant to the frame
6354 calculations; subsequent parts are only
6355 frame-related if explicitly marked. */
6356 if (aarch64_emit_cfi_for_reg_p (regno2))
6357 {
6358 if (need_cfa_note_p)
6359 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6360 sp_offset + GET_MODE_SIZE (mode));
6361 else
6362 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6363 }
6364
6365 regno = regno2;
6366 }
6367 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6368 {
6369 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6370 need_cfa_note_p = true;
6371 }
6372 else if (aarch64_sve_mode_p (mode))
6373 insn = emit_insn (gen_rtx_SET (mem, reg));
6374 else
6375 insn = emit_move_insn (mem, reg);
6376
6377 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6378 if (frame_related_p && need_cfa_note_p)
6379 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6380 }
6381 }
6382
6383 /* Emit code to restore the callee registers from register number START
6384 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6385 skipping any write-back candidates if SKIP_WB is true. Write the
6386 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6387
6388 static void
6389 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6390 unsigned limit, bool skip_wb, rtx *cfi_ops)
6391 {
6392 unsigned regno;
6393 unsigned regno2;
6394 poly_int64 offset;
6395 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6396
6397 for (regno = aarch64_next_callee_save (start, limit);
6398 regno <= limit;
6399 regno = aarch64_next_callee_save (regno + 1, limit))
6400 {
6401 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6402 if (cfun->machine->reg_is_wrapped_separately[regno])
6403 continue;
6404
6405 rtx reg, mem;
6406
6407 if (skip_wb
6408 && (regno == cfun->machine->frame.wb_candidate1
6409 || regno == cfun->machine->frame.wb_candidate2))
6410 continue;
6411
6412 machine_mode mode = aarch64_reg_save_mode (regno);
6413 reg = gen_rtx_REG (mode, regno);
6414 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6415 rtx base_rtx = stack_pointer_rtx;
6416 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6417 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6418 offset, ptrue);
6419 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6420
6421 if (!aarch64_sve_mode_p (mode)
6422 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6423 && !cfun->machine->reg_is_wrapped_separately[regno2]
6424 && known_eq (GET_MODE_SIZE (mode),
6425 cfun->machine->frame.reg_offset[regno2]
6426 - cfun->machine->frame.reg_offset[regno]))
6427 {
6428 rtx reg2 = gen_rtx_REG (mode, regno2);
6429 rtx mem2;
6430
6431 offset += GET_MODE_SIZE (mode);
6432 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6433 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6434
6435 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6436 regno = regno2;
6437 }
6438 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6439 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6440 else if (aarch64_sve_mode_p (mode))
6441 emit_insn (gen_rtx_SET (reg, mem));
6442 else
6443 emit_move_insn (reg, mem);
6444 if (frame_related_p)
6445 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6446 }
6447 }
6448
6449 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6450 of MODE. */
6451
6452 static inline bool
6453 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6454 {
6455 HOST_WIDE_INT multiple;
6456 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6457 && IN_RANGE (multiple, -8, 7));
6458 }
6459
6460 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6461 of MODE. */
6462
6463 static inline bool
6464 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6465 {
6466 HOST_WIDE_INT multiple;
6467 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6468 && IN_RANGE (multiple, 0, 63));
6469 }
6470
6471 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6472 of MODE. */
6473
6474 bool
6475 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6476 {
6477 HOST_WIDE_INT multiple;
6478 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6479 && IN_RANGE (multiple, -64, 63));
6480 }
6481
6482 /* Return true if OFFSET is a signed 9-bit value. */
6483
6484 bool
6485 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6486 poly_int64 offset)
6487 {
6488 HOST_WIDE_INT const_offset;
6489 return (offset.is_constant (&const_offset)
6490 && IN_RANGE (const_offset, -256, 255));
6491 }
6492
6493 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6494 of MODE. */
6495
6496 static inline bool
6497 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6498 {
6499 HOST_WIDE_INT multiple;
6500 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6501 && IN_RANGE (multiple, -256, 255));
6502 }
6503
6504 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6505 of MODE. */
6506
6507 static inline bool
6508 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6509 {
6510 HOST_WIDE_INT multiple;
6511 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6512 && IN_RANGE (multiple, 0, 4095));
6513 }
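/* As an example of what the scaled-offset predicates above accept (DImode
   has an 8-byte size): aarch64_offset_7bit_signed_scaled_p allows DImode
   offsets from -512 to +504 in steps of 8, matching the LDP/STP immediate
   range, while offset_12bit_unsigned_scaled_p allows 0 to 32760 in steps
   of 8, matching the unsigned scaled LDR/STR immediate range.  */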
6514
6515 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6516
6517 static sbitmap
6518 aarch64_get_separate_components (void)
6519 {
6520 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6521 bitmap_clear (components);
6522
6523 /* The registers that need to be saved to the frame. */
6524 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6525 if (aarch64_register_saved_on_entry (regno))
6526 {
6527 /* Punt on saves and restores that use ST1D and LD1D. We could
6528 try to be smarter, but it would involve making sure that the
6529 spare predicate register itself is safe to use at the save
6530 and restore points. Also, when a frame pointer is being used,
6531 the slots are often out of reach of ST1D and LD1D anyway. */
6532 machine_mode mode = aarch64_reg_save_mode (regno);
6533 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6534 continue;
6535
6536 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6537
6538 /* If the register is saved in the first SVE save slot, we use
6539 it as a stack probe for -fstack-clash-protection. */
6540 if (flag_stack_clash_protection
6541 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6542 && known_eq (offset, 0))
6543 continue;
6544
6545 /* Get the offset relative to the register we'll use. */
6546 if (frame_pointer_needed)
6547 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6548 else
6549 offset += crtl->outgoing_args_size;
6550
6551 /* Check that we can access the stack slot of the register with one
6552 direct load with no adjustments needed. */
6553 if (aarch64_sve_mode_p (mode)
6554 ? offset_9bit_signed_scaled_p (mode, offset)
6555 : offset_12bit_unsigned_scaled_p (mode, offset))
6556 bitmap_set_bit (components, regno);
6557 }
6558
6559 /* Don't mess with the hard frame pointer. */
6560 if (frame_pointer_needed)
6561 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6562
6563 /* If the spare predicate register used by big-endian SVE code
6564 is call-preserved, it must be saved in the main prologue
6565 before any saves that use it. */
6566 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6567 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6568
6569 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6570 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6571 /* If registers have been chosen to be stored/restored with
6572 writeback, don't interfere with them to avoid having to output explicit
6573 stack adjustment instructions. */
6574 if (reg2 != INVALID_REGNUM)
6575 bitmap_clear_bit (components, reg2);
6576 if (reg1 != INVALID_REGNUM)
6577 bitmap_clear_bit (components, reg1);
6578
6579 bitmap_clear_bit (components, LR_REGNUM);
6580 bitmap_clear_bit (components, SP_REGNUM);
6581
6582 return components;
6583 }
6584
6585 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6586
6587 static sbitmap
6588 aarch64_components_for_bb (basic_block bb)
6589 {
6590 bitmap in = DF_LIVE_IN (bb);
6591 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6592 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6593
6594 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6595 bitmap_clear (components);
6596
6597 /* Clobbered registers don't generate values in any meaningful sense,
6598 since nothing after the clobber can rely on their value. And we can't
6599 say that partially-clobbered registers are unconditionally killed,
6600 because whether they're killed or not depends on the mode of the
6601 value they're holding. Thus partially call-clobbered registers
6602 appear in neither the kill set nor the gen set.
6603
6604 Check manually for any calls that clobber more of a register than the
6605 current function can. */
6606 function_abi_aggregator callee_abis;
6607 rtx_insn *insn;
6608 FOR_BB_INSNS (bb, insn)
6609 if (CALL_P (insn))
6610 callee_abis.note_callee_abi (insn_callee_abi (insn));
6611 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6612
6613 /* A register is used in a bb if it is in the IN, GEN, or KILL sets. */
6614 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6615 if (!fixed_regs[regno]
6616 && !crtl->abi->clobbers_full_reg_p (regno)
6617 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6618 || bitmap_bit_p (in, regno)
6619 || bitmap_bit_p (gen, regno)
6620 || bitmap_bit_p (kill, regno)))
6621 {
6622 bitmap_set_bit (components, regno);
6623
6624 /* If there is a callee-save at an adjacent offset, add it too
6625 to increase the use of LDP/STP. */
6626 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6627 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6628
6629 if (regno2 <= LAST_SAVED_REGNUM)
6630 {
6631 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6632 if (regno < regno2
6633 ? known_eq (offset + 8, offset2)
6634 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6635 bitmap_set_bit (components, regno2);
6636 }
6637 }
6638
6639 return components;
6640 }
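/* For example (the register choice is illustrative): if x20 is live in
   this block and saved at offset 16 while x21 is saved at offset 24, the
   loop above also marks x21 as a component for the block, so that the
   prologue/epilogue code can save and restore the pair with a single
   STP/LDP.  */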
6641
6642 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6643 Nothing to do for aarch64. */
6644
6645 static void
6646 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6647 {
6648 }
6649
6650 /* Return the next set bit in BMP from START onwards. Return the total number
6651 of bits in BMP if no set bit is found at or after START. */
6652
6653 static unsigned int
6654 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6655 {
6656 unsigned int nbits = SBITMAP_SIZE (bmp);
6657 if (start == nbits)
6658 return start;
6659
6660 gcc_assert (start < nbits);
6661 for (unsigned int i = start; i < nbits; i++)
6662 if (bitmap_bit_p (bmp, i))
6663 return i;
6664
6665 return nbits;
6666 }
6667
6668 /* Do the work for aarch64_emit_prologue_components and
6669 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6670 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6671 for these components or the epilogue sequence. That is, it determines
6672 whether we should emit stores or loads and what kind of CFA notes to attach
6673 to the insns. Otherwise the logic for the two sequences is very
6674 similar. */
6675
6676 static void
6677 aarch64_process_components (sbitmap components, bool prologue_p)
6678 {
6679 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6680 ? HARD_FRAME_POINTER_REGNUM
6681 : STACK_POINTER_REGNUM);
6682
6683 unsigned last_regno = SBITMAP_SIZE (components);
6684 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6685 rtx_insn *insn = NULL;
6686
6687 while (regno != last_regno)
6688 {
6689 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6690 machine_mode mode = aarch64_reg_save_mode (regno);
6691
6692 rtx reg = gen_rtx_REG (mode, regno);
6693 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6694 if (frame_pointer_needed)
6695 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6696 else
6697 offset += crtl->outgoing_args_size;
6698
6699 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6700 rtx mem = gen_frame_mem (mode, addr);
6701
6702 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6703 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6704 /* No more registers to handle after REGNO.
6705 Emit a single save/restore and exit. */
6706 if (regno2 == last_regno)
6707 {
6708 insn = emit_insn (set);
6709 if (frame_related_p)
6710 {
6711 RTX_FRAME_RELATED_P (insn) = 1;
6712 if (prologue_p)
6713 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6714 else
6715 add_reg_note (insn, REG_CFA_RESTORE, reg);
6716 }
6717 break;
6718 }
6719
6720 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6721 /* The next register is not of the same class or its offset is not
6722 mergeable with the current one into a pair. */
6723 if (aarch64_sve_mode_p (mode)
6724 || !satisfies_constraint_Ump (mem)
6725 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6726 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6727 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6728 GET_MODE_SIZE (mode)))
6729 {
6730 insn = emit_insn (set);
6731 if (frame_related_p)
6732 {
6733 RTX_FRAME_RELATED_P (insn) = 1;
6734 if (prologue_p)
6735 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6736 else
6737 add_reg_note (insn, REG_CFA_RESTORE, reg);
6738 }
6739
6740 regno = regno2;
6741 continue;
6742 }
6743
6744 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6745
6746 /* REGNO2 can be saved/restored in a pair with REGNO. */
6747 rtx reg2 = gen_rtx_REG (mode, regno2);
6748 if (frame_pointer_needed)
6749 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6750 else
6751 offset2 += crtl->outgoing_args_size;
6752 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6753 rtx mem2 = gen_frame_mem (mode, addr2);
6754 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6755 : gen_rtx_SET (reg2, mem2);
6756
6757 if (prologue_p)
6758 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6759 else
6760 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6761
6762 if (frame_related_p || frame_related2_p)
6763 {
6764 RTX_FRAME_RELATED_P (insn) = 1;
6765 if (prologue_p)
6766 {
6767 if (frame_related_p)
6768 add_reg_note (insn, REG_CFA_OFFSET, set);
6769 if (frame_related2_p)
6770 add_reg_note (insn, REG_CFA_OFFSET, set2);
6771 }
6772 else
6773 {
6774 if (frame_related_p)
6775 add_reg_note (insn, REG_CFA_RESTORE, reg);
6776 if (frame_related2_p)
6777 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6778 }
6779 }
6780
6781 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6782 }
6783 }
6784
6785 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6786
6787 static void
6788 aarch64_emit_prologue_components (sbitmap components)
6789 {
6790 aarch64_process_components (components, true);
6791 }
6792
6793 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6794
6795 static void
6796 aarch64_emit_epilogue_components (sbitmap components)
6797 {
6798 aarch64_process_components (components, false);
6799 }
6800
6801 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6802
6803 static void
6804 aarch64_set_handled_components (sbitmap components)
6805 {
6806 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6807 if (bitmap_bit_p (components, regno))
6808 cfun->machine->reg_is_wrapped_separately[regno] = true;
6809 }
6810
6811 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6812 determine the probe offset for alloca. */
6813
6814 static HOST_WIDE_INT
6815 aarch64_stack_clash_protection_alloca_probe_range (void)
6816 {
6817 return STACK_CLASH_CALLER_GUARD;
6818 }
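/* STACK_CLASH_CALLER_GUARD is the ABI-defined caller-protected area
   referred to as the "1KB buffer" in the comment below; assuming the
   usual definition of 1024 bytes, alloca allocations that stay within
   that range need no extra probe.  */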
6819
6820
6821 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6822 registers. If POLY_SIZE is not large enough to require a probe this function
6823 will only adjust the stack. When allocating the stack space
6824 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6825 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6826 arguments. If we are, then we ensure that any allocation larger than the
6827 ABI-defined buffer needs a probe, so that the invariant of having a 1KB
6828 buffer is maintained.
6829
6830 We emit barriers after each stack adjustment to prevent optimizations from
6831 breaking the invariant that we never drop the stack more than a page. This
6832 invariant is needed to make it easier to correctly handle asynchronous
6833 events: e.g. if we were to allow the stack to be dropped by more than a page
6834 and then have multiple probes up, and we take a signal somewhere in between,
6835 then the signal handler doesn't know the state of the stack and can make no
6836 assumptions about which pages have been probed. */
6837
6838 static void
6839 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6840 poly_int64 poly_size,
6841 bool frame_related_p,
6842 bool final_adjustment_p)
6843 {
6844 HOST_WIDE_INT guard_size
6845 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6846 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6847 HOST_WIDE_INT min_probe_threshold
6848 = (final_adjustment_p
6849 ? guard_used_by_caller
6850 : guard_size - guard_used_by_caller);
6851 /* When doing the final adjustment for the outgoing arguments, take into
6852 account any unprobed space there is above the current SP. There are
6853 two cases:
6854
6855 - When saving SVE registers below the hard frame pointer, we force
6856 the lowest save to take place in the prologue before doing the final
6857 adjustment (i.e. we don't allow the save to be shrink-wrapped).
6858 This acts as a probe at SP, so there is no unprobed space.
6859
6860 - When there are no SVE register saves, we use the store of the link
6861 register as a probe. We can't assume that LR was saved at position 0
6862 though, so treat any space below it as unprobed. */
6863 if (final_adjustment_p
6864 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
6865 {
6866 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
6867 if (known_ge (lr_offset, 0))
6868 min_probe_threshold -= lr_offset.to_constant ();
6869 else
6870 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
6871 }
6872
6873 poly_int64 frame_size = cfun->machine->frame.frame_size;
6874
6875 /* We should always have a positive probe threshold. */
6876 gcc_assert (min_probe_threshold > 0);
6877
6878 if (flag_stack_clash_protection && !final_adjustment_p)
6879 {
6880 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6881 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
6882 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6883
6884 if (known_eq (frame_size, 0))
6885 {
6886 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6887 }
6888 else if (known_lt (initial_adjust + sve_callee_adjust,
6889 guard_size - guard_used_by_caller)
6890 && known_lt (final_adjust, guard_used_by_caller))
6891 {
6892 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6893 }
6894 }
6895
6896 /* If SIZE is not large enough to require probing, just adjust the stack and
6897 exit. */
6898 if (known_lt (poly_size, min_probe_threshold)
6899 || !flag_stack_clash_protection)
6900 {
6901 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6902 return;
6903 }
6904
6905 HOST_WIDE_INT size;
6906 /* Handle the SVE non-constant case first. */
6907 if (!poly_size.is_constant (&size))
6908 {
6909 if (dump_file)
6910 {
6911 fprintf (dump_file, "Stack clash SVE prologue: ");
6912 print_dec (poly_size, dump_file);
6913 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6914 }
6915
6916 /* First calculate the number of bytes we're actually spilling. */
6917 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6918 poly_size, temp1, temp2, false, true);
6919
6920 rtx_insn *insn = get_last_insn ();
6921
6922 if (frame_related_p)
6923 {
6924 /* This is done to provide unwinding information for the stack
6925 adjustments we're about to do; however, to prevent the optimizers
6926 from removing the R11 move and leaving the CFA note (which would be
6927 very wrong), we tie the old and new stack pointer together.
6928 The tie will expand to nothing but the optimizers will not touch
6929 the instruction. */
6930 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6931 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6932 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6933
6934 /* We want the CFA independent of the stack pointer for the
6935 duration of the loop. */
6936 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6937 RTX_FRAME_RELATED_P (insn) = 1;
6938 }
6939
6940 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6941 rtx guard_const = gen_int_mode (guard_size, Pmode);
6942
6943 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6944 stack_pointer_rtx, temp1,
6945 probe_const, guard_const));
6946
6947 /* Now reset the CFA register if needed. */
6948 if (frame_related_p)
6949 {
6950 add_reg_note (insn, REG_CFA_DEF_CFA,
6951 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6952 gen_int_mode (poly_size, Pmode)));
6953 RTX_FRAME_RELATED_P (insn) = 1;
6954 }
6955
6956 return;
6957 }
6958
6959 if (dump_file)
6960 fprintf (dump_file,
6961 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6962 " bytes, probing will be required.\n", size);
6963
6964 /* Round size to the nearest multiple of guard_size, and calculate the
6965 residual as the difference between the original size and the rounded
6966 size. */
6967 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6968 HOST_WIDE_INT residual = size - rounded_size;
6969
6970 /* We can handle a small number of allocations/probes inline. Otherwise
6971 punt to a loop. */
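/* Roughly speaking, each unrolled iteration below expands to something like
   sub sp, sp, #<guard size>
   str xzr, [sp, #<caller guard>]
   i.e. drop the stack by one guard-sized page and then touch a word about
   1KB above the new SP, inside the freshly allocated region.  */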
6972 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6973 {
6974 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6975 {
6976 aarch64_sub_sp (NULL, temp2, guard_size, true);
6977 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6978 guard_used_by_caller));
6979 emit_insn (gen_blockage ());
6980 }
6981 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6982 }
6983 else
6984 {
6985 /* Compute the ending address. */
6986 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6987 temp1, NULL, false, true);
6988 rtx_insn *insn = get_last_insn ();
6989
6990 /* For the initial allocation, we don't have a frame pointer
6991 set up, so we always need CFI notes. If we're doing the
6992 final allocation, then we may have a frame pointer, in which
6993 case it is the CFA, otherwise we need CFI notes.
6994
6995 We can determine which allocation we are doing by looking at
6996 the value of FRAME_RELATED_P since the final allocations are not
6997 frame related. */
6998 if (frame_related_p)
6999 {
7000 /* We want the CFA independent of the stack pointer for the
7001 duration of the loop. */
7002 add_reg_note (insn, REG_CFA_DEF_CFA,
7003 plus_constant (Pmode, temp1, rounded_size));
7004 RTX_FRAME_RELATED_P (insn) = 1;
7005 }
7006
7007 /* This allocates and probes the stack. Note that this re-uses some of
7008 the existing Ada stack protection code. However, we are guaranteed not
7009 to enter the non-loop or residual branches of that code.
7010
7011 The non-loop part won't be entered because if our allocation amount
7012 doesn't require a loop, the case above would handle it.
7013
7014 The residual amount won't be entered because TEMP1 is a multiple of
7015 the allocation size. The residual will always be 0. As such, the only
7016 part we are actually using from that code is the loop setup. The
7017 actual probing is done in aarch64_output_probe_stack_range. */
7018 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7019 stack_pointer_rtx, temp1));
7020
7021 /* Now reset the CFA register if needed. */
7022 if (frame_related_p)
7023 {
7024 add_reg_note (insn, REG_CFA_DEF_CFA,
7025 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7026 RTX_FRAME_RELATED_P (insn) = 1;
7027 }
7028
7029 emit_insn (gen_blockage ());
7030 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7031 }
7032
7033 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7034 be probed. This maintains the requirement that each page is probed at
7035 least once. For initial probing we probe only if the allocation is
7036 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7037 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7038 GUARD_SIZE. This means that any allocation large enough to trigger a
7039 probe here gets at least one, and any allocation too small for this code
7040 to emit anything for it will already have had its page probed by the
7041 saving of FP/LR, either in this function or in one of its callees. If
7042 we don't have any callees then we won't have more stack adjustments and so
7043 are still safe. */
7044 if (residual)
7045 {
7046 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7047 /* If we're doing final adjustments, and we've done any full page
7048 allocations then any residual needs to be probed. */
7049 if (final_adjustment_p && rounded_size != 0)
7050 min_probe_threshold = 0;
7051 /* If doing a small final adjustment, we always probe at offset 0.
7052 This is done to avoid issues when LR is not at position 0 or when
7053 the final adjustment is smaller than the probing offset. */
7054 else if (final_adjustment_p && rounded_size == 0)
7055 residual_probe_offset = 0;
7056
7057 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7058 if (residual >= min_probe_threshold)
7059 {
7060 if (dump_file)
7061 fprintf (dump_file,
7062 "Stack clash AArch64 prologue residuals: "
7063 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7064 "\n", residual);
7065
7066 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7067 residual_probe_offset));
7068 emit_insn (gen_blockage ());
7069 }
7070 }
7071 }
7072
7073 /* Return 1 if the register is used by the epilogue. We need to say the
7074 return register is used, but only after epilogue generation is complete.
7075 Note that in the case of sibcalls, the values "used by the epilogue" are
7076 considered live at the start of the called function.
7077
7078 For SIMD functions we need to return 1 for FP registers that are saved and
7079 restored by a function but are not zero in call_used_regs. If we do not
7080 do this, optimizations may remove the restore of the register. */
7081
7082 int
7083 aarch64_epilogue_uses (int regno)
7084 {
7085 if (epilogue_completed)
7086 {
7087 if (regno == LR_REGNUM)
7088 return 1;
7089 }
7090 return 0;
7091 }
7092
7093 /* AArch64 stack frames generated by this compiler look like:
7094
7095 +-------------------------------+
7096 | |
7097 | incoming stack arguments |
7098 | |
7099 +-------------------------------+
7100 | | <-- incoming stack pointer (aligned)
7101 | callee-allocated save area |
7102 | for register varargs |
7103 | |
7104 +-------------------------------+
7105 | local variables | <-- frame_pointer_rtx
7106 | |
7107 +-------------------------------+
7108 | padding | \
7109 +-------------------------------+ |
7110 | callee-saved registers | | frame.saved_regs_size
7111 +-------------------------------+ |
7112 | LR' | |
7113 +-------------------------------+ |
7114 | FP' | |
7115 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7116 | SVE vector registers | | \
7117 +-------------------------------+ | | below_hard_fp_saved_regs_size
7118 | SVE predicate registers | / /
7119 +-------------------------------+
7120 | dynamic allocation |
7121 +-------------------------------+
7122 | padding |
7123 +-------------------------------+
7124 | outgoing stack arguments | <-- arg_pointer
7125 | |
7126 +-------------------------------+
7127 | | <-- stack_pointer_rtx (aligned)
7128
7129 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7130 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7131 unchanged.
7132
7133 By default for stack-clash we assume the guard is at least 64KB, but this
7134 value is configurable to either 4KB or 64KB. We also force the guard size to
7135 be the same as the probing interval and both values are kept in sync.
7136
7137 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7138 on the guard size) of stack space without probing.
7139
7140 When probing is needed, we emit a probe at the start of the prologue
7141 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7142
7143 We have to track how much space has been allocated, and the only stores
7144 to the stack that we track as implicit probes are the FP/LR stores.
7145
7146 For outgoing arguments we probe if the size is larger than 1KB, such that
7147 the ABI specified buffer is maintained for the next callee.
7148
7149 The following registers are reserved during frame layout and should not be
7150 used for any other purpose:
7151
7152 - r11: Used by stack clash protection when SVE is enabled, and also
7153 as an anchor register when saving and restoring registers
7154 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7155 - r14 and r15: Used for speculation tracking.
7156 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7157 - r30(LR), r29(FP): Used by standard frame layout.
7158
7159 These registers must be avoided in frame layout related code unless the
7160 explicit intention is to interact with one of the features listed above. */
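/* For a typical small, non-SVE frame the prologue below boils down to
   something along the lines of
   stp x29, x30, [sp, #-<callee_adjust>]!
   mov x29, sp
   stp x19, x20, [sp, #16]
   sub sp, sp, #<final_adjust>
   i.e. push FP/LR while allocating the register save area, establish the
   frame chain, store the remaining callee saves and finally carve out the
   outgoing argument area, though the exact sequence depends on the frame
   layout computed for the function.  */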
7161
7162 /* Generate the prologue instructions for entry into a function.
7163 Establish the stack frame by decreasing the stack pointer with a
7164 properly calculated size and, if necessary, create a frame record
7165 filled with the values of LR and previous frame pointer. The
7166 current FP is also set up if it is in use. */
7167
7168 void
7169 aarch64_expand_prologue (void)
7170 {
7171 poly_int64 frame_size = cfun->machine->frame.frame_size;
7172 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7173 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7174 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7175 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7176 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7177 poly_int64 below_hard_fp_saved_regs_size
7178 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7179 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7180 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7181 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7182 rtx_insn *insn;
7183
7184 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7185 {
7186 /* Fold the SVE allocation into the initial allocation.
7187 We don't do this in aarch64_layout_frame to avoid pessimizing
7188 the epilogue code. */
7189 initial_adjust += sve_callee_adjust;
7190 sve_callee_adjust = 0;
7191 }
7192
7193 /* Sign return address for functions. */
7194 if (aarch64_return_address_signing_enabled ())
7195 {
7196 switch (aarch64_ra_sign_key)
7197 {
7198 case AARCH64_KEY_A:
7199 insn = emit_insn (gen_paciasp ());
7200 break;
7201 case AARCH64_KEY_B:
7202 insn = emit_insn (gen_pacibsp ());
7203 break;
7204 default:
7205 gcc_unreachable ();
7206 }
7207 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7208 RTX_FRAME_RELATED_P (insn) = 1;
7209 }
7210
7211 if (flag_stack_usage_info)
7212 current_function_static_stack_size = constant_lower_bound (frame_size);
7213
7214 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7215 {
7216 if (crtl->is_leaf && !cfun->calls_alloca)
7217 {
7218 if (maybe_gt (frame_size, PROBE_INTERVAL)
7219 && maybe_gt (frame_size, get_stack_check_protect ()))
7220 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7221 (frame_size
7222 - get_stack_check_protect ()));
7223 }
7224 else if (maybe_gt (frame_size, 0))
7225 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7226 }
7227
7228 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7229 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7230
7231 /* In theory we should never have both an initial adjustment
7232 and a callee save adjustment. Verify that is the case since the
7233 code below does not handle it for -fstack-clash-protection. */
7234 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7235
7236 /* Will only probe if the initial adjustment is larger than the guard
7237 less the amount of the guard reserved for use by the caller's
7238 outgoing args. */
7239 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7240 true, false);
7241
7242 if (callee_adjust != 0)
7243 aarch64_push_regs (reg1, reg2, callee_adjust);
7244
7245 /* The offset of the frame chain record (if any) from the current SP. */
7246 poly_int64 chain_offset = (initial_adjust + callee_adjust
7247 - cfun->machine->frame.hard_fp_offset);
7248 gcc_assert (known_ge (chain_offset, 0));
7249
7250 /* The offset of the bottom of the save area from the current SP. */
7251 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7252
7253 if (emit_frame_chain)
7254 {
7255 if (callee_adjust == 0)
7256 {
7257 reg1 = R29_REGNUM;
7258 reg2 = R30_REGNUM;
7259 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7260 false, false);
7261 }
7262 else
7263 gcc_assert (known_eq (chain_offset, 0));
7264 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7265 stack_pointer_rtx, chain_offset,
7266 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7267 if (frame_pointer_needed && !frame_size.is_constant ())
7268 {
7269 /* Variable-sized frames need to describe the save slot
7270 address using DW_CFA_expression rather than DW_CFA_offset.
7271 This means that, without taking further action, the
7272 locations of the registers that we've already saved would
7273 remain based on the stack pointer even after we redefine
7274 the CFA based on the frame pointer. We therefore need new
7275 DW_CFA_expressions to re-express the save slots with addresses
7276 based on the frame pointer. */
7277 rtx_insn *insn = get_last_insn ();
7278 gcc_assert (RTX_FRAME_RELATED_P (insn));
7279
7280 /* Add an explicit CFA definition if this was previously
7281 implicit. */
7282 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7283 {
7284 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7285 callee_offset);
7286 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7287 gen_rtx_SET (hard_frame_pointer_rtx, src));
7288 }
7289
7290 /* Change the save slot expressions for the registers that
7291 we've already saved. */
7292 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7293 hard_frame_pointer_rtx, UNITS_PER_WORD);
7294 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7295 hard_frame_pointer_rtx, 0);
7296 }
7297 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7298 }
7299
7300 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7301 callee_adjust != 0 || emit_frame_chain,
7302 emit_frame_chain);
7303 if (maybe_ne (sve_callee_adjust, 0))
7304 {
7305 gcc_assert (!flag_stack_clash_protection
7306 || known_eq (initial_adjust, 0));
7307 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7308 sve_callee_adjust,
7309 !frame_pointer_needed, false);
7310 saved_regs_offset += sve_callee_adjust;
7311 }
7312 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7313 false, emit_frame_chain);
7314 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7315 callee_adjust != 0 || emit_frame_chain,
7316 emit_frame_chain);
7317
7318 /* We may need to probe the final adjustment if it is larger than the guard
7319 that is assumed by the callee. */
7320 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7321 !frame_pointer_needed, true);
7322 }
7323
7324 /* Return TRUE if we can use a simple_return insn.
7325
7326 This function checks whether the callee-saved stack is empty, which
7327 means no restore actions are needed. The pro_and_epilogue pass will use
7328 this to check whether the shrink-wrapping optimization is feasible. */
7329
7330 bool
7331 aarch64_use_return_insn_p (void)
7332 {
7333 if (!reload_completed)
7334 return false;
7335
7336 if (crtl->profile)
7337 return false;
7338
7339 return known_eq (cfun->machine->frame.frame_size, 0);
7340 }
7341
7342 /* Generate the epilogue instructions for returning from a function.
7343 This is almost exactly the reverse of the prolog sequence, except
7344 that we need to insert barriers to avoid scheduling loads that read
7345 from a deallocated stack, and we optimize the unwind records by
7346 emitting them all together if possible. */
7347 void
7348 aarch64_expand_epilogue (bool for_sibcall)
7349 {
7350 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7351 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7352 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7353 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7354 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7355 poly_int64 below_hard_fp_saved_regs_size
7356 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7357 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7358 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7359 rtx cfi_ops = NULL;
7360 rtx_insn *insn;
7361 /* A stack clash protection prologue may not have left EP0_REGNUM or
7362 EP1_REGNUM in a usable state. The same is true for allocations
7363 with an SVE component, since we then need both temporary registers
7364 for each allocation. For stack clash we are in a usable state if
7365 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7366 HOST_WIDE_INT guard_size
7367 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
7368 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7369
7370 /* We can re-use the registers when:
7371
7372 (a) the deallocation amount is the same as the corresponding
7373 allocation amount (which is false if we combine the initial
7374 and SVE callee save allocations in the prologue); and
7375
7376 (b) the allocation amount doesn't need a probe (which is false
7377 if the amount is guard_size - guard_used_by_caller or greater).
7378
7379 In such situations the register should remain live with the correct
7380 value. */
7381 bool can_inherit_p = (initial_adjust.is_constant ()
7382 && final_adjust.is_constant ()
7383 && (!flag_stack_clash_protection
7384 || (known_lt (initial_adjust,
7385 guard_size - guard_used_by_caller)
7386 && known_eq (sve_callee_adjust, 0))));
7387
7388 /* We need to add memory barrier to prevent read from deallocated stack. */
7389 bool need_barrier_p
7390 = maybe_ne (get_frame_size ()
7391 + cfun->machine->frame.saved_varargs_size, 0);
7392
7393 /* Emit a barrier to prevent loads from a deallocated stack. */
7394 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7395 || cfun->calls_alloca
7396 || crtl->calls_eh_return)
7397 {
7398 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7399 need_barrier_p = false;
7400 }
7401
7402 /* Restore the stack pointer from the frame pointer if it may not
7403 be the same as the stack pointer. */
7404 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7405 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7406 if (frame_pointer_needed
7407 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7408 /* If writeback is used when restoring callee-saves, the CFA
7409 is restored on the instruction doing the writeback. */
7410 aarch64_add_offset (Pmode, stack_pointer_rtx,
7411 hard_frame_pointer_rtx,
7412 -callee_offset - below_hard_fp_saved_regs_size,
7413 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7414 else
7415 /* The case where we need to re-use the register here is very rare, so
7416 avoid the complicated condition and just always emit a move if the
7417 immediate doesn't fit. */
7418 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7419
7420 /* Restore the vector registers before the predicate registers,
7421 so that we can use P4 as a temporary for big-endian SVE frames. */
7422 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7423 callee_adjust != 0, &cfi_ops);
7424 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7425 false, &cfi_ops);
7426 if (maybe_ne (sve_callee_adjust, 0))
7427 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7428 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7429 R0_REGNUM, R30_REGNUM,
7430 callee_adjust != 0, &cfi_ops);
7431
7432 if (need_barrier_p)
7433 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7434
7435 if (callee_adjust != 0)
7436 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7437
7438 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7439 {
7440 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7441 insn = get_last_insn ();
7442 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7443 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7444 RTX_FRAME_RELATED_P (insn) = 1;
7445 cfi_ops = NULL;
7446 }
7447
7448 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7449 restrict the emit_move optimization to leaf functions. */
7450 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7451 (!can_inherit_p || !crtl->is_leaf
7452 || df_regs_ever_live_p (EP0_REGNUM)));
7453
7454 if (cfi_ops)
7455 {
7456 /* Emit delayed restores and reset the CFA to be SP. */
7457 insn = get_last_insn ();
7458 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7459 REG_NOTES (insn) = cfi_ops;
7460 RTX_FRAME_RELATED_P (insn) = 1;
7461 }
7462
7463 /* We prefer to emit the combined return/authenticate instruction RETAA;
7464 however, there are three cases in which we must instead emit an explicit
7465 authentication instruction.
7466
7467 1) Sibcalls don't return in a normal way, so if we're about to call one
7468 we must authenticate.
7469
7470 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7471 generating code for !TARGET_ARMV8_3 we can't use it and must
7472 explicitly authenticate.
7473
7474 3) On an eh_return path we make extra stack adjustments to update the
7475 canonical frame address to be the exception handler's CFA. We want
7476 to authenticate using the CFA of the function which calls eh_return.
7477 */
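/* Concretely, a signed-return-address epilogue therefore ends either in a
   single combined instruction such as
   retaa
   or, in the three cases above, in an explicit pair such as
   autiasp
   ret
   (with autibsp/retab being the corresponding key-B forms).  */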
7478 if (aarch64_return_address_signing_enabled ()
7479 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7480 {
7481 switch (aarch64_ra_sign_key)
7482 {
7483 case AARCH64_KEY_A:
7484 insn = emit_insn (gen_autiasp ());
7485 break;
7486 case AARCH64_KEY_B:
7487 insn = emit_insn (gen_autibsp ());
7488 break;
7489 default:
7490 gcc_unreachable ();
7491 }
7492 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7493 RTX_FRAME_RELATED_P (insn) = 1;
7494 }
7495
7496 /* Stack adjustment for exception handler. */
7497 if (crtl->calls_eh_return && !for_sibcall)
7498 {
7499 /* We need to unwind the stack by the offset computed by
7500 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7501 to be SP; letting the CFA move during this adjustment
7502 is just as correct as retaining the CFA from the body
7503 of the function. Therefore, do nothing special. */
7504 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7505 }
7506
7507 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7508 if (!for_sibcall)
7509 emit_jump_insn (ret_rtx);
7510 }
7511
7512 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7513 normally or return to a previous frame after unwinding.
7514
7515 An EH return uses a single shared return sequence. The epilogue is
7516 exactly like a normal epilogue except that it has an extra input
7517 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7518 that must be applied after the frame has been destroyed. An extra label
7519 is inserted before the epilogue which initializes this register to zero,
7520 and this is the entry point for a normal return.
7521
7522 An actual EH return updates the return address, initializes the stack
7523 adjustment and jumps directly into the epilogue (bypassing the zeroing
7524 of the adjustment). Since the return address is typically saved on the
7525 stack when a function makes a call, the saved LR must be updated outside
7526 the epilogue.
7527
7528 This poses problems as the store is generated well before the epilogue,
7529 so the offset of LR is not known yet. Also optimizations will remove the
7530 store as it appears dead, even after the epilogue is generated (as the
7531 base or offset for loading LR is different in many cases).
7532
7533 To avoid these problems this implementation forces the frame pointer
7534 in eh_return functions so that the location of LR is fixed and known early.
7535 It also marks the store volatile, so no optimization is permitted to
7536 remove the store. */
7537 rtx
7538 aarch64_eh_return_handler_rtx (void)
7539 {
7540 rtx tmp = gen_frame_mem (Pmode,
7541 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7542
7543 /* Mark the store volatile, so no optimization is permitted to remove it. */
7544 MEM_VOLATILE_P (tmp) = true;
7545 return tmp;
7546 }
7547
7548 /* Output code to add DELTA to the first argument, and then jump
7549 to FUNCTION. Used for C++ multiple inheritance. */
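/* For a thunk with a small DELTA and no vcall offset this amounts to
   something like
   add x0, x0, #<delta>
   b <function>
   i.e. adjust the incoming this pointer in x0 and tail-call the real
   method.  */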
7550 static void
7551 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7552 HOST_WIDE_INT delta,
7553 HOST_WIDE_INT vcall_offset,
7554 tree function)
7555 {
7556 /* The this pointer is always in x0. Note that this differs from
7557 Arm where the this pointer may be bumped to r1 if r0 is required
7558 to return a pointer to an aggregate. On AArch64 a result value
7559 pointer will be in x8. */
7560 int this_regno = R0_REGNUM;
7561 rtx this_rtx, temp0, temp1, addr, funexp;
7562 rtx_insn *insn;
7563 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7564
7565 if (aarch64_bti_enabled ())
7566 emit_insn (gen_bti_c());
7567
7568 reload_completed = 1;
7569 emit_note (NOTE_INSN_PROLOGUE_END);
7570
7571 this_rtx = gen_rtx_REG (Pmode, this_regno);
7572 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7573 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7574
7575 if (vcall_offset == 0)
7576 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7577 else
7578 {
7579 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7580
7581 addr = this_rtx;
7582 if (delta != 0)
7583 {
7584 if (delta >= -256 && delta < 256)
7585 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7586 plus_constant (Pmode, this_rtx, delta));
7587 else
7588 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7589 temp1, temp0, false);
7590 }
7591
7592 if (Pmode == ptr_mode)
7593 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7594 else
7595 aarch64_emit_move (temp0,
7596 gen_rtx_ZERO_EXTEND (Pmode,
7597 gen_rtx_MEM (ptr_mode, addr)));
7598
7599 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7600 addr = plus_constant (Pmode, temp0, vcall_offset);
7601 else
7602 {
7603 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7604 Pmode);
7605 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7606 }
7607
7608 if (Pmode == ptr_mode)
7609 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
7610 else
7611 aarch64_emit_move (temp1,
7612 gen_rtx_SIGN_EXTEND (Pmode,
7613 gen_rtx_MEM (ptr_mode, addr)));
7614
7615 emit_insn (gen_add2_insn (this_rtx, temp1));
7616 }
7617
7618 /* Generate a tail call to the target function. */
7619 if (!TREE_USED (function))
7620 {
7621 assemble_external (function);
7622 TREE_USED (function) = 1;
7623 }
7624 funexp = XEXP (DECL_RTL (function), 0);
7625 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7626 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7627 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7628 SIBLING_CALL_P (insn) = 1;
7629
7630 insn = get_insns ();
7631 shorten_branches (insn);
7632
7633 assemble_start_function (thunk, fnname);
7634 final_start_function (insn, file, 1);
7635 final (insn, file, 1);
7636 final_end_function ();
7637 assemble_end_function (thunk, fnname);
7638
7639 /* Stop pretending to be a post-reload pass. */
7640 reload_completed = 0;
7641 }
7642
7643 static bool
7644 aarch64_tls_referenced_p (rtx x)
7645 {
7646 if (!TARGET_HAVE_TLS)
7647 return false;
7648 subrtx_iterator::array_type array;
7649 FOR_EACH_SUBRTX (iter, array, x, ALL)
7650 {
7651 const_rtx x = *iter;
7652 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7653 return true;
7654 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7655 TLS offsets, not real symbol references. */
7656 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7657 iter.skip_subrtxes ();
7658 }
7659 return false;
7660 }
7661
7662
7663 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7664 a left shift of 0 or 12 bits. */
7665 bool
7666 aarch64_uimm12_shift (HOST_WIDE_INT val)
7667 {
7668 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7669 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7670 );
7671 }
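/* For example, 0xabc and 0xabc000 both satisfy this test (a 12-bit value
   shifted left by 0 or by 12), whereas 0xabc0 does not, because its set
   bits straddle the two possible fields.  */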
7672
7673 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
7674 that can be created with a left shift of 0 or 12. */
7675 static HOST_WIDE_INT
7676 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7677 {
7678 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7679 handle correctly. */
7680 gcc_assert ((val & 0xffffff) == val);
7681
7682 if (((val & 0xfff) << 0) == val)
7683 return val;
7684
7685 return val & (0xfff << 12);
7686 }
7687
7688 /* Return true if val is an immediate that can be loaded into a
7689 register by a MOVZ instruction. */
7690 static bool
7691 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7692 {
7693 if (GET_MODE_SIZE (mode) > 4)
7694 {
7695 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7696 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7697 return 1;
7698 }
7699 else
7700 {
7701 /* Ignore sign extension. */
7702 val &= (HOST_WIDE_INT) 0xffffffff;
7703 }
7704 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7705 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7706 }
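/* For example, 0x12340000 satisfies this test (a MOVZ with a 16-bit shift),
   whereas 0x12345678 does not and would instead need a MOVZ/MOVK sequence
   or a literal load.  */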
7707
7708 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7709 64-bit (DImode) integer. */
7710
7711 static unsigned HOST_WIDE_INT
7712 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7713 {
7714 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7715 while (size < 64)
7716 {
7717 val &= (HOST_WIDE_INT_1U << size) - 1;
7718 val |= val << size;
7719 size *= 2;
7720 }
7721 return val;
7722 }
7723
7724 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7725
7726 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7727 {
7728 0x0000000100000001ull,
7729 0x0001000100010001ull,
7730 0x0101010101010101ull,
7731 0x1111111111111111ull,
7732 0x5555555555555555ull,
7733 };
7734
7735
7736 /* Return true if val is a valid bitmask immediate. */
7737
7738 bool
7739 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7740 {
7741 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7742 int bits;
7743
7744 /* Check for a single sequence of one bits and return quickly if so.
7745 The special cases of all ones and all zeroes return false. */
7746 val = aarch64_replicate_bitmask_imm (val_in, mode);
7747 tmp = val + (val & -val);
7748
7749 if (tmp == (tmp & -tmp))
7750 return (val + 1) > 1;
7751
7752 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7753 if (mode == SImode)
7754 val = (val << 32) | (val & 0xffffffff);
7755
7756 /* Invert if the immediate doesn't start with a zero bit - this means we
7757 only need to search for sequences of one bits. */
7758 if (val & 1)
7759 val = ~val;
7760
7761 /* Find the first set bit and set tmp to val with the first sequence of one
7762 bits removed. Return success if there is a single sequence of ones. */
7763 first_one = val & -val;
7764 tmp = val & (val + first_one);
7765
7766 if (tmp == 0)
7767 return true;
7768
7769 /* Find the next set bit and compute the difference in bit position. */
7770 next_one = tmp & -tmp;
7771 bits = clz_hwi (first_one) - clz_hwi (next_one);
7772 mask = val ^ tmp;
7773
7774 /* Check the bit position difference is a power of 2, and that the first
7775 sequence of one bits fits within 'bits' bits. */
7776 if ((mask >> bits) != 0 || bits != (bits & -bits))
7777 return false;
7778
7779 /* Check the sequence of one bits is repeated 64/bits times. */
7780 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7781 }
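/* For example, 0x0f0f0f0f0f0f0f0f (a run of four ones repeating every eight
   bits) is a valid bitmask immediate, whereas 0xfff1 is not, because its
   set bits do not form a single replicated run.  */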
7782
7783 /* Create a mask of ones covering the range from the lowest to the highest
7784 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7785
7786 unsigned HOST_WIDE_INT
7787 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7788 {
7789 int lowest_bit_set = ctz_hwi (val_in);
7790 int highest_bit_set = floor_log2 (val_in);
7791 gcc_assert (val_in != 0);
7792
7793 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7794 (HOST_WIDE_INT_1U << lowest_bit_set));
7795 }
7796
7797 /* Create a constant in which all bits outside the range from the lowest
7798 set bit to the highest set bit of VAL_IN are set to 1. */
7799
7800 unsigned HOST_WIDE_INT
7801 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7802 {
7803 return val_in | ~aarch64_and_split_imm1 (val_in);
7804 }
7805
7806 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7807
7808 bool
7809 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7810 {
7811 scalar_int_mode int_mode;
7812 if (!is_a <scalar_int_mode> (mode, &int_mode))
7813 return false;
7814
7815 if (aarch64_bitmask_imm (val_in, int_mode))
7816 return false;
7817
7818 if (aarch64_move_imm (val_in, int_mode))
7819 return false;
7820
7821 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7822
7823 return aarch64_bitmask_imm (imm2, int_mode);
7824 }
7825
7826 /* Return true if val is an immediate that can be loaded into a
7827 register in a single instruction. */
7828 bool
7829 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7830 {
7831 scalar_int_mode int_mode;
7832 if (!is_a <scalar_int_mode> (mode, &int_mode))
7833 return false;
7834
7835 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7836 return 1;
7837 return aarch64_bitmask_imm (val, int_mode);
7838 }
7839
7840 static bool
7841 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7842 {
7843 rtx base, offset;
7844
7845 if (GET_CODE (x) == HIGH)
7846 return true;
7847
7848 /* There's no way to calculate VL-based values using relocations. */
7849 subrtx_iterator::array_type array;
7850 FOR_EACH_SUBRTX (iter, array, x, ALL)
7851 if (GET_CODE (*iter) == CONST_POLY_INT)
7852 return true;
7853
7854 split_const (x, &base, &offset);
7855 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7856 {
7857 if (aarch64_classify_symbol (base, INTVAL (offset))
7858 != SYMBOL_FORCE_TO_MEM)
7859 return true;
7860 else
7861 /* Avoid generating a 64-bit relocation in ILP32; leave
7862 to aarch64_expand_mov_immediate to handle it properly. */
7863 return mode != ptr_mode;
7864 }
7865
7866 return aarch64_tls_referenced_p (x);
7867 }
7868
7869 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7870 The expansion for a table switch is quite expensive due to the number
7871 of instructions, the table lookup and the hard-to-predict indirect jump.
7872 When optimizing for speed with -O3 enabled, use the per-core tuning if
7873 set, otherwise use tables for > 16 cases as a tradeoff between size and
7874 performance. When optimizing for size, use the default setting. */
7875
7876 static unsigned int
7877 aarch64_case_values_threshold (void)
7878 {
7879 /* Use the specified limit for the number of cases before using jump
7880 tables at higher optimization levels. */
7881 if (optimize > 2
7882 && selected_cpu->tune->max_case_values != 0)
7883 return selected_cpu->tune->max_case_values;
7884 else
7885 return optimize_size ? default_case_values_threshold () : 17;
7886 }
7887
7888 /* Return true if register REGNO is a valid index register.
7889 STRICT_P is true if REG_OK_STRICT is in effect. */
7890
7891 bool
7892 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7893 {
7894 if (!HARD_REGISTER_NUM_P (regno))
7895 {
7896 if (!strict_p)
7897 return true;
7898
7899 if (!reg_renumber)
7900 return false;
7901
7902 regno = reg_renumber[regno];
7903 }
7904 return GP_REGNUM_P (regno);
7905 }
7906
7907 /* Return true if register REGNO is a valid base register for mode MODE.
7908 STRICT_P is true if REG_OK_STRICT is in effect. */
7909
7910 bool
7911 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7912 {
7913 if (!HARD_REGISTER_NUM_P (regno))
7914 {
7915 if (!strict_p)
7916 return true;
7917
7918 if (!reg_renumber)
7919 return false;
7920
7921 regno = reg_renumber[regno];
7922 }
7923
7924 /* The fake registers will be eliminated to either the stack or
7925 hard frame pointer, both of which are usually valid base registers.
7926 Reload deals with the cases where the eliminated form isn't valid. */
7927 return (GP_REGNUM_P (regno)
7928 || regno == SP_REGNUM
7929 || regno == FRAME_POINTER_REGNUM
7930 || regno == ARG_POINTER_REGNUM);
7931 }
7932
7933 /* Return true if X is a valid base register for mode MODE.
7934 STRICT_P is true if REG_OK_STRICT is in effect. */
7935
7936 static bool
7937 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7938 {
7939 if (!strict_p
7940 && GET_CODE (x) == SUBREG
7941 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7942 x = SUBREG_REG (x);
7943
7944 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7945 }
7946
7947 /* Return true if address offset is a valid index. If it is, fill in INFO
7948 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
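/* The RTL shapes recognized below correspond to index operands such as
   [x0, x1]           base plus register
   [x0, x1, lsl #3]   base plus scaled register
   [x0, w1, sxtw]     base plus sign-extended 32-bit register
   [x0, w1, uxtw #2]  base plus zero-extended, scaled 32-bit register
   in AArch64 assembly, give or take the exact extend/shift combination.  */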
7949
7950 static bool
7951 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7952 machine_mode mode, bool strict_p)
7953 {
7954 enum aarch64_address_type type;
7955 rtx index;
7956 int shift;
7957
7958 /* (reg:P) */
7959 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7960 && GET_MODE (x) == Pmode)
7961 {
7962 type = ADDRESS_REG_REG;
7963 index = x;
7964 shift = 0;
7965 }
7966 /* (sign_extend:DI (reg:SI)) */
7967 else if ((GET_CODE (x) == SIGN_EXTEND
7968 || GET_CODE (x) == ZERO_EXTEND)
7969 && GET_MODE (x) == DImode
7970 && GET_MODE (XEXP (x, 0)) == SImode)
7971 {
7972 type = (GET_CODE (x) == SIGN_EXTEND)
7973 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7974 index = XEXP (x, 0);
7975 shift = 0;
7976 }
7977 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7978 else if (GET_CODE (x) == MULT
7979 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7980 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7981 && GET_MODE (XEXP (x, 0)) == DImode
7982 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7983 && CONST_INT_P (XEXP (x, 1)))
7984 {
7985 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7986 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7987 index = XEXP (XEXP (x, 0), 0);
7988 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7989 }
7990 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7991 else if (GET_CODE (x) == ASHIFT
7992 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7993 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7994 && GET_MODE (XEXP (x, 0)) == DImode
7995 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7996 && CONST_INT_P (XEXP (x, 1)))
7997 {
7998 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7999 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8000 index = XEXP (XEXP (x, 0), 0);
8001 shift = INTVAL (XEXP (x, 1));
8002 }
8003 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8004 else if ((GET_CODE (x) == SIGN_EXTRACT
8005 || GET_CODE (x) == ZERO_EXTRACT)
8006 && GET_MODE (x) == DImode
8007 && GET_CODE (XEXP (x, 0)) == MULT
8008 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8009 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8010 {
8011 type = (GET_CODE (x) == SIGN_EXTRACT)
8012 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8013 index = XEXP (XEXP (x, 0), 0);
8014 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8015 if (INTVAL (XEXP (x, 1)) != 32 + shift
8016 || INTVAL (XEXP (x, 2)) != 0)
8017 shift = -1;
8018 }
8019 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8020 (const_int 0xffffffff<<shift)) */
8021 else if (GET_CODE (x) == AND
8022 && GET_MODE (x) == DImode
8023 && GET_CODE (XEXP (x, 0)) == MULT
8024 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8025 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8026 && CONST_INT_P (XEXP (x, 1)))
8027 {
8028 type = ADDRESS_REG_UXTW;
8029 index = XEXP (XEXP (x, 0), 0);
8030 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8031 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8032 shift = -1;
8033 }
8034 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8035 else if ((GET_CODE (x) == SIGN_EXTRACT
8036 || GET_CODE (x) == ZERO_EXTRACT)
8037 && GET_MODE (x) == DImode
8038 && GET_CODE (XEXP (x, 0)) == ASHIFT
8039 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8040 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8041 {
8042 type = (GET_CODE (x) == SIGN_EXTRACT)
8043 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8044 index = XEXP (XEXP (x, 0), 0);
8045 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8046 if (INTVAL (XEXP (x, 1)) != 32 + shift
8047 || INTVAL (XEXP (x, 2)) != 0)
8048 shift = -1;
8049 }
8050 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8051 (const_int 0xffffffff<<shift)) */
8052 else if (GET_CODE (x) == AND
8053 && GET_MODE (x) == DImode
8054 && GET_CODE (XEXP (x, 0)) == ASHIFT
8055 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8056 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8057 && CONST_INT_P (XEXP (x, 1)))
8058 {
8059 type = ADDRESS_REG_UXTW;
8060 index = XEXP (XEXP (x, 0), 0);
8061 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8062 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8063 shift = -1;
8064 }
8065 /* (mult:P (reg:P) (const_int scale)) */
8066 else if (GET_CODE (x) == MULT
8067 && GET_MODE (x) == Pmode
8068 && GET_MODE (XEXP (x, 0)) == Pmode
8069 && CONST_INT_P (XEXP (x, 1)))
8070 {
8071 type = ADDRESS_REG_REG;
8072 index = XEXP (x, 0);
8073 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8074 }
8075 /* (ashift:P (reg:P) (const_int shift)) */
8076 else if (GET_CODE (x) == ASHIFT
8077 && GET_MODE (x) == Pmode
8078 && GET_MODE (XEXP (x, 0)) == Pmode
8079 && CONST_INT_P (XEXP (x, 1)))
8080 {
8081 type = ADDRESS_REG_REG;
8082 index = XEXP (x, 0);
8083 shift = INTVAL (XEXP (x, 1));
8084 }
8085 else
8086 return false;
8087
8088 if (!strict_p
8089 && GET_CODE (index) == SUBREG
8090 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8091 index = SUBREG_REG (index);
8092
8093 if (aarch64_sve_data_mode_p (mode))
8094 {
8095 if (type != ADDRESS_REG_REG
8096 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8097 return false;
8098 }
8099 else
8100 {
8101 if (shift != 0
8102 && !(IN_RANGE (shift, 1, 3)
8103 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8104 return false;
8105 }
8106
8107 if (REG_P (index)
8108 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8109 {
8110 info->type = type;
8111 info->offset = index;
8112 info->shift = shift;
8113 return true;
8114 }
8115
8116 return false;
8117 }
8118
8119 /* Return true if MODE is one of the modes for which we
8120 support LDP/STP operations. */
8121
8122 static bool
8123 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8124 {
8125 return mode == SImode || mode == DImode
8126 || mode == SFmode || mode == DFmode
8127 || (aarch64_vector_mode_supported_p (mode)
8128 && (known_eq (GET_MODE_SIZE (mode), 8)
8129 || (known_eq (GET_MODE_SIZE (mode), 16)
8130 && (aarch64_tune_params.extra_tuning_flags
8131 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8132 }
8133
8134 /* Return true if REGNO is a virtual pointer register, or an eliminable
8135 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8136 include stack_pointer or hard_frame_pointer. */
8137 static bool
8138 virt_or_elim_regno_p (unsigned regno)
8139 {
8140 return ((regno >= FIRST_VIRTUAL_REGISTER
8141 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8142 || regno == FRAME_POINTER_REGNUM
8143 || regno == ARG_POINTER_REGNUM);
8144 }
8145
8146 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8147 If it is, fill in INFO appropriately. STRICT_P is true if
8148 REG_OK_STRICT is in effect. */
8149
8150 bool
8151 aarch64_classify_address (struct aarch64_address_info *info,
8152 rtx x, machine_mode mode, bool strict_p,
8153 aarch64_addr_query_type type)
8154 {
8155 enum rtx_code code = GET_CODE (x);
8156 rtx op0, op1;
8157 poly_int64 offset;
8158
8159 HOST_WIDE_INT const_size;
8160
8161 /* Whether a vector mode is partial doesn't affect address legitimacy.
8162 Partial vectors like VNx8QImode allow the same indexed addressing
8163 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8164 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8165 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8166 vec_flags &= ~VEC_PARTIAL;
8167
8168 /* On BE, we use load/store pair for all large int mode load/stores.
8169 TI/TFmode may also use a load/store pair. */
8170 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8171 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8172 || type == ADDR_QUERY_LDP_STP_N
8173 || mode == TImode
8174 || mode == TFmode
8175 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8176
8177 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8178 corresponds to the actual size of the memory being loaded/stored and the
8179 mode used for the corresponding addressing is half of that. */
8180 if (type == ADDR_QUERY_LDP_STP_N
8181 && known_eq (GET_MODE_SIZE (mode), 16))
8182 mode = DFmode;
8183
8184 bool allow_reg_index_p = (!load_store_pair_p
8185 && (known_lt (GET_MODE_SIZE (mode), 16)
8186 || vec_flags == VEC_ADVSIMD
8187 || vec_flags & VEC_SVE_DATA));
8188
8189 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8190 [Rn, #offset, MUL VL]. */
8191 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8192 && (code != REG && code != PLUS))
8193 return false;
8194
8195 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8196 REG addressing. */
8197 if (advsimd_struct_p
8198 && !BYTES_BIG_ENDIAN
8199 && (code != POST_INC && code != REG))
8200 return false;
8201
8202 gcc_checking_assert (GET_MODE (x) == VOIDmode
8203 || SCALAR_INT_MODE_P (GET_MODE (x)));
8204
8205 switch (code)
8206 {
8207 case REG:
8208 case SUBREG:
8209 info->type = ADDRESS_REG_IMM;
8210 info->base = x;
8211 info->offset = const0_rtx;
8212 info->const_offset = 0;
8213 return aarch64_base_register_rtx_p (x, strict_p);
8214
8215 case PLUS:
8216 op0 = XEXP (x, 0);
8217 op1 = XEXP (x, 1);
8218
8219 if (! strict_p
8220 && REG_P (op0)
8221 && virt_or_elim_regno_p (REGNO (op0))
8222 && poly_int_rtx_p (op1, &offset))
8223 {
8224 info->type = ADDRESS_REG_IMM;
8225 info->base = op0;
8226 info->offset = op1;
8227 info->const_offset = offset;
8228
8229 return true;
8230 }
8231
8232 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8233 && aarch64_base_register_rtx_p (op0, strict_p)
8234 && poly_int_rtx_p (op1, &offset))
8235 {
8236 info->type = ADDRESS_REG_IMM;
8237 info->base = op0;
8238 info->offset = op1;
8239 info->const_offset = offset;
8240
8241 /* TImode and TFmode values are allowed in both pairs of X
8242 registers and individual Q registers. The available
8243 address modes are:
8244 X,X: 7-bit signed scaled offset
8245 Q: 9-bit signed offset
8246 We conservatively require an offset representable in either mode.
8247 When performing the check for pairs of X registers i.e. LDP/STP
8248 pass down DImode since that is the natural size of the LDP/STP
8249 instruction memory accesses. */
8250 if (mode == TImode || mode == TFmode)
8251 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8252 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8253 || offset_12bit_unsigned_scaled_p (mode, offset)));
8254
8255 /* A 7bit offset check because OImode will emit a ldp/stp
8256 instruction (only big endian will get here).
8257 For ldp/stp instructions, the offset is scaled for the size of a
8258 single element of the pair. */
8259 if (mode == OImode)
8260 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8261
8262 /* Three 9/12 bit offsets checks because CImode will emit three
8263 ldr/str instructions (only big endian will get here). */
8264 if (mode == CImode)
8265 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8266 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8267 offset + 32)
8268 || offset_12bit_unsigned_scaled_p (V16QImode,
8269 offset + 32)));
8270
8271 /* Two 7bit offsets checks because XImode will emit two ldp/stp
8272 instructions (only big endian will get here). */
8273 if (mode == XImode)
8274 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8275 && aarch64_offset_7bit_signed_scaled_p (TImode,
8276 offset + 32));
8277
8278 /* Make "m" use the LD1 offset range for SVE data modes, so
8279 that pre-RTL optimizers like ivopts will work to that
8280 instead of the wider LDR/STR range. */
8281 if (vec_flags == VEC_SVE_DATA)
8282 return (type == ADDR_QUERY_M
8283 ? offset_4bit_signed_scaled_p (mode, offset)
8284 : offset_9bit_signed_scaled_p (mode, offset));
8285
8286 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8287 {
8288 poly_int64 end_offset = (offset
8289 + GET_MODE_SIZE (mode)
8290 - BYTES_PER_SVE_VECTOR);
8291 return (type == ADDR_QUERY_M
8292 ? offset_4bit_signed_scaled_p (mode, offset)
8293 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8294 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8295 end_offset)));
8296 }
8297
8298 if (vec_flags == VEC_SVE_PRED)
8299 return offset_9bit_signed_scaled_p (mode, offset);
8300
8301 if (load_store_pair_p)
8302 return ((known_eq (GET_MODE_SIZE (mode), 4)
8303 || known_eq (GET_MODE_SIZE (mode), 8)
8304 || known_eq (GET_MODE_SIZE (mode), 16))
8305 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8306 else
8307 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8308 || offset_12bit_unsigned_scaled_p (mode, offset));
8309 }
8310
8311 if (allow_reg_index_p)
8312 {
8313 /* Look for base + (scaled/extended) index register. */
8314 if (aarch64_base_register_rtx_p (op0, strict_p)
8315 && aarch64_classify_index (info, op1, mode, strict_p))
8316 {
8317 info->base = op0;
8318 return true;
8319 }
8320 if (aarch64_base_register_rtx_p (op1, strict_p)
8321 && aarch64_classify_index (info, op0, mode, strict_p))
8322 {
8323 info->base = op1;
8324 return true;
8325 }
8326 }
8327
8328 return false;
8329
8330 case POST_INC:
8331 case POST_DEC:
8332 case PRE_INC:
8333 case PRE_DEC:
8334 info->type = ADDRESS_REG_WB;
8335 info->base = XEXP (x, 0);
8336 info->offset = NULL_RTX;
8337 return aarch64_base_register_rtx_p (info->base, strict_p);
8338
8339 case POST_MODIFY:
8340 case PRE_MODIFY:
8341 info->type = ADDRESS_REG_WB;
8342 info->base = XEXP (x, 0);
8343 if (GET_CODE (XEXP (x, 1)) == PLUS
8344 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8345 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8346 && aarch64_base_register_rtx_p (info->base, strict_p))
8347 {
8348 info->offset = XEXP (XEXP (x, 1), 1);
8349 info->const_offset = offset;
8350
8351 /* TImode and TFmode values are allowed in both pairs of X
8352 registers and individual Q registers. The available
8353 address modes are:
8354 X,X: 7-bit signed scaled offset
8355 Q: 9-bit signed offset
8356 We conservatively require an offset representable in either mode.
8357 */
8358 if (mode == TImode || mode == TFmode)
8359 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8360 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8361
8362 if (load_store_pair_p)
8363 return ((known_eq (GET_MODE_SIZE (mode), 4)
8364 || known_eq (GET_MODE_SIZE (mode), 8)
8365 || known_eq (GET_MODE_SIZE (mode), 16))
8366 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8367 else
8368 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8369 }
8370 return false;
8371
8372 case CONST:
8373 case SYMBOL_REF:
8374 case LABEL_REF:
8375 /* load literal: pc-relative constant pool entry. Only supported
8376 for SI mode or larger. */
8377 info->type = ADDRESS_SYMBOLIC;
8378
8379 if (!load_store_pair_p
8380 && GET_MODE_SIZE (mode).is_constant (&const_size)
8381 && const_size >= 4)
8382 {
8383 rtx sym, addend;
8384
8385 split_const (x, &sym, &addend);
8386 return ((GET_CODE (sym) == LABEL_REF
8387 || (GET_CODE (sym) == SYMBOL_REF
8388 && CONSTANT_POOL_ADDRESS_P (sym)
8389 && aarch64_pcrelative_literal_loads)));
8390 }
8391 return false;
8392
8393 case LO_SUM:
8394 info->type = ADDRESS_LO_SUM;
8395 info->base = XEXP (x, 0);
8396 info->offset = XEXP (x, 1);
8397 if (allow_reg_index_p
8398 && aarch64_base_register_rtx_p (info->base, strict_p))
8399 {
8400 rtx sym, offs;
8401 split_const (info->offset, &sym, &offs);
8402 if (GET_CODE (sym) == SYMBOL_REF
8403 && (aarch64_classify_symbol (sym, INTVAL (offs))
8404 == SYMBOL_SMALL_ABSOLUTE))
8405 {
8406 /* The symbol and offset must be aligned to the access size. */
8407 unsigned int align;
8408
8409 if (CONSTANT_POOL_ADDRESS_P (sym))
8410 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8411 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8412 {
8413 tree exp = SYMBOL_REF_DECL (sym);
8414 align = TYPE_ALIGN (TREE_TYPE (exp));
8415 align = aarch64_constant_alignment (exp, align);
8416 }
8417 else if (SYMBOL_REF_DECL (sym))
8418 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8419 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8420 && SYMBOL_REF_BLOCK (sym) != NULL)
8421 align = SYMBOL_REF_BLOCK (sym)->alignment;
8422 else
8423 align = BITS_PER_UNIT;
8424
8425 poly_int64 ref_size = GET_MODE_SIZE (mode);
8426 if (known_eq (ref_size, 0))
8427 ref_size = GET_MODE_SIZE (DImode);
8428
8429 return (multiple_p (INTVAL (offs), ref_size)
8430 && multiple_p (align / BITS_PER_UNIT, ref_size));
8431 }
8432 }
8433 return false;
8434
8435 default:
8436 return false;
8437 }
8438 }
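
/* Illustrative sketch, not part of the port: the TImode/TFmode
   writeback-offset rule described above, restated in plain integer
   arithmetic.  The helper name and the standalone form are assumptions
   made for this example; the real checks are
   aarch64_offset_7bit_signed_scaled_p and
   aarch64_offset_9bit_signed_unscaled_p.  */
static int
example_ti_tf_wb_offset_ok (long long offset)
{
  /* 7-bit signed scaled (pair of X registers): the offset must be a
     multiple of 16 and the scaled value must lie in [-64, 63].  */
  int scaled_ok = (offset % 16 == 0
                   && offset / 16 >= -64
                   && offset / 16 <= 63);

  /* 9-bit signed unscaled (single Q register): [-256, 255].  */
  int unscaled_ok = (offset >= -256 && offset <= 255);

  /* The code above conservatively requires both forms to be valid.  */
  return scaled_ok && unscaled_ok;
}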
8439
8440 /* Return true if the address X is valid for a PRFM instruction.
8441 STRICT_P is true if we should do strict checking with
8442 aarch64_classify_address. */
8443
8444 bool
8445 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8446 {
8447 struct aarch64_address_info addr;
8448
8449 /* PRFM accepts the same addresses as DImode... */
8450 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8451 if (!res)
8452 return false;
8453
8454 /* ... except writeback forms. */
8455 return addr.type != ADDRESS_REG_WB;
8456 }
8457
8458 bool
8459 aarch64_symbolic_address_p (rtx x)
8460 {
8461 rtx offset;
8462
8463 split_const (x, &x, &offset);
8464 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8465 }
8466
8467 /* Classify the base of symbolic expression X. */
8468
8469 enum aarch64_symbol_type
8470 aarch64_classify_symbolic_expression (rtx x)
8471 {
8472 rtx offset;
8473
8474 split_const (x, &x, &offset);
8475 return aarch64_classify_symbol (x, INTVAL (offset));
8476 }
8477
8478
8479 /* Return TRUE if X is a legitimate address for accessing memory in
8480 mode MODE. */
8481 static bool
8482 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8483 {
8484 struct aarch64_address_info addr;
8485
8486 return aarch64_classify_address (&addr, x, mode, strict_p);
8487 }
8488
8489 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8490 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8491 bool
8492 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8493 aarch64_addr_query_type type)
8494 {
8495 struct aarch64_address_info addr;
8496
8497 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8498 }
8499
8500 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8501
8502 static bool
8503 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8504 poly_int64 orig_offset,
8505 machine_mode mode)
8506 {
8507 HOST_WIDE_INT size;
8508 if (GET_MODE_SIZE (mode).is_constant (&size))
8509 {
8510 HOST_WIDE_INT const_offset, second_offset;
8511
8512 /* A general SVE offset is A * VQ + B. Remove the A component from
8513 coefficient 0 in order to get the constant B. */
8514 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8515
8516 /* Split an out-of-range address displacement into a base and
8517 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8518 range otherwise to increase opportunities for sharing the base
8519 address of different sizes. Unaligned accesses use the signed
8520 9-bit range, TImode/TFmode use the intersection of signed
8521 scaled 7-bit and signed 9-bit offset. */
8522 if (mode == TImode || mode == TFmode)
8523 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8524 else if ((const_offset & (size - 1)) != 0)
8525 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8526 else
8527 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8528
8529 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8530 return false;
8531
8532 /* Split the offset into second_offset and the rest. */
8533 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8534 *offset2 = gen_int_mode (second_offset, Pmode);
8535 return true;
8536 }
8537 else
8538 {
8539 /* Get the mode we should use as the basis of the range. For structure
8540 modes this is the mode of one vector. */
8541 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8542 machine_mode step_mode
8543 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8544
8545 /* Get the "mul vl" multiplier we'd like to use. */
8546 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8547 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8548 if (vec_flags & VEC_SVE_DATA)
8549 /* LDR supports a 9-bit range, but the move patterns for
8550 structure modes require all vectors to be in range of the
8551 same base. The simplest way of accommodating that while still
8552 promoting reuse of anchor points between different modes is
8553 to use an 8-bit range unconditionally. */
8554 vnum = ((vnum + 128) & 255) - 128;
8555 else
8556 /* Predicates are only handled singly, so we might as well use
8557 the full range. */
8558 vnum = ((vnum + 256) & 511) - 256;
8559 if (vnum == 0)
8560 return false;
8561
8562 /* Convert the "mul vl" multiplier into a byte offset. */
8563 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8564 if (known_eq (second_offset, orig_offset))
8565 return false;
8566
8567 /* Split the offset into second_offset and the rest. */
8568 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8569 *offset2 = gen_int_mode (second_offset, Pmode);
8570 return true;
8571 }
8572 }
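
/* Illustrative sketch, not part of the port: the bias-and-mask trick
   used above to split an out-of-range constant displacement.  Adding
   0x100, masking to 9 bits and subtracting 0x100 again yields the value
   in [-256, 255] that is congruent to the original offset modulo 512;
   the remainder becomes the shared anchor.  The helper name is made up
   for this example.  */
static void
example_split_unaligned_offset (long long orig_offset,
                                long long *anchor, long long *low)
{
  long long second = ((orig_offset + 0x100) & 0x1ff) - 0x100;
  *anchor = orig_offset - second;   /* Added to the base register.  */
  *low = second;                    /* Kept in the addressing mode.  */
}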
8573
8574 /* Return the binary representation of floating point constant VALUE in INTVAL.
8575 If the value cannot be converted, return false without setting INTVAL.
8576 The conversion is done in the mode of VALUE. */
8577 bool
8578 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8579 {
8580
8581 /* We make a general exception for 0. */
8582 if (aarch64_float_const_zero_rtx_p (value))
8583 {
8584 *intval = 0;
8585 return true;
8586 }
8587
8588 scalar_float_mode mode;
8589 if (GET_CODE (value) != CONST_DOUBLE
8590 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8591 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8592 /* Only support up to DF mode. */
8593 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8594 return false;
8595
8596 unsigned HOST_WIDE_INT ival = 0;
8597
8598 long res[2];
8599 real_to_target (res,
8600 CONST_DOUBLE_REAL_VALUE (value),
8601 REAL_MODE_FORMAT (mode));
8602
8603 if (mode == DFmode)
8604 {
8605 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8606 ival = zext_hwi (res[order], 32);
8607 ival |= (zext_hwi (res[1 - order], 32) << 32);
8608 }
8609 else
8610 ival = zext_hwi (res[0], 32);
8611
8612 *intval = ival;
8613 return true;
8614 }
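
/* Illustrative sketch, not part of the port: the DFmode bit pattern the
   function above assembles with real_to_target, obtained here by type
   punning a host double.  The helper is hypothetical and assumes the
   host uses the IEEE binary64 format, e.g. 1.0 yields
   0x3ff0000000000000.  */
static unsigned long long
example_double_bits (double d)
{
  unsigned long long bits;
  __builtin_memcpy (&bits, &d, sizeof bits);
  return bits;
}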
8615
8616 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8617 single MOV(+MOVK) followed by an FMOV. */
8618 bool
8619 aarch64_float_const_rtx_p (rtx x)
8620 {
8621 machine_mode mode = GET_MODE (x);
8622 if (mode == VOIDmode)
8623 return false;
8624
8625 /* Determine whether it's cheaper to write float constants as
8626 mov/movk pairs rather than as ldr/adrp pairs. */
8627 unsigned HOST_WIDE_INT ival;
8628
8629 if (GET_CODE (x) == CONST_DOUBLE
8630 && SCALAR_FLOAT_MODE_P (mode)
8631 && aarch64_reinterpret_float_as_int (x, &ival))
8632 {
8633 scalar_int_mode imode = (mode == HFmode
8634 ? SImode
8635 : int_mode_for_mode (mode).require ());
8636 int num_instr = aarch64_internal_mov_immediate
8637 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8638 return num_instr < 3;
8639 }
8640
8641 return false;
8642 }
8643
8644 /* Return TRUE if rtx X is the immediate constant 0.0. */
8645 bool
8646 aarch64_float_const_zero_rtx_p (rtx x)
8647 {
8648 if (GET_MODE (x) == VOIDmode)
8649 return false;
8650
8651 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8652 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8653 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8654 }
8655
8656 /* Return TRUE if rtx X is an immediate constant that fits in a single
8657 MOVI immediate operation. */
8658 bool
8659 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8660 {
8661 if (!TARGET_SIMD)
8662 return false;
8663
8664 machine_mode vmode;
8665 scalar_int_mode imode;
8666 unsigned HOST_WIDE_INT ival;
8667
8668 if (GET_CODE (x) == CONST_DOUBLE
8669 && SCALAR_FLOAT_MODE_P (mode))
8670 {
8671 if (!aarch64_reinterpret_float_as_int (x, &ival))
8672 return false;
8673
8674 /* We make a general exception for 0. */
8675 if (aarch64_float_const_zero_rtx_p (x))
8676 return true;
8677
8678 imode = int_mode_for_mode (mode).require ();
8679 }
8680 else if (GET_CODE (x) == CONST_INT
8681 && is_a <scalar_int_mode> (mode, &imode))
8682 ival = INTVAL (x);
8683 else
8684 return false;
8685
8686 /* Use a 64-bit mode for everything except DI/DF mode, where we use
8687 a 128-bit vector mode. */
8688 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8689
8690 vmode = aarch64_simd_container_mode (imode, width);
8691 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8692
8693 return aarch64_simd_valid_immediate (v_op, NULL);
8694 }
8695
8696
8697 /* Return the fixed registers used for condition codes. */
8698
8699 static bool
8700 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8701 {
8702 *p1 = CC_REGNUM;
8703 *p2 = INVALID_REGNUM;
8704 return true;
8705 }
8706
8707 /* This function is used by the call expanders of the machine description.
8708 RESULT is the register in which the result is returned. It's NULL for
8709 "call" and "sibcall".
8710 MEM is the location of the function call.
8711 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8712 SIBCALL indicates whether this function call is a normal call or a sibling call.
8713 A different pattern is generated accordingly. */
8714
8715 void
8716 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8717 {
8718 rtx call, callee, tmp;
8719 rtvec vec;
8720 machine_mode mode;
8721
8722 gcc_assert (MEM_P (mem));
8723 callee = XEXP (mem, 0);
8724 mode = GET_MODE (callee);
8725 gcc_assert (mode == Pmode);
8726
8727 /* Decide if we should generate indirect calls by loading the
8728 address of the callee into a register before performing
8729 the branch-and-link. */
8730 if (SYMBOL_REF_P (callee)
8731 ? (aarch64_is_long_call_p (callee)
8732 || aarch64_is_noplt_call_p (callee))
8733 : !REG_P (callee))
8734 XEXP (mem, 0) = force_reg (mode, callee);
8735
8736 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8737
8738 if (result != NULL_RTX)
8739 call = gen_rtx_SET (result, call);
8740
8741 if (sibcall)
8742 tmp = ret_rtx;
8743 else
8744 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8745
8746 gcc_assert (CONST_INT_P (callee_abi));
8747 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8748 UNSPEC_CALLEE_ABI);
8749
8750 vec = gen_rtvec (3, call, callee_abi, tmp);
8751 call = gen_rtx_PARALLEL (VOIDmode, vec);
8752
8753 aarch64_emit_call_insn (call);
8754 }
8755
8756 /* Emit call insn with PAT and do aarch64-specific handling. */
8757
8758 void
8759 aarch64_emit_call_insn (rtx pat)
8760 {
8761 rtx insn = emit_call_insn (pat);
8762
8763 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8764 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8765 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8766 }
8767
8768 machine_mode
8769 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8770 {
8771 machine_mode mode_x = GET_MODE (x);
8772 rtx_code code_x = GET_CODE (x);
8773
8774 /* All floating point compares return CCFP if it is an equality
8775 comparison, and CCFPE otherwise. */
8776 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8777 {
8778 switch (code)
8779 {
8780 case EQ:
8781 case NE:
8782 case UNORDERED:
8783 case ORDERED:
8784 case UNLT:
8785 case UNLE:
8786 case UNGT:
8787 case UNGE:
8788 case UNEQ:
8789 return CCFPmode;
8790
8791 case LT:
8792 case LE:
8793 case GT:
8794 case GE:
8795 case LTGT:
8796 return CCFPEmode;
8797
8798 default:
8799 gcc_unreachable ();
8800 }
8801 }
8802
8803 /* Equality comparisons of short modes against zero can be performed
8804 using the TST instruction with the appropriate bitmask. */
8805 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8806 && (code == EQ || code == NE)
8807 && (mode_x == HImode || mode_x == QImode))
8808 return CC_NZmode;
8809
8810 /* Similarly, comparisons of zero_extends from shorter modes can
8811 be performed using an ANDS with an immediate mask. */
8812 if (y == const0_rtx && code_x == ZERO_EXTEND
8813 && (mode_x == SImode || mode_x == DImode)
8814 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8815 && (code == EQ || code == NE))
8816 return CC_NZmode;
8817
8818 if ((mode_x == SImode || mode_x == DImode)
8819 && y == const0_rtx
8820 && (code == EQ || code == NE || code == LT || code == GE)
8821 && (code_x == PLUS || code_x == MINUS || code_x == AND
8822 || code_x == NEG
8823 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8824 && CONST_INT_P (XEXP (x, 2)))))
8825 return CC_NZmode;
8826
8827 /* A compare with a shifted operand. Because of canonicalization,
8828 the comparison will have to be swapped when we emit the assembly
8829 code. */
8830 if ((mode_x == SImode || mode_x == DImode)
8831 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8832 && (code_x == ASHIFT || code_x == ASHIFTRT
8833 || code_x == LSHIFTRT
8834 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8835 return CC_SWPmode;
8836
8837 /* Similarly for a negated operand, but we can only do this for
8838 equalities. */
8839 if ((mode_x == SImode || mode_x == DImode)
8840 && (REG_P (y) || GET_CODE (y) == SUBREG)
8841 && (code == EQ || code == NE)
8842 && code_x == NEG)
8843 return CC_Zmode;
8844
8845 /* A test for unsigned overflow from an addition. */
8846 if ((mode_x == DImode || mode_x == TImode)
8847 && (code == LTU || code == GEU)
8848 && code_x == PLUS
8849 && rtx_equal_p (XEXP (x, 0), y))
8850 return CC_Cmode;
8851
8852 /* A test for unsigned overflow from an add with carry. */
8853 if ((mode_x == DImode || mode_x == TImode)
8854 && (code == LTU || code == GEU)
8855 && code_x == PLUS
8856 && CONST_SCALAR_INT_P (y)
8857 && (rtx_mode_t (y, mode_x)
8858 == (wi::shwi (1, mode_x)
8859 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8860 return CC_ADCmode;
8861
8862 /* A test for signed overflow. */
8863 if ((mode_x == DImode || mode_x == TImode)
8864 && code == NE
8865 && code_x == PLUS
8866 && GET_CODE (y) == SIGN_EXTEND)
8867 return CC_Vmode;
8868
8869 /* For everything else, return CCmode. */
8870 return CCmode;
8871 }
8872
8873 static int
8874 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8875
8876 int
8877 aarch64_get_condition_code (rtx x)
8878 {
8879 machine_mode mode = GET_MODE (XEXP (x, 0));
8880 enum rtx_code comp_code = GET_CODE (x);
8881
8882 if (GET_MODE_CLASS (mode) != MODE_CC)
8883 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8884 return aarch64_get_condition_code_1 (mode, comp_code);
8885 }
8886
8887 static int
8888 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8889 {
8890 switch (mode)
8891 {
8892 case E_CCFPmode:
8893 case E_CCFPEmode:
8894 switch (comp_code)
8895 {
8896 case GE: return AARCH64_GE;
8897 case GT: return AARCH64_GT;
8898 case LE: return AARCH64_LS;
8899 case LT: return AARCH64_MI;
8900 case NE: return AARCH64_NE;
8901 case EQ: return AARCH64_EQ;
8902 case ORDERED: return AARCH64_VC;
8903 case UNORDERED: return AARCH64_VS;
8904 case UNLT: return AARCH64_LT;
8905 case UNLE: return AARCH64_LE;
8906 case UNGT: return AARCH64_HI;
8907 case UNGE: return AARCH64_PL;
8908 default: return -1;
8909 }
8910 break;
8911
8912 case E_CCmode:
8913 switch (comp_code)
8914 {
8915 case NE: return AARCH64_NE;
8916 case EQ: return AARCH64_EQ;
8917 case GE: return AARCH64_GE;
8918 case GT: return AARCH64_GT;
8919 case LE: return AARCH64_LE;
8920 case LT: return AARCH64_LT;
8921 case GEU: return AARCH64_CS;
8922 case GTU: return AARCH64_HI;
8923 case LEU: return AARCH64_LS;
8924 case LTU: return AARCH64_CC;
8925 default: return -1;
8926 }
8927 break;
8928
8929 case E_CC_SWPmode:
8930 switch (comp_code)
8931 {
8932 case NE: return AARCH64_NE;
8933 case EQ: return AARCH64_EQ;
8934 case GE: return AARCH64_LE;
8935 case GT: return AARCH64_LT;
8936 case LE: return AARCH64_GE;
8937 case LT: return AARCH64_GT;
8938 case GEU: return AARCH64_LS;
8939 case GTU: return AARCH64_CC;
8940 case LEU: return AARCH64_CS;
8941 case LTU: return AARCH64_HI;
8942 default: return -1;
8943 }
8944 break;
8945
8946 case E_CC_NZCmode:
8947 switch (comp_code)
8948 {
8949 case NE: return AARCH64_NE; /* = any */
8950 case EQ: return AARCH64_EQ; /* = none */
8951 case GE: return AARCH64_PL; /* = nfrst */
8952 case LT: return AARCH64_MI; /* = first */
8953 case GEU: return AARCH64_CS; /* = nlast */
8954 case GTU: return AARCH64_HI; /* = pmore */
8955 case LEU: return AARCH64_LS; /* = plast */
8956 case LTU: return AARCH64_CC; /* = last */
8957 default: return -1;
8958 }
8959 break;
8960
8961 case E_CC_NZmode:
8962 switch (comp_code)
8963 {
8964 case NE: return AARCH64_NE;
8965 case EQ: return AARCH64_EQ;
8966 case GE: return AARCH64_PL;
8967 case LT: return AARCH64_MI;
8968 default: return -1;
8969 }
8970 break;
8971
8972 case E_CC_Zmode:
8973 switch (comp_code)
8974 {
8975 case NE: return AARCH64_NE;
8976 case EQ: return AARCH64_EQ;
8977 default: return -1;
8978 }
8979 break;
8980
8981 case E_CC_Cmode:
8982 switch (comp_code)
8983 {
8984 case LTU: return AARCH64_CS;
8985 case GEU: return AARCH64_CC;
8986 default: return -1;
8987 }
8988 break;
8989
8990 case E_CC_ADCmode:
8991 switch (comp_code)
8992 {
8993 case GEU: return AARCH64_CS;
8994 case LTU: return AARCH64_CC;
8995 default: return -1;
8996 }
8997 break;
8998
8999 case E_CC_Vmode:
9000 switch (comp_code)
9001 {
9002 case NE: return AARCH64_VS;
9003 case EQ: return AARCH64_VC;
9004 default: return -1;
9005 }
9006 break;
9007
9008 default:
9009 return -1;
9010 }
9011
9012 return -1;
9013 }
9014
9015 bool
9016 aarch64_const_vec_all_same_in_range_p (rtx x,
9017 HOST_WIDE_INT minval,
9018 HOST_WIDE_INT maxval)
9019 {
9020 rtx elt;
9021 return (const_vec_duplicate_p (x, &elt)
9022 && CONST_INT_P (elt)
9023 && IN_RANGE (INTVAL (elt), minval, maxval));
9024 }
9025
9026 bool
9027 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9028 {
9029 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9030 }
9031
9032 /* Return true if VEC is a constant in which every element is in the range
9033 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9034
9035 static bool
9036 aarch64_const_vec_all_in_range_p (rtx vec,
9037 HOST_WIDE_INT minval,
9038 HOST_WIDE_INT maxval)
9039 {
9040 if (GET_CODE (vec) != CONST_VECTOR
9041 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9042 return false;
9043
9044 int nunits;
9045 if (!CONST_VECTOR_STEPPED_P (vec))
9046 nunits = const_vector_encoded_nelts (vec);
9047 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9048 return false;
9049
9050 for (int i = 0; i < nunits; i++)
9051 {
9052 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9053 if (!CONST_INT_P (vec_elem)
9054 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9055 return false;
9056 }
9057 return true;
9058 }
9059
9060 /* N Z C V. */
9061 #define AARCH64_CC_V 1
9062 #define AARCH64_CC_C (1 << 1)
9063 #define AARCH64_CC_Z (1 << 2)
9064 #define AARCH64_CC_N (1 << 3)
9065
9066 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9067 static const int aarch64_nzcv_codes[] =
9068 {
9069 0, /* EQ, Z == 1. */
9070 AARCH64_CC_Z, /* NE, Z == 0. */
9071 0, /* CS, C == 1. */
9072 AARCH64_CC_C, /* CC, C == 0. */
9073 0, /* MI, N == 1. */
9074 AARCH64_CC_N, /* PL, N == 0. */
9075 0, /* VS, V == 1. */
9076 AARCH64_CC_V, /* VC, V == 0. */
9077 0, /* HI, C == 1 && Z == 0. */
9078 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9079 AARCH64_CC_V, /* GE, N == V. */
9080 0, /* LT, N != V. */
9081 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9082 0, /* LE, !(Z == 0 && N == V). */
9083 0, /* AL, Any. */
9084 0 /* NV, Any. */
9085 };
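
/* Illustrative model, not part of the port: how the immediates above
   are consumed.  A CCMP sets the flags from its comparison when the
   governing condition holds and from the 4-bit NZCV immediate
   otherwise; each table entry is chosen so that the immediate makes the
   corresponding condition fail.  The helper below merely restates that
   selection and is an assumption made for illustration.  */
static unsigned int
example_ccmp_flags (int cond_holds, unsigned int compare_flags,
                    unsigned int nzcv_imm)
{
  return cond_holds ? compare_flags : nzcv_imm;
}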
9086
9087 /* Print floating-point vector immediate operand X to F, negating it
9088 first if NEGATE is true. Return true on success, false if it isn't
9089 a constant we can handle. */
9090
9091 static bool
9092 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9093 {
9094 rtx elt;
9095
9096 if (!const_vec_duplicate_p (x, &elt))
9097 return false;
9098
9099 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9100 if (negate)
9101 r = real_value_negate (&r);
9102
9103 /* Handle the SVE single-bit immediates specially, since they have a
9104 fixed form in the assembly syntax. */
9105 if (real_equal (&r, &dconst0))
9106 asm_fprintf (f, "0.0");
9107 else if (real_equal (&r, &dconst2))
9108 asm_fprintf (f, "2.0");
9109 else if (real_equal (&r, &dconst1))
9110 asm_fprintf (f, "1.0");
9111 else if (real_equal (&r, &dconsthalf))
9112 asm_fprintf (f, "0.5");
9113 else
9114 {
9115 const int buf_size = 20;
9116 char float_buf[buf_size] = {'\0'};
9117 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9118 1, GET_MODE (elt));
9119 asm_fprintf (f, "%s", float_buf);
9120 }
9121
9122 return true;
9123 }
9124
9125 /* Return the b/h/s/d suffix letter for an element of SIZE bits. */
9126 static char
9127 sizetochar (int size)
9128 {
9129 switch (size)
9130 {
9131 case 64: return 'd';
9132 case 32: return 's';
9133 case 16: return 'h';
9134 case 8 : return 'b';
9135 default: gcc_unreachable ();
9136 }
9137 }
9138
9139 /* Print operand X to file F in a target specific manner according to CODE.
9140 The acceptable formatting commands given by CODE are:
9141 'c': An integer or symbol address without a preceding #
9142 sign.
9143 'C': Take the duplicated element in a vector constant
9144 and print it in hex.
9145 'D': Take the duplicated element in a vector constant
9146 and print it as an unsigned integer, in decimal.
9147 'e': Print the sign/zero-extend size as a character 8->b,
9148 16->h, 32->w. Can also be used for masks:
9149 0xff->b, 0xffff->h, 0xffffffff->w.
9150 'I': If the operand is a duplicated vector constant,
9151 replace it with the duplicated scalar. If the
9152 operand is then a floating-point constant, replace
9153 it with the integer bit representation. Print the
9154 transformed constant as a signed decimal number.
9155 'p': Prints N such that 2^N == X (X must be power of 2 and
9156 const int).
9157 'P': Print the number of non-zero bits in X (a const_int).
9158 'H': Print the higher numbered register of a pair (TImode)
9159 of regs.
9160 'm': Print a condition (eq, ne, etc).
9161 'M': Same as 'm', but invert condition.
9162 'N': Take the duplicated element in a vector constant
9163 and print the negative of it in decimal.
9164 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9165 'S/T/U/V': Print a FP/SIMD register name for a register list.
9166 The register printed is the FP/SIMD register name
9167 of X + 0/1/2/3 for S/T/U/V.
9168 'R': Print a scalar Integer/FP/SIMD register name + 1.
9169 'X': Print bottom 16 bits of integer constant in hex.
9170 'w/x': Print a general register name or the zero register
9171 (32-bit or 64-bit).
9172 '0': Print a normal operand, if it's a general register,
9173 then we assume DImode.
9174 'k': Print NZCV for conditional compare instructions.
9175 'A': Output address constant representing the first
9176 argument of X, specifying a relocation offset
9177 if appropriate.
9178 'L': Output constant address specified by X
9179 with a relocation offset if appropriate.
9180 'G': Prints address of X, specifying a PC relative
9181 relocation mode if appropriate.
9182 'y': Output address of LDP or STP - this is used for
9183 some LDP/STPs which don't use a PARALLEL in their
9184 pattern (so the mode needs to be adjusted).
9185 'z': Output address of a typical LDP or STP. */
9186
9187 static void
9188 aarch64_print_operand (FILE *f, rtx x, int code)
9189 {
9190 rtx elt;
9191 switch (code)
9192 {
9193 case 'c':
9194 switch (GET_CODE (x))
9195 {
9196 case CONST_INT:
9197 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9198 break;
9199
9200 case SYMBOL_REF:
9201 output_addr_const (f, x);
9202 break;
9203
9204 case CONST:
9205 if (GET_CODE (XEXP (x, 0)) == PLUS
9206 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9207 {
9208 output_addr_const (f, x);
9209 break;
9210 }
9211 /* Fall through. */
9212
9213 default:
9214 output_operand_lossage ("unsupported operand for code '%c'", code);
9215 }
9216 break;
9217
9218 case 'e':
9219 {
9220 x = unwrap_const_vec_duplicate (x);
9221 if (!CONST_INT_P (x))
9222 {
9223 output_operand_lossage ("invalid operand for '%%%c'", code);
9224 return;
9225 }
9226
9227 HOST_WIDE_INT val = INTVAL (x);
9228 if ((val & ~7) == 8 || val == 0xff)
9229 fputc ('b', f);
9230 else if ((val & ~7) == 16 || val == 0xffff)
9231 fputc ('h', f);
9232 else if ((val & ~7) == 32 || val == 0xffffffff)
9233 fputc ('w', f);
9234 else
9235 {
9236 output_operand_lossage ("invalid operand for '%%%c'", code);
9237 return;
9238 }
9239 }
9240 break;
9241
9242 case 'p':
9243 {
9244 int n;
9245
9246 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9247 {
9248 output_operand_lossage ("invalid operand for '%%%c'", code);
9249 return;
9250 }
9251
9252 asm_fprintf (f, "%d", n);
9253 }
9254 break;
9255
9256 case 'P':
9257 if (!CONST_INT_P (x))
9258 {
9259 output_operand_lossage ("invalid operand for '%%%c'", code);
9260 return;
9261 }
9262
9263 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9264 break;
9265
9266 case 'H':
9267 if (x == const0_rtx)
9268 {
9269 asm_fprintf (f, "xzr");
9270 break;
9271 }
9272
9273 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9274 {
9275 output_operand_lossage ("invalid operand for '%%%c'", code);
9276 return;
9277 }
9278
9279 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9280 break;
9281
9282 case 'I':
9283 {
9284 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9285 if (CONST_INT_P (x))
9286 asm_fprintf (f, "%wd", INTVAL (x));
9287 else
9288 {
9289 output_operand_lossage ("invalid operand for '%%%c'", code);
9290 return;
9291 }
9292 break;
9293 }
9294
9295 case 'M':
9296 case 'm':
9297 {
9298 int cond_code;
9299 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9300 if (x == const_true_rtx)
9301 {
9302 if (code == 'M')
9303 fputs ("nv", f);
9304 return;
9305 }
9306
9307 if (!COMPARISON_P (x))
9308 {
9309 output_operand_lossage ("invalid operand for '%%%c'", code);
9310 return;
9311 }
9312
9313 cond_code = aarch64_get_condition_code (x);
9314 gcc_assert (cond_code >= 0);
9315 if (code == 'M')
9316 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9317 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9318 fputs (aarch64_sve_condition_codes[cond_code], f);
9319 else
9320 fputs (aarch64_condition_codes[cond_code], f);
9321 }
9322 break;
9323
9324 case 'N':
9325 if (!const_vec_duplicate_p (x, &elt))
9326 {
9327 output_operand_lossage ("invalid vector constant");
9328 return;
9329 }
9330
9331 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9332 asm_fprintf (f, "%wd", -INTVAL (elt));
9333 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9334 && aarch64_print_vector_float_operand (f, x, true))
9335 ;
9336 else
9337 {
9338 output_operand_lossage ("invalid vector constant");
9339 return;
9340 }
9341 break;
9342
9343 case 'b':
9344 case 'h':
9345 case 's':
9346 case 'd':
9347 case 'q':
9348 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9349 {
9350 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9351 return;
9352 }
9353 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9354 break;
9355
9356 case 'S':
9357 case 'T':
9358 case 'U':
9359 case 'V':
9360 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9361 {
9362 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9363 return;
9364 }
9365 asm_fprintf (f, "%c%d",
9366 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9367 REGNO (x) - V0_REGNUM + (code - 'S'));
9368 break;
9369
9370 case 'R':
9371 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9372 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9373 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9374 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9375 else
9376 output_operand_lossage ("incompatible register operand for '%%%c'",
9377 code);
9378 break;
9379
9380 case 'X':
9381 if (!CONST_INT_P (x))
9382 {
9383 output_operand_lossage ("invalid operand for '%%%c'", code);
9384 return;
9385 }
9386 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9387 break;
9388
9389 case 'C':
9390 {
9391 /* Print a replicated constant in hex. */
9392 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9393 {
9394 output_operand_lossage ("invalid operand for '%%%c'", code);
9395 return;
9396 }
9397 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9398 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9399 }
9400 break;
9401
9402 case 'D':
9403 {
9404 /* Print a replicated constant in decimal, treating it as
9405 unsigned. */
9406 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9407 {
9408 output_operand_lossage ("invalid operand for '%%%c'", code);
9409 return;
9410 }
9411 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9412 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9413 }
9414 break;
9415
9416 case 'w':
9417 case 'x':
9418 if (x == const0_rtx
9419 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9420 {
9421 asm_fprintf (f, "%czr", code);
9422 break;
9423 }
9424
9425 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9426 {
9427 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9428 break;
9429 }
9430
9431 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9432 {
9433 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9434 break;
9435 }
9436
9437 /* Fall through */
9438
9439 case 0:
9440 if (x == NULL)
9441 {
9442 output_operand_lossage ("missing operand");
9443 return;
9444 }
9445
9446 switch (GET_CODE (x))
9447 {
9448 case REG:
9449 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9450 {
9451 if (REG_NREGS (x) == 1)
9452 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9453 else
9454 {
9455 char suffix
9456 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9457 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9458 REGNO (x) - V0_REGNUM, suffix,
9459 END_REGNO (x) - V0_REGNUM - 1, suffix);
9460 }
9461 }
9462 else
9463 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9464 break;
9465
9466 case MEM:
9467 output_address (GET_MODE (x), XEXP (x, 0));
9468 break;
9469
9470 case LABEL_REF:
9471 case SYMBOL_REF:
9472 output_addr_const (asm_out_file, x);
9473 break;
9474
9475 case CONST_INT:
9476 asm_fprintf (f, "%wd", INTVAL (x));
9477 break;
9478
9479 case CONST:
9480 if (!VECTOR_MODE_P (GET_MODE (x)))
9481 {
9482 output_addr_const (asm_out_file, x);
9483 break;
9484 }
9485 /* fall through */
9486
9487 case CONST_VECTOR:
9488 if (!const_vec_duplicate_p (x, &elt))
9489 {
9490 output_operand_lossage ("invalid vector constant");
9491 return;
9492 }
9493
9494 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9495 asm_fprintf (f, "%wd", INTVAL (elt));
9496 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9497 && aarch64_print_vector_float_operand (f, x, false))
9498 ;
9499 else
9500 {
9501 output_operand_lossage ("invalid vector constant");
9502 return;
9503 }
9504 break;
9505
9506 case CONST_DOUBLE:
9507 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9508 be getting CONST_DOUBLEs holding integers. */
9509 gcc_assert (GET_MODE (x) != VOIDmode);
9510 if (aarch64_float_const_zero_rtx_p (x))
9511 {
9512 fputc ('0', f);
9513 break;
9514 }
9515 else if (aarch64_float_const_representable_p (x))
9516 {
9517 #define buf_size 20
9518 char float_buf[buf_size] = {'\0'};
9519 real_to_decimal_for_mode (float_buf,
9520 CONST_DOUBLE_REAL_VALUE (x),
9521 buf_size, buf_size,
9522 1, GET_MODE (x));
9523 asm_fprintf (asm_out_file, "%s", float_buf);
9524 break;
9525 #undef buf_size
9526 }
9527 output_operand_lossage ("invalid constant");
9528 return;
9529 default:
9530 output_operand_lossage ("invalid operand");
9531 return;
9532 }
9533 break;
9534
9535 case 'A':
9536 if (GET_CODE (x) == HIGH)
9537 x = XEXP (x, 0);
9538
9539 switch (aarch64_classify_symbolic_expression (x))
9540 {
9541 case SYMBOL_SMALL_GOT_4G:
9542 asm_fprintf (asm_out_file, ":got:");
9543 break;
9544
9545 case SYMBOL_SMALL_TLSGD:
9546 asm_fprintf (asm_out_file, ":tlsgd:");
9547 break;
9548
9549 case SYMBOL_SMALL_TLSDESC:
9550 asm_fprintf (asm_out_file, ":tlsdesc:");
9551 break;
9552
9553 case SYMBOL_SMALL_TLSIE:
9554 asm_fprintf (asm_out_file, ":gottprel:");
9555 break;
9556
9557 case SYMBOL_TLSLE24:
9558 asm_fprintf (asm_out_file, ":tprel:");
9559 break;
9560
9561 case SYMBOL_TINY_GOT:
9562 gcc_unreachable ();
9563 break;
9564
9565 default:
9566 break;
9567 }
9568 output_addr_const (asm_out_file, x);
9569 break;
9570
9571 case 'L':
9572 switch (aarch64_classify_symbolic_expression (x))
9573 {
9574 case SYMBOL_SMALL_GOT_4G:
9575 asm_fprintf (asm_out_file, ":lo12:");
9576 break;
9577
9578 case SYMBOL_SMALL_TLSGD:
9579 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9580 break;
9581
9582 case SYMBOL_SMALL_TLSDESC:
9583 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9584 break;
9585
9586 case SYMBOL_SMALL_TLSIE:
9587 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9588 break;
9589
9590 case SYMBOL_TLSLE12:
9591 asm_fprintf (asm_out_file, ":tprel_lo12:");
9592 break;
9593
9594 case SYMBOL_TLSLE24:
9595 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9596 break;
9597
9598 case SYMBOL_TINY_GOT:
9599 asm_fprintf (asm_out_file, ":got:");
9600 break;
9601
9602 case SYMBOL_TINY_TLSIE:
9603 asm_fprintf (asm_out_file, ":gottprel:");
9604 break;
9605
9606 default:
9607 break;
9608 }
9609 output_addr_const (asm_out_file, x);
9610 break;
9611
9612 case 'G':
9613 switch (aarch64_classify_symbolic_expression (x))
9614 {
9615 case SYMBOL_TLSLE24:
9616 asm_fprintf (asm_out_file, ":tprel_hi12:");
9617 break;
9618 default:
9619 break;
9620 }
9621 output_addr_const (asm_out_file, x);
9622 break;
9623
9624 case 'k':
9625 {
9626 HOST_WIDE_INT cond_code;
9627
9628 if (!CONST_INT_P (x))
9629 {
9630 output_operand_lossage ("invalid operand for '%%%c'", code);
9631 return;
9632 }
9633
9634 cond_code = INTVAL (x);
9635 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9636 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9637 }
9638 break;
9639
9640 case 'y':
9641 case 'z':
9642 {
9643 machine_mode mode = GET_MODE (x);
9644
9645 if (GET_CODE (x) != MEM
9646 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9647 {
9648 output_operand_lossage ("invalid operand for '%%%c'", code);
9649 return;
9650 }
9651
9652 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9653 code == 'y'
9654 ? ADDR_QUERY_LDP_STP_N
9655 : ADDR_QUERY_LDP_STP))
9656 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9657 }
9658 break;
9659
9660 default:
9661 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9662 return;
9663 }
9664 }
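
/* Illustrative sketch, not part of the port: the classification used by
   the 'e' operand code above, written as a standalone helper.  A value
   is accepted either as an extend size with its low three bits ignored
   (8..15 -> 'b', 16..23 -> 'h', 32..39 -> 'w') or as the matching
   zero-extension mask.  The helper name is made up for this example.  */
static char
example_extend_suffix (unsigned long long val)
{
  if ((val & ~7ULL) == 8 || val == 0xff)
    return 'b';
  if ((val & ~7ULL) == 16 || val == 0xffff)
    return 'h';
  if ((val & ~7ULL) == 32 || val == 0xffffffff)
    return 'w';
  return 0;   /* Not a recognized extend size or mask.  */
}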
9665
9666 /* Print address 'x' of a memory access with mode 'mode'.
9667 'type' is the aarch64_addr_query_type context required by
9668 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
9669 static bool
9670 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9671 aarch64_addr_query_type type)
9672 {
9673 struct aarch64_address_info addr;
9674 unsigned int size, vec_flags;
9675
9676 /* Check all addresses are Pmode - including ILP32. */
9677 if (GET_MODE (x) != Pmode
9678 && (!CONST_INT_P (x)
9679 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9680 {
9681 output_operand_lossage ("invalid address mode");
9682 return false;
9683 }
9684
9685 if (aarch64_classify_address (&addr, x, mode, true, type))
9686 switch (addr.type)
9687 {
9688 case ADDRESS_REG_IMM:
9689 if (known_eq (addr.const_offset, 0))
9690 {
9691 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9692 return true;
9693 }
9694
9695 vec_flags = aarch64_classify_vector_mode (mode);
9696 if (vec_flags & VEC_ANY_SVE)
9697 {
9698 HOST_WIDE_INT vnum
9699 = exact_div (addr.const_offset,
9700 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9701 asm_fprintf (f, "[%s, #%wd, mul vl]",
9702 reg_names[REGNO (addr.base)], vnum);
9703 return true;
9704 }
9705
9706 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9707 INTVAL (addr.offset));
9708 return true;
9709
9710 case ADDRESS_REG_REG:
9711 if (addr.shift == 0)
9712 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9713 reg_names [REGNO (addr.offset)]);
9714 else
9715 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9716 reg_names [REGNO (addr.offset)], addr.shift);
9717 return true;
9718
9719 case ADDRESS_REG_UXTW:
9720 if (addr.shift == 0)
9721 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9722 REGNO (addr.offset) - R0_REGNUM);
9723 else
9724 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9725 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9726 return true;
9727
9728 case ADDRESS_REG_SXTW:
9729 if (addr.shift == 0)
9730 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9731 REGNO (addr.offset) - R0_REGNUM);
9732 else
9733 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9734 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9735 return true;
9736
9737 case ADDRESS_REG_WB:
9738 /* Writeback is only supported for fixed-width modes. */
9739 size = GET_MODE_SIZE (mode).to_constant ();
9740 switch (GET_CODE (x))
9741 {
9742 case PRE_INC:
9743 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9744 return true;
9745 case POST_INC:
9746 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9747 return true;
9748 case PRE_DEC:
9749 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9750 return true;
9751 case POST_DEC:
9752 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9753 return true;
9754 case PRE_MODIFY:
9755 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9756 INTVAL (addr.offset));
9757 return true;
9758 case POST_MODIFY:
9759 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9760 INTVAL (addr.offset));
9761 return true;
9762 default:
9763 break;
9764 }
9765 break;
9766
9767 case ADDRESS_LO_SUM:
9768 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9769 output_addr_const (f, addr.offset);
9770 asm_fprintf (f, "]");
9771 return true;
9772
9773 case ADDRESS_SYMBOLIC:
9774 output_addr_const (f, x);
9775 return true;
9776 }
9777
9778 return false;
9779 }
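
/* Illustrative sketch, not part of the port: the index printed by the
   SVE "mul vl" form above is the constant offset divided by the vector
   length in bytes; exact_div in the real code asserts that the division
   is exact.  The helper name is made up for this example.  */
static long long
example_mul_vl_index (long long const_offset, long long vl_bytes)
{
  /* e.g. offset 32 with a 16-byte vector length prints as #2, mul vl.  */
  return const_offset / vl_bytes;
}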
9780
9781 /* Print address 'x' of a memory access with mode 'mode'. */
9782 static void
9783 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9784 {
9785 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9786 output_addr_const (f, x);
9787 }
9788
9789 bool
9790 aarch64_label_mentioned_p (rtx x)
9791 {
9792 const char *fmt;
9793 int i;
9794
9795 if (GET_CODE (x) == LABEL_REF)
9796 return true;
9797
9798 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9799 referencing instruction, but they are constant offsets, not
9800 symbols. */
9801 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9802 return false;
9803
9804 fmt = GET_RTX_FORMAT (GET_CODE (x));
9805 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9806 {
9807 if (fmt[i] == 'E')
9808 {
9809 int j;
9810
9811 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9812 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9813 return 1;
9814 }
9815 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9816 return 1;
9817 }
9818
9819 return 0;
9820 }
9821
9822 /* Implement REGNO_REG_CLASS. */
9823
9824 enum reg_class
9825 aarch64_regno_regclass (unsigned regno)
9826 {
9827 if (GP_REGNUM_P (regno))
9828 return GENERAL_REGS;
9829
9830 if (regno == SP_REGNUM)
9831 return STACK_REG;
9832
9833 if (regno == FRAME_POINTER_REGNUM
9834 || regno == ARG_POINTER_REGNUM)
9835 return POINTER_REGS;
9836
9837 if (FP_REGNUM_P (regno))
9838 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9839 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9840
9841 if (PR_REGNUM_P (regno))
9842 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9843
9844 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
9845 return FFR_REGS;
9846
9847 return NO_REGS;
9848 }
9849
9850 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9851 If OFFSET is out of range, return an offset of an anchor point
9852 that is in range. Return 0 otherwise. */
9853
9854 static HOST_WIDE_INT
9855 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9856 machine_mode mode)
9857 {
9858 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9859 if (size > 16)
9860 return (offset + 0x400) & ~0x7f0;
9861
9862 /* For offsets that aren't a multiple of the access size, the limit is
9863 -256...255. */
9864 if (offset & (size - 1))
9865 {
9866 /* BLKmode typically uses LDP of X-registers. */
9867 if (mode == BLKmode)
9868 return (offset + 512) & ~0x3ff;
9869 return (offset + 0x100) & ~0x1ff;
9870 }
9871
9872 /* Small negative offsets are supported. */
9873 if (IN_RANGE (offset, -256, 0))
9874 return 0;
9875
9876 if (mode == TImode || mode == TFmode)
9877 return (offset + 0x100) & ~0x1ff;
9878
9879 /* Use a 12-bit offset scaled by the access size. */
9880 return offset & (~0xfff * size);
9881 }
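
/* Illustrative sketch, not part of the port: the rounding used above
   for offsets that are not a multiple of the access size.
   (offset + 0x100) & ~0x1ff picks the surrounding multiple of 512 as
   the anchor, so the residual displacement always fits the signed 9-bit
   range, e.g. 700 -> 512 (residual 188) and -300 -> -512 (residual
   212).  The helper name is made up for this example.  */
static long long
example_round_to_anchor (long long offset)
{
  return (offset + 0x100) & ~0x1ffLL;
}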
9882
9883 static rtx
9884 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9885 {
9886 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9887 where mask is selected by alignment and size of the offset.
9888 We try to pick as large a range for the offset as possible to
9889 maximize the chance of a CSE. However, for aligned addresses
9890 we limit the range to 4k so that structures with different sized
9891 elements are likely to use the same base. We need to be careful
9892 not to split a CONST for some forms of address expression, otherwise
9893 it will generate sub-optimal code. */
9894
9895 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9896 {
9897 rtx base = XEXP (x, 0);
9898 rtx offset_rtx = XEXP (x, 1);
9899 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9900
9901 if (GET_CODE (base) == PLUS)
9902 {
9903 rtx op0 = XEXP (base, 0);
9904 rtx op1 = XEXP (base, 1);
9905
9906 /* Force any scaling into a temp for CSE. */
9907 op0 = force_reg (Pmode, op0);
9908 op1 = force_reg (Pmode, op1);
9909
9910 /* Let the pointer register be in op0. */
9911 if (REG_POINTER (op1))
9912 std::swap (op0, op1);
9913
9914 /* If the pointer is virtual or frame related, then we know that
9915 virtual register instantiation or register elimination is going
9916 to apply a second constant. We want the two constants folded
9917 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9918 if (virt_or_elim_regno_p (REGNO (op0)))
9919 {
9920 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9921 NULL_RTX, true, OPTAB_DIRECT);
9922 return gen_rtx_PLUS (Pmode, base, op1);
9923 }
9924
9925 /* Otherwise, in order to encourage CSE (and thence loop strength
9926 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9927 base = expand_binop (Pmode, add_optab, op0, op1,
9928 NULL_RTX, true, OPTAB_DIRECT);
9929 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9930 }
9931
9932 HOST_WIDE_INT size;
9933 if (GET_MODE_SIZE (mode).is_constant (&size))
9934 {
9935 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9936 mode);
9937 if (base_offset != 0)
9938 {
9939 base = plus_constant (Pmode, base, base_offset);
9940 base = force_operand (base, NULL_RTX);
9941 return plus_constant (Pmode, base, offset - base_offset);
9942 }
9943 }
9944 }
9945
9946 return x;
9947 }
9948
9949 static reg_class_t
9950 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9951 reg_class_t rclass,
9952 machine_mode mode,
9953 secondary_reload_info *sri)
9954 {
9955 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9956 directly by the *aarch64_sve_mov<mode>_[lb]e move patterns. See the
9957 comment at the head of aarch64-sve.md for more details about the
9958 big-endian handling. */
9959 if (BYTES_BIG_ENDIAN
9960 && reg_class_subset_p (rclass, FP_REGS)
9961 && !((REG_P (x) && HARD_REGISTER_P (x))
9962 || aarch64_simd_valid_immediate (x, NULL))
9963 && mode != VNx16QImode
9964 && aarch64_sve_data_mode_p (mode))
9965 {
9966 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9967 return NO_REGS;
9968 }
9969
9970 /* If we have to disable direct literal pool loads and stores because the
9971 function is too big, then we need a scratch register. */
9972 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9973 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9974 || targetm.vector_mode_supported_p (GET_MODE (x)))
9975 && !aarch64_pcrelative_literal_loads)
9976 {
9977 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9978 return NO_REGS;
9979 }
9980
9981 /* Without the TARGET_SIMD instructions we cannot move a Q register
9982 to a Q register directly. We need a scratch. */
9983 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9984 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9985 && reg_class_subset_p (rclass, FP_REGS))
9986 {
9987 sri->icode = code_for_aarch64_reload_mov (mode);
9988 return NO_REGS;
9989 }
9990
9991 /* A TFmode or TImode memory access should be handled via FP_REGS
9992 because AArch64 has richer addressing modes for LDR/STR instructions
9993 than LDP/STP instructions. */
9994 if (TARGET_FLOAT && rclass == GENERAL_REGS
9995 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9996 return FP_REGS;
9997
9998 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9999 return GENERAL_REGS;
10000
10001 return NO_REGS;
10002 }
10003
10004 static bool
10005 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10006 {
10007 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10008
10009 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10010 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10011 if (frame_pointer_needed)
10012 return to == HARD_FRAME_POINTER_REGNUM;
10013 return true;
10014 }
10015
10016 poly_int64
10017 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10018 {
10019 if (to == HARD_FRAME_POINTER_REGNUM)
10020 {
10021 if (from == ARG_POINTER_REGNUM)
10022 return cfun->machine->frame.hard_fp_offset;
10023
10024 if (from == FRAME_POINTER_REGNUM)
10025 return cfun->machine->frame.hard_fp_offset
10026 - cfun->machine->frame.locals_offset;
10027 }
10028
10029 if (to == STACK_POINTER_REGNUM)
10030 {
10031 if (from == FRAME_POINTER_REGNUM)
10032 return cfun->machine->frame.frame_size
10033 - cfun->machine->frame.locals_offset;
10034 }
10035
10036 return cfun->machine->frame.frame_size;
10037 }
10038
10039 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10040 previous frame. */
10041
10042 rtx
10043 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10044 {
10045 if (count != 0)
10046 return const0_rtx;
10047 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10048 }
10049
10050
10051 static void
10052 aarch64_asm_trampoline_template (FILE *f)
10053 {
10054 int offset1 = 16;
10055 int offset2 = 20;
10056
10057 if (aarch64_bti_enabled ())
10058 {
10059 asm_fprintf (f, "\thint\t34 // bti c\n");
10060 offset1 -= 4;
10061 offset2 -= 4;
10062 }
10063
10064 if (TARGET_ILP32)
10065 {
10066 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10067 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10068 offset1);
10069 }
10070 else
10071 {
10072 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10073 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10074 offset2);
10075 }
10076 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10077
10078 /* The trampoline needs an extra padding instruction. If BTI is
10079 enabled, the padding instruction is replaced by the BTI instruction at
10080 the beginning. */
10081 if (!aarch64_bti_enabled ())
10082 assemble_aligned_integer (4, const0_rtx);
10083
10084 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10085 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10086 }
10087
10088 static void
10089 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10090 {
10091 rtx fnaddr, mem, a_tramp;
10092 const int tramp_code_sz = 16;
10093
10094 /* Don't need to copy the trailing D-words; we fill those in below. */
10095 emit_block_move (m_tramp, assemble_trampoline_template (),
10096 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10097 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10098 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10099 if (GET_MODE (fnaddr) != ptr_mode)
10100 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10101 emit_move_insn (mem, fnaddr);
10102
10103 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10104 emit_move_insn (mem, chain_value);
10105
10106 /* XXX We should really define a "clear_cache" pattern and use
10107 gen_clear_cache(). */
10108 a_tramp = XEXP (m_tramp, 0);
10109 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10110 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10111 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10112 ptr_mode);
10113 }
10114
10115 static unsigned char
10116 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10117 {
10118 /* ??? Logically we should only need to provide a value when
10119 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10120 can hold MODE, but at the moment we need to handle all modes.
10121 Just ignore any runtime parts for registers that can't store them. */
10122 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10123 unsigned int nregs, vec_flags;
10124 switch (regclass)
10125 {
10126 case TAILCALL_ADDR_REGS:
10127 case POINTER_REGS:
10128 case GENERAL_REGS:
10129 case ALL_REGS:
10130 case POINTER_AND_FP_REGS:
10131 case FP_REGS:
10132 case FP_LO_REGS:
10133 case FP_LO8_REGS:
10134 vec_flags = aarch64_classify_vector_mode (mode);
10135 if ((vec_flags & VEC_SVE_DATA)
10136 && constant_multiple_p (GET_MODE_SIZE (mode),
10137 aarch64_vl_bytes (mode, vec_flags), &nregs))
10138 return nregs;
10139 return (vec_flags & VEC_ADVSIMD
10140 ? CEIL (lowest_size, UNITS_PER_VREG)
10141 : CEIL (lowest_size, UNITS_PER_WORD));
10142 case STACK_REG:
10143 case PR_REGS:
10144 case PR_LO_REGS:
10145 case PR_HI_REGS:
10146 case FFR_REGS:
10147 case PR_AND_FFR_REGS:
10148 return 1;
10149
10150 case NO_REGS:
10151 return 0;
10152
10153 default:
10154 break;
10155 }
10156 gcc_unreachable ();
10157 }
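
/* Illustrative sketch, not part of the port: the CEIL computation the
   function above applies to fixed-size modes, i.e. how many registers
   of UNIT bytes are needed to hold SIZE bytes.  */
static unsigned int
example_ceil_div (unsigned int size, unsigned int unit)
{
  /* e.g. a 24-byte value in 16-byte vector registers needs 2 of them.  */
  return (size + unit - 1) / unit;
}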
10158
10159 static reg_class_t
10160 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10161 {
10162 if (regclass == POINTER_REGS)
10163 return GENERAL_REGS;
10164
10165 if (regclass == STACK_REG)
10166 {
10167 if (REG_P(x)
10168 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10169 return regclass;
10170
10171 return NO_REGS;
10172 }
10173
10174 /* Register elimination can result in a request for
10175 SP+constant->FP_REGS. We cannot support such operations, which
10176 use SP as source and an FP_REG as destination, so reject them
10177 outright. */
10178 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10179 {
10180 rtx lhs = XEXP (x, 0);
10181
10182 /* Look through a possible SUBREG introduced by ILP32. */
10183 if (GET_CODE (lhs) == SUBREG)
10184 lhs = SUBREG_REG (lhs);
10185
10186 gcc_assert (REG_P (lhs));
10187 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10188 POINTER_REGS));
10189 return NO_REGS;
10190 }
10191
10192 return regclass;
10193 }
10194
10195 void
10196 aarch64_asm_output_labelref (FILE* f, const char *name)
10197 {
10198 asm_fprintf (f, "%U%s", name);
10199 }
10200
10201 static void
10202 aarch64_elf_asm_constructor (rtx symbol, int priority)
10203 {
10204 if (priority == DEFAULT_INIT_PRIORITY)
10205 default_ctor_section_asm_out_constructor (symbol, priority);
10206 else
10207 {
10208 section *s;
10209 /* Priority is known to be in the range [0, 65535], so 18 bytes
10210 would be enough, but the compiler might not know that. To avoid a
10211 -Wformat-truncation false positive, use a larger size. */
10212 char buf[23];
10213 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10214 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10215 switch_to_section (s);
10216 assemble_align (POINTER_SIZE);
10217 assemble_aligned_integer (POINTER_BYTES, symbol);
10218 }
10219 }
10220
10221 static void
10222 aarch64_elf_asm_destructor (rtx symbol, int priority)
10223 {
10224 if (priority == DEFAULT_INIT_PRIORITY)
10225 default_dtor_section_asm_out_destructor (symbol, priority);
10226 else
10227 {
10228 section *s;
10229 /* Priority is known to be in the range [0, 65535], so 18 bytes
10230 would be enough, but the compiler might not know that. To avoid a
10231 -Wformat-truncation false positive, use a larger size. */
10232 char buf[23];
10233 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10234 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10235 switch_to_section (s);
10236 assemble_align (POINTER_SIZE);
10237 assemble_aligned_integer (POINTER_BYTES, symbol);
10238 }
10239 }
10240
10241 const char*
10242 aarch64_output_casesi (rtx *operands)
10243 {
10244 char buf[100];
10245 char label[100];
10246 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10247 int index;
10248 static const char *const patterns[4][2] =
10249 {
10250 {
10251 "ldrb\t%w3, [%0,%w1,uxtw]",
10252 "add\t%3, %4, %w3, sxtb #2"
10253 },
10254 {
10255 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10256 "add\t%3, %4, %w3, sxth #2"
10257 },
10258 {
10259 "ldr\t%w3, [%0,%w1,uxtw #2]",
10260 "add\t%3, %4, %w3, sxtw #2"
10261 },
10262 /* We assume that DImode is only generated when not optimizing and
10263 that we don't really need 64-bit address offsets. That would
10264 imply an object file with 8GB of code in a single function! */
10265 {
10266 "ldr\t%w3, [%0,%w1,uxtw #2]",
10267 "add\t%3, %4, %w3, sxtw #2"
10268 }
10269 };
10270
10271 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10272
10273 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10274 index = exact_log2 (GET_MODE_SIZE (mode));
10275
10276 gcc_assert (index >= 0 && index <= 3);
10277
10278 /* Need to implement table size reduction, by changing the code below. */
10279 output_asm_insn (patterns[index][0], operands);
10280 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10281 snprintf (buf, sizeof (buf),
10282 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10283 output_asm_insn (buf, operands);
10284 output_asm_insn (patterns[index][1], operands);
10285 output_asm_insn ("br\t%3", operands);
10286 assemble_label (asm_out_file, label);
10287 return "";
10288 }
10289
10290
10291 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10292 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10293 operator. */
10294
10295 int
10296 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10297 {
10298 if (shift >= 0 && shift <= 3)
10299 {
10300 int size;
10301 for (size = 8; size <= 32; size *= 2)
10302 {
10303 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10304 if (mask == bits << shift)
10305 return size;
10306 }
10307 }
10308 return 0;
10309 }
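
/* Illustrative usage, not part of the port: a mask of 0x1fe with a
   shift of 1 is 0xff shifted left by one, so aarch64_uxt_size returns 8
   (a UXTB pattern); masks that are not a shifted 0xff, 0xffff or
   0xffffffff yield 0.  The wrapper below only demonstrates the call and
   is an assumption made for illustration.  */
static int
example_uxt_size_usage (void)
{
  return aarch64_uxt_size (1, 0x1fe);   /* Returns 8.  */
}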
10310
10311 /* Constant pools are per-function only when PC-relative
10312 literal loads are enabled or we are using the large memory
10313 model. */
10314
10315 static inline bool
10316 aarch64_can_use_per_function_literal_pools_p (void)
10317 {
10318 return (aarch64_pcrelative_literal_loads
10319 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10320 }
10321
10322 static bool
10323 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10324 {
10325 /* We can't use blocks for constants when we're using a per-function
10326 constant pool. */
10327 return !aarch64_can_use_per_function_literal_pools_p ();
10328 }
10329
10330 /* Select appropriate section for constants depending
10331 on where we place literal pools. */
10332
10333 static section *
10334 aarch64_select_rtx_section (machine_mode mode,
10335 rtx x,
10336 unsigned HOST_WIDE_INT align)
10337 {
10338 if (aarch64_can_use_per_function_literal_pools_p ())
10339 return function_section (current_function_decl);
10340
10341 return default_elf_select_rtx_section (mode, x, align);
10342 }
10343
10344 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10345 void
10346 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10347 HOST_WIDE_INT offset)
10348 {
10349 /* When using per-function literal pools, we must ensure that any code
10350 section is aligned to the minimal instruction length, lest we get
10351 errors from the assembler about "unaligned instructions". */
10352 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10353 ASM_OUTPUT_ALIGN (f, 2);
10354 }
10355
10356 /* Costs. */
10357
10358 /* Helper function for rtx cost calculation. Strip a shift expression
10359 from X. Returns the inner operand if successful, or the original
10360 expression on failure. */
10361 static rtx
10362 aarch64_strip_shift (rtx x)
10363 {
10364 rtx op = x;
10365
10366 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10367 we can convert both to ROR during final output. */
10368 if ((GET_CODE (op) == ASHIFT
10369 || GET_CODE (op) == ASHIFTRT
10370 || GET_CODE (op) == LSHIFTRT
10371 || GET_CODE (op) == ROTATERT
10372 || GET_CODE (op) == ROTATE)
10373 && CONST_INT_P (XEXP (op, 1)))
10374 return XEXP (op, 0);
10375
10376 if (GET_CODE (op) == MULT
10377 && CONST_INT_P (XEXP (op, 1))
10378 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10379 return XEXP (op, 0);
10380
10381 return x;
10382 }
10383
10384 /* Helper function for rtx cost calculation. Strip an extend
10385 expression from X. Returns the inner operand if successful, or the
10386 original expression on failure. We deal with a number of possible
10387 canonicalization variations here. If STRIP_SHIFT is true, then
10388 we can strip off a shift also. */
10389 static rtx
10390 aarch64_strip_extend (rtx x, bool strip_shift)
10391 {
10392 scalar_int_mode mode;
10393 rtx op = x;
10394
10395 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10396 return op;
10397
10398 /* Zero and sign extraction of a widened value. */
10399 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10400 && XEXP (op, 2) == const0_rtx
10401 && GET_CODE (XEXP (op, 0)) == MULT
10402 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10403 XEXP (op, 1)))
10404 return XEXP (XEXP (op, 0), 0);
10405
10406 /* It can also be represented (for zero-extend) as an AND with an
10407 immediate. */
10408 if (GET_CODE (op) == AND
10409 && GET_CODE (XEXP (op, 0)) == MULT
10410 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10411 && CONST_INT_P (XEXP (op, 1))
10412 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10413 INTVAL (XEXP (op, 1))) != 0)
10414 return XEXP (XEXP (op, 0), 0);
10415
10416 /* Now handle extended register, as this may also have an optional
10417 left shift by 1..4. */
10418 if (strip_shift
10419 && GET_CODE (op) == ASHIFT
10420 && CONST_INT_P (XEXP (op, 1))
10421 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10422 op = XEXP (op, 0);
10423
10424 if (GET_CODE (op) == ZERO_EXTEND
10425 || GET_CODE (op) == SIGN_EXTEND)
10426 op = XEXP (op, 0);
10427
10428 if (op != x)
10429 return op;
10430
10431 return x;
10432 }
10433
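/* Editor's note, an illustrative walk-through of the stripping above
   (assumed example, not taken from a real RTL dump): with STRIP_SHIFT
   true, an operand such as
     (ashift:DI (zero_extend:DI (reg:SI x1)) (const_int 2))
   first loses the shift (the amount 2 is within 1..4) and then the
   zero_extend, so the value returned is (reg:SI x1) and callers cost
   only the inner register.  */
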
10434 /* Return true iff CODE is a shift supported in combination
10435 with arithmetic instructions. */
10436
10437 static bool
10438 aarch64_shift_p (enum rtx_code code)
10439 {
10440 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10441 }
10442
10443
10444 /* Return true iff X is a cheap shift without a sign extend. */
10445
10446 static bool
10447 aarch64_cheap_mult_shift_p (rtx x)
10448 {
10449 rtx op0, op1;
10450
10451 op0 = XEXP (x, 0);
10452 op1 = XEXP (x, 1);
10453
10454 if (!(aarch64_tune_params.extra_tuning_flags
10455 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10456 return false;
10457
10458 if (GET_CODE (op0) == SIGN_EXTEND)
10459 return false;
10460
10461 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10462 && UINTVAL (op1) <= 4)
10463 return true;
10464
10465 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10466 return false;
10467
10468 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10469
10470 if (l2 > 0 && l2 <= 4)
10471 return true;
10472
10473 return false;
10474 }
10475
10476 /* Helper function for rtx cost calculation. Calculate the cost of
10477 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10478 Return the calculated cost of the expression, recursing manually into
10479 operands where needed. */
10480
10481 static int
10482 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10483 {
10484 rtx op0, op1;
10485 const struct cpu_cost_table *extra_cost
10486 = aarch64_tune_params.insn_extra_cost;
10487 int cost = 0;
10488 bool compound_p = (outer == PLUS || outer == MINUS);
10489 machine_mode mode = GET_MODE (x);
10490
10491 gcc_checking_assert (code == MULT);
10492
10493 op0 = XEXP (x, 0);
10494 op1 = XEXP (x, 1);
10495
10496 if (VECTOR_MODE_P (mode))
10497 mode = GET_MODE_INNER (mode);
10498
10499 /* Integer multiply/fma. */
10500 if (GET_MODE_CLASS (mode) == MODE_INT)
10501 {
10502 /* The multiply will be canonicalized as a shift, cost it as such. */
10503 if (aarch64_shift_p (GET_CODE (x))
10504 || (CONST_INT_P (op1)
10505 && exact_log2 (INTVAL (op1)) > 0))
10506 {
10507 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10508 || GET_CODE (op0) == SIGN_EXTEND;
10509 if (speed)
10510 {
10511 if (compound_p)
10512 {
10513 /* If the shift is considered cheap,
10514 then don't add any cost. */
10515 if (aarch64_cheap_mult_shift_p (x))
10516 ;
10517 else if (REG_P (op1))
10518 /* ARITH + shift-by-register. */
10519 cost += extra_cost->alu.arith_shift_reg;
10520 else if (is_extend)
10521 /* ARITH + extended register. We don't have a cost field
10522 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10523 cost += extra_cost->alu.extend_arith;
10524 else
10525 /* ARITH + shift-by-immediate. */
10526 cost += extra_cost->alu.arith_shift;
10527 }
10528 else
10529 /* LSL (immediate). */
10530 cost += extra_cost->alu.shift;
10531
10532 }
10533 /* Strip extends as we will have costed them in the case above. */
10534 if (is_extend)
10535 op0 = aarch64_strip_extend (op0, true);
10536
10537 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10538
10539 return cost;
10540 }
10541
10542 /* MNEG or [US]MNEGL. Extract the NEG operand, mark this as a
10543 compound operation, and let the cases below handle it. After all,
10544 MNEG is a special-case alias of MSUB. */
10545 if (GET_CODE (op0) == NEG)
10546 {
10547 op0 = XEXP (op0, 0);
10548 compound_p = true;
10549 }
10550
10551 /* Integer multiplies or FMAs have zero/sign extending variants. */
10552 if ((GET_CODE (op0) == ZERO_EXTEND
10553 && GET_CODE (op1) == ZERO_EXTEND)
10554 || (GET_CODE (op0) == SIGN_EXTEND
10555 && GET_CODE (op1) == SIGN_EXTEND))
10556 {
10557 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10558 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10559
10560 if (speed)
10561 {
10562 if (compound_p)
10563 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10564 cost += extra_cost->mult[0].extend_add;
10565 else
10566 /* MUL/SMULL/UMULL. */
10567 cost += extra_cost->mult[0].extend;
10568 }
10569
10570 return cost;
10571 }
10572
10573 /* This is either an integer multiply or a MADD. In both cases
10574 we want to recurse and cost the operands. */
10575 cost += rtx_cost (op0, mode, MULT, 0, speed);
10576 cost += rtx_cost (op1, mode, MULT, 1, speed);
10577
10578 if (speed)
10579 {
10580 if (compound_p)
10581 /* MADD/MSUB. */
10582 cost += extra_cost->mult[mode == DImode].add;
10583 else
10584 /* MUL. */
10585 cost += extra_cost->mult[mode == DImode].simple;
10586 }
10587
10588 return cost;
10589 }
10590 else
10591 {
10592 if (speed)
10593 {
10594 /* Floating-point FMA/FMUL can also support negations of the
10595 operands, unless the rounding mode is upward or downward in
10596 which case FNMUL is different than FMUL with operand negation. */
10597 bool neg0 = GET_CODE (op0) == NEG;
10598 bool neg1 = GET_CODE (op1) == NEG;
10599 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10600 {
10601 if (neg0)
10602 op0 = XEXP (op0, 0);
10603 if (neg1)
10604 op1 = XEXP (op1, 0);
10605 }
10606
10607 if (compound_p)
10608 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10609 cost += extra_cost->fp[mode == DFmode].fma;
10610 else
10611 /* FMUL/FNMUL. */
10612 cost += extra_cost->fp[mode == DFmode].mult;
10613 }
10614
10615 cost += rtx_cost (op0, mode, MULT, 0, speed);
10616 cost += rtx_cost (op1, mode, MULT, 1, speed);
10617 return cost;
10618 }
10619 }
10620
10621 static int
10622 aarch64_address_cost (rtx x,
10623 machine_mode mode,
10624 addr_space_t as ATTRIBUTE_UNUSED,
10625 bool speed)
10626 {
10627 enum rtx_code c = GET_CODE (x);
10628 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10629 struct aarch64_address_info info;
10630 int cost = 0;
10631 info.shift = 0;
10632
10633 if (!aarch64_classify_address (&info, x, mode, false))
10634 {
10635 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10636 {
10637 /* This is a CONST or SYMBOL ref which will be split
10638 in a different way depending on the code model in use.
10639 Cost it through the generic infrastructure. */
10640 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10641 /* Divide through by the cost of one instruction to
10642 bring it to the same units as the address costs. */
10643 cost_symbol_ref /= COSTS_N_INSNS (1);
10644 /* The cost is then the cost of preparing the address,
10645 followed by an immediate (possibly 0) offset. */
10646 return cost_symbol_ref + addr_cost->imm_offset;
10647 }
10648 else
10649 {
10650 /* This is most likely a jump table from a case
10651 statement. */
10652 return addr_cost->register_offset;
10653 }
10654 }
10655
10656 switch (info.type)
10657 {
10658 case ADDRESS_LO_SUM:
10659 case ADDRESS_SYMBOLIC:
10660 case ADDRESS_REG_IMM:
10661 cost += addr_cost->imm_offset;
10662 break;
10663
10664 case ADDRESS_REG_WB:
10665 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10666 cost += addr_cost->pre_modify;
10667 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10668 cost += addr_cost->post_modify;
10669 else
10670 gcc_unreachable ();
10671
10672 break;
10673
10674 case ADDRESS_REG_REG:
10675 cost += addr_cost->register_offset;
10676 break;
10677
10678 case ADDRESS_REG_SXTW:
10679 cost += addr_cost->register_sextend;
10680 break;
10681
10682 case ADDRESS_REG_UXTW:
10683 cost += addr_cost->register_zextend;
10684 break;
10685
10686 default:
10687 gcc_unreachable ();
10688 }
10689
10690
10691 if (info.shift > 0)
10692 {
10693 /* For the sake of calculating the cost of the shifted register
10694 component, we can treat same sized modes in the same way. */
10695 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10696 cost += addr_cost->addr_scale_costs.hi;
10697 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10698 cost += addr_cost->addr_scale_costs.si;
10699 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10700 cost += addr_cost->addr_scale_costs.di;
10701 else
10702 /* We can't tell, or this is a 128-bit vector. */
10703 cost += addr_cost->addr_scale_costs.ti;
10704 }
10705
10706 return cost;
10707 }
10708
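/* Editor's note, two illustrative data points for the routine above
   (assumed examples): for an SImode access through
     (plus:DI (reg:DI x1) (const_int 16))
   aarch64_classify_address reports ADDRESS_REG_IMM, so the cost is just
   addr_cost->imm_offset; for
     (plus:DI (reg:DI x1) (mult:DI (reg:DI x2) (const_int 4)))
   it reports ADDRESS_REG_REG with a shift of 2, so the cost is
   addr_cost->register_offset plus addr_cost->addr_scale_costs.si.  */
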
10709 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10710 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10711 to be taken. */
10712
10713 int
10714 aarch64_branch_cost (bool speed_p, bool predictable_p)
10715 {
10716 /* When optimizing for speed, use the cost of unpredictable branches. */
10717 const struct cpu_branch_cost *branch_costs =
10718 aarch64_tune_params.branch_costs;
10719
10720 if (!speed_p || predictable_p)
10721 return branch_costs->predictable;
10722 else
10723 return branch_costs->unpredictable;
10724 }
10725
10726 /* Return true if the RTX X in mode MODE is a zero or sign extract
10727 usable in an ADD or SUB (extended register) instruction. */
10728 static bool
10729 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10730 {
10731 /* Catch add with a sign extract.
10732 This is add_<optab><mode>_multp2. */
10733 if (GET_CODE (x) == SIGN_EXTRACT
10734 || GET_CODE (x) == ZERO_EXTRACT)
10735 {
10736 rtx op0 = XEXP (x, 0);
10737 rtx op1 = XEXP (x, 1);
10738 rtx op2 = XEXP (x, 2);
10739
10740 if (GET_CODE (op0) == MULT
10741 && CONST_INT_P (op1)
10742 && op2 == const0_rtx
10743 && CONST_INT_P (XEXP (op0, 1))
10744 && aarch64_is_extend_from_extract (mode,
10745 XEXP (op0, 1),
10746 op1))
10747 {
10748 return true;
10749 }
10750 }
10751 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10752 No shift. */
10753 else if (GET_CODE (x) == SIGN_EXTEND
10754 || GET_CODE (x) == ZERO_EXTEND)
10755 return REG_P (XEXP (x, 0));
10756
10757 return false;
10758 }
10759
10760 static bool
10761 aarch64_frint_unspec_p (unsigned int u)
10762 {
10763 switch (u)
10764 {
10765 case UNSPEC_FRINTZ:
10766 case UNSPEC_FRINTP:
10767 case UNSPEC_FRINTM:
10768 case UNSPEC_FRINTA:
10769 case UNSPEC_FRINTN:
10770 case UNSPEC_FRINTX:
10771 case UNSPEC_FRINTI:
10772 return true;
10773
10774 default:
10775 return false;
10776 }
10777 }
10778
10779 /* Return true iff X is an rtx that will match an extr instruction
10780 i.e. as described in the *extr<mode>5_insn family of patterns.
10781 OP0 and OP1 will be set to the operands of the shifts involved
10782 on success and will be NULL_RTX otherwise. */
10783
10784 static bool
10785 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10786 {
10787 rtx op0, op1;
10788 scalar_int_mode mode;
10789 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10790 return false;
10791
10792 *res_op0 = NULL_RTX;
10793 *res_op1 = NULL_RTX;
10794
10795 if (GET_CODE (x) != IOR)
10796 return false;
10797
10798 op0 = XEXP (x, 0);
10799 op1 = XEXP (x, 1);
10800
10801 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10802 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10803 {
10804 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10805 if (GET_CODE (op1) == ASHIFT)
10806 std::swap (op0, op1);
10807
10808 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10809 return false;
10810
10811 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10812 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10813
10814 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10815 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10816 {
10817 *res_op0 = XEXP (op0, 0);
10818 *res_op1 = XEXP (op1, 0);
10819 return true;
10820 }
10821 }
10822
10823 return false;
10824 }
10825
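/* Editor's note, an assumed example of the EXTR form recognised above:
   in DImode,
     (ior:DI (ashift:DI (reg:DI x0) (const_int 48))
             (lshiftrt:DI (reg:DI x1) (const_int 16)))
   has shift amounts 48 + 16 == 64 == GET_MODE_BITSIZE (DImode), so it
   matches an EXTR with immediate #16 and *RES_OP0/*RES_OP1 are set to
   the two shifted registers.  */
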
10826 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10827 storing it in *COST. Result is true if the total cost of the operation
10828 has now been calculated. */
10829 static bool
10830 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10831 {
10832 rtx inner;
10833 rtx comparator;
10834 enum rtx_code cmpcode;
10835
10836 if (COMPARISON_P (op0))
10837 {
10838 inner = XEXP (op0, 0);
10839 comparator = XEXP (op0, 1);
10840 cmpcode = GET_CODE (op0);
10841 }
10842 else
10843 {
10844 inner = op0;
10845 comparator = const0_rtx;
10846 cmpcode = NE;
10847 }
10848
10849 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10850 {
10851 /* Conditional branch. */
10852 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10853 return true;
10854 else
10855 {
10856 if (cmpcode == NE || cmpcode == EQ)
10857 {
10858 if (comparator == const0_rtx)
10859 {
10860 /* TBZ/TBNZ/CBZ/CBNZ. */
10861 if (GET_CODE (inner) == ZERO_EXTRACT)
10862 /* TBZ/TBNZ. */
10863 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10864 ZERO_EXTRACT, 0, speed);
10865 else
10866 /* CBZ/CBNZ. */
10867 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10868
10869 return true;
10870 }
10871 }
10872 else if (cmpcode == LT || cmpcode == GE)
10873 {
10874 /* TBZ/TBNZ. */
10875 if (comparator == const0_rtx)
10876 return true;
10877 }
10878 }
10879 }
10880 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10881 {
10882 /* CCMP. */
10883 if (GET_CODE (op1) == COMPARE)
10884 {
10885 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10886 if (XEXP (op1, 1) == const0_rtx)
10887 *cost += 1;
10888 if (speed)
10889 {
10890 machine_mode mode = GET_MODE (XEXP (op1, 0));
10891 const struct cpu_cost_table *extra_cost
10892 = aarch64_tune_params.insn_extra_cost;
10893
10894 if (GET_MODE_CLASS (mode) == MODE_INT)
10895 *cost += extra_cost->alu.arith;
10896 else
10897 *cost += extra_cost->fp[mode == DFmode].compare;
10898 }
10899 return true;
10900 }
10901
10902 /* It's a conditional operation based on the status flags,
10903 so it must be some flavor of CSEL. */
10904
10905 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10906 if (GET_CODE (op1) == NEG
10907 || GET_CODE (op1) == NOT
10908 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10909 op1 = XEXP (op1, 0);
10910 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10911 {
10912 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10913 op1 = XEXP (op1, 0);
10914 op2 = XEXP (op2, 0);
10915 }
10916
10917 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10918 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10919 return true;
10920 }
10921
10922 /* We don't know what this is, cost all operands. */
10923 return false;
10924 }
10925
10926 /* Check whether X is a bitfield operation of the form shift + extend that
10927 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10928 operand to which the bitfield operation is applied. Otherwise return
10929 NULL_RTX. */
10930
10931 static rtx
10932 aarch64_extend_bitfield_pattern_p (rtx x)
10933 {
10934 rtx_code outer_code = GET_CODE (x);
10935 machine_mode outer_mode = GET_MODE (x);
10936
10937 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10938 && outer_mode != SImode && outer_mode != DImode)
10939 return NULL_RTX;
10940
10941 rtx inner = XEXP (x, 0);
10942 rtx_code inner_code = GET_CODE (inner);
10943 machine_mode inner_mode = GET_MODE (inner);
10944 rtx op = NULL_RTX;
10945
10946 switch (inner_code)
10947 {
10948 case ASHIFT:
10949 if (CONST_INT_P (XEXP (inner, 1))
10950 && (inner_mode == QImode || inner_mode == HImode))
10951 op = XEXP (inner, 0);
10952 break;
10953 case LSHIFTRT:
10954 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10955 && (inner_mode == QImode || inner_mode == HImode))
10956 op = XEXP (inner, 0);
10957 break;
10958 case ASHIFTRT:
10959 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10960 && (inner_mode == QImode || inner_mode == HImode))
10961 op = XEXP (inner, 0);
10962 break;
10963 default:
10964 break;
10965 }
10966
10967 return op;
10968 }
10969
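/* Editor's note, an assumed example for the helper above:
     (zero_extend:SI (lshiftrt:HI (reg:HI x0) (const_int 3)))
   is the shift-plus-extend shape that maps onto a single UBFX, so the
   helper returns (reg:HI x0) and the caller costs only that operand
   plus one alu.bfx.  */
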
10970 /* Return true if the mask and a shift amount from an RTX of the form
10971 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10972 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10973
10974 bool
10975 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10976 rtx shft_amnt)
10977 {
10978 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10979 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10980 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10981 && (INTVAL (mask)
10982 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10983 }
10984
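/* Editor's illustrative sketch (not part of GCC): the same test on
   plain 64-bit integers.  For instance, a shift of 8 with mask
   0x00ffff00 passes: (mask >> 8) + 1 == 0x10000 is a power of two
   (a contiguous 16-bit field) and the low 8 bits of the mask are
   clear, so (x << 8) & 0x00ffff00 is a UBFIZ with lsb 8 and width 16.
   Kept compiled out so that it does not alter the translation unit.  */
#if 0
static int
ubfiz_mask_sketch (unsigned int mode_bits, unsigned long long mask,
                   unsigned long long shift)
{
  /* The shift amount must fit in the mode...  */
  if (shift >= mode_bits)
    return 0;
  /* ...the surviving mask bits must form one contiguous block starting
     at bit SHIFT, so (mask >> shift) + 1 is a power of two...  */
  unsigned long long field = (mask >> shift) + 1;
  if (field == 0 || (field & (field - 1)) != 0)
    return 0;
  /* ...and no mask bit may lie below the shift amount.  */
  return (mask & ((1ULL << shift) - 1)) == 0;
}
#endif
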
10985 /* Return true if the masks and a shift amount from an RTX of the form
10986 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10987 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10988
10989 bool
10990 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10991 unsigned HOST_WIDE_INT mask1,
10992 unsigned HOST_WIDE_INT shft_amnt,
10993 unsigned HOST_WIDE_INT mask2)
10994 {
10995 unsigned HOST_WIDE_INT t;
10996
10997 /* Verify that the two masks are exact complements of each other. */
10998 if (mask1 != ~mask2)
10999 return false;
11000
11001 /* Verify that mask2 is not all zeros or ones. */
11002 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11003 return false;
11004
11005 /* The shift amount should always be less than the mode size. */
11006 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11007
11008 /* Verify that the mask being shifted is contiguous and would be in the
11009 least significant bits after shifting by shft_amnt. */
11010 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11011 return (t == (t & -t));
11012 }
11013
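/* Editor's illustrative sketch (not part of GCC): the contiguity test
   above on plain 64-bit integers.  E.g. mask1 == 0xffffffffffff00ff,
   shft_amnt == 8 and mask2 == 0xff00 pass: the masks are exact
   complements, and mask2 + (1 << 8) == 0x10000 has a single bit set,
   i.e. mask2 is one contiguous byte-wide field at bit 8, which a BFI
   can insert.  Kept compiled out so that it does not alter the
   translation unit.  */
#if 0
static int
bfi_masks_sketch (unsigned long long mask1, unsigned long long shft_amnt,
                  unsigned long long mask2)
{
  /* The two masks must partition the register between them.  */
  if (mask1 != ~mask2)
    return 0;
  /* An all-clear or all-set insertion mask is not a BFI.  */
  if (mask2 == 0 || mask2 == ~0ULL)
    return 0;
  /* mask2 must be a contiguous run of ones starting at bit SHFT_AMNT;
     adding 1 << SHFT_AMNT then leaves exactly one bit set.  */
  unsigned long long t = mask2 + (1ULL << shft_amnt);
  return t == (t & -t);
}
#endif
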
11014 /* Calculate the cost of calculating X, storing it in *COST. Result
11015 is true if the total cost of the operation has now been calculated. */
11016 static bool
11017 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11018 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11019 {
11020 rtx op0, op1, op2;
11021 const struct cpu_cost_table *extra_cost
11022 = aarch64_tune_params.insn_extra_cost;
11023 int code = GET_CODE (x);
11024 scalar_int_mode int_mode;
11025
11026 /* By default, assume that everything has equivalent cost to the
11027 cheapest instruction. Any additional costs are applied as a delta
11028 above this default. */
11029 *cost = COSTS_N_INSNS (1);
11030
11031 switch (code)
11032 {
11033 case SET:
11034 /* The cost depends entirely on the operands to SET. */
11035 *cost = 0;
11036 op0 = SET_DEST (x);
11037 op1 = SET_SRC (x);
11038
11039 switch (GET_CODE (op0))
11040 {
11041 case MEM:
11042 if (speed)
11043 {
11044 rtx address = XEXP (op0, 0);
11045 if (VECTOR_MODE_P (mode))
11046 *cost += extra_cost->ldst.storev;
11047 else if (GET_MODE_CLASS (mode) == MODE_INT)
11048 *cost += extra_cost->ldst.store;
11049 else if (mode == SFmode)
11050 *cost += extra_cost->ldst.storef;
11051 else if (mode == DFmode)
11052 *cost += extra_cost->ldst.stored;
11053
11054 *cost +=
11055 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11056 0, speed));
11057 }
11058
11059 *cost += rtx_cost (op1, mode, SET, 1, speed);
11060 return true;
11061
11062 case SUBREG:
11063 if (! REG_P (SUBREG_REG (op0)))
11064 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11065
11066 /* Fall through. */
11067 case REG:
11068 /* The cost is one per vector-register copied. */
11069 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11070 {
11071 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11072 *cost = COSTS_N_INSNS (nregs);
11073 }
11074 /* const0_rtx is in general free, but we will use an
11075 instruction to set a register to 0. */
11076 else if (REG_P (op1) || op1 == const0_rtx)
11077 {
11078 /* The cost is 1 per register copied. */
11079 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11080 *cost = COSTS_N_INSNS (nregs);
11081 }
11082 else
11083 /* Cost is just the cost of the RHS of the set. */
11084 *cost += rtx_cost (op1, mode, SET, 1, speed);
11085 return true;
11086
11087 case ZERO_EXTRACT:
11088 case SIGN_EXTRACT:
11089 /* Bit-field insertion. Strip any redundant widening of
11090 the RHS to meet the width of the target. */
11091 if (GET_CODE (op1) == SUBREG)
11092 op1 = SUBREG_REG (op1);
11093 if ((GET_CODE (op1) == ZERO_EXTEND
11094 || GET_CODE (op1) == SIGN_EXTEND)
11095 && CONST_INT_P (XEXP (op0, 1))
11096 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11097 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11098 op1 = XEXP (op1, 0);
11099
11100 if (CONST_INT_P (op1))
11101 {
11102 /* MOV immediate is assumed to always be cheap. */
11103 *cost = COSTS_N_INSNS (1);
11104 }
11105 else
11106 {
11107 /* BFM. */
11108 if (speed)
11109 *cost += extra_cost->alu.bfi;
11110 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11111 }
11112
11113 return true;
11114
11115 default:
11116 /* We can't make sense of this, assume default cost. */
11117 *cost = COSTS_N_INSNS (1);
11118 return false;
11119 }
11120 return false;
11121
11122 case CONST_INT:
11123 /* If an instruction can incorporate a constant within the
11124 instruction, the instruction's expression avoids calling
11125 rtx_cost() on the constant. If rtx_cost() is called on a
11126 constant, then it is usually because the constant must be
11127 moved into a register by one or more instructions.
11128
11129 The exception is constant 0, which can be expressed
11130 as XZR/WZR and is therefore free. The exception to this is
11131 if we have (set (reg) (const0_rtx)) in which case we must cost
11132 the move. However, we can catch that when we cost the SET, so
11133 we don't need to consider that here. */
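/* Editor's note (assumed examples): 0x1234 is built with a single
   MOVZ and therefore costs COSTS_N_INSNS (1), while 0x12345678 needs
   a MOVZ/MOVK pair and costs COSTS_N_INSNS (2); the exact count comes
   from aarch64_internal_mov_immediate below.  */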
11134 if (x == const0_rtx)
11135 *cost = 0;
11136 else
11137 {
11138 /* To an approximation, the cost of building any other constant
11139 is proportional to the number of instructions required to
11140 build it. This is true whether we are compiling for SPEED
11141 or otherwise. */
11142 if (!is_a <scalar_int_mode> (mode, &int_mode))
11143 int_mode = word_mode;
11144 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11145 (NULL_RTX, x, false, int_mode));
11146 }
11147 return true;
11148
11149 case CONST_DOUBLE:
11150
11151 /* First determine the number of instructions needed to do the
11152 move as an integer constant. */
11153 if (!aarch64_float_const_representable_p (x)
11154 && !aarch64_can_const_movi_rtx_p (x, mode)
11155 && aarch64_float_const_rtx_p (x))
11156 {
11157 unsigned HOST_WIDE_INT ival;
11158 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11159 gcc_assert (succeed);
11160
11161 scalar_int_mode imode = (mode == HFmode
11162 ? SImode
11163 : int_mode_for_mode (mode).require ());
11164 int ncost = aarch64_internal_mov_immediate
11165 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11166 *cost += COSTS_N_INSNS (ncost);
11167 return true;
11168 }
11169
11170 if (speed)
11171 {
11172 /* mov[df,sf]_aarch64. */
11173 if (aarch64_float_const_representable_p (x))
11174 /* FMOV (scalar immediate). */
11175 *cost += extra_cost->fp[mode == DFmode].fpconst;
11176 else if (!aarch64_float_const_zero_rtx_p (x))
11177 {
11178 /* This will be a load from memory. */
11179 if (mode == DFmode)
11180 *cost += extra_cost->ldst.loadd;
11181 else
11182 *cost += extra_cost->ldst.loadf;
11183 }
11184 else
11185 /* Otherwise this is +0.0. We get this using MOVI d0, #0
11186 or MOV v0.s[0], wzr, neither of which is modeled by the
11187 cost tables. Just use the default cost. */
11188 {
11189 }
11190 }
11191
11192 return true;
11193
11194 case MEM:
11195 if (speed)
11196 {
11197 /* For loads we want the base cost of a load, plus an
11198 approximation for the additional cost of the addressing
11199 mode. */
11200 rtx address = XEXP (x, 0);
11201 if (VECTOR_MODE_P (mode))
11202 *cost += extra_cost->ldst.loadv;
11203 else if (GET_MODE_CLASS (mode) == MODE_INT)
11204 *cost += extra_cost->ldst.load;
11205 else if (mode == SFmode)
11206 *cost += extra_cost->ldst.loadf;
11207 else if (mode == DFmode)
11208 *cost += extra_cost->ldst.loadd;
11209
11210 *cost +=
11211 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11212 0, speed));
11213 }
11214
11215 return true;
11216
11217 case NEG:
11218 op0 = XEXP (x, 0);
11219
11220 if (VECTOR_MODE_P (mode))
11221 {
11222 if (speed)
11223 {
11224 /* FNEG. */
11225 *cost += extra_cost->vect.alu;
11226 }
11227 return false;
11228 }
11229
11230 if (GET_MODE_CLASS (mode) == MODE_INT)
11231 {
11232 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11233 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11234 {
11235 /* CSETM. */
11236 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11237 return true;
11238 }
11239
11240 /* Cost this as SUB wzr, X. */
11241 op0 = CONST0_RTX (mode);
11242 op1 = XEXP (x, 0);
11243 goto cost_minus;
11244 }
11245
11246 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11247 {
11248 /* Support (neg(fma...)) as a single instruction only if
11249 sign of zeros is unimportant. This matches the decision
11250 making in aarch64.md. */
11251 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11252 {
11253 /* FNMADD. */
11254 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11255 return true;
11256 }
11257 if (GET_CODE (op0) == MULT)
11258 {
11259 /* FNMUL. */
11260 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11261 return true;
11262 }
11263 if (speed)
11264 /* FNEG. */
11265 *cost += extra_cost->fp[mode == DFmode].neg;
11266 return false;
11267 }
11268
11269 return false;
11270
11271 case CLRSB:
11272 case CLZ:
11273 if (speed)
11274 {
11275 if (VECTOR_MODE_P (mode))
11276 *cost += extra_cost->vect.alu;
11277 else
11278 *cost += extra_cost->alu.clz;
11279 }
11280
11281 return false;
11282
11283 case COMPARE:
11284 op0 = XEXP (x, 0);
11285 op1 = XEXP (x, 1);
11286
11287 if (op1 == const0_rtx
11288 && GET_CODE (op0) == AND)
11289 {
11290 x = op0;
11291 mode = GET_MODE (op0);
11292 goto cost_logic;
11293 }
11294
11295 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11296 {
11297 /* TODO: A write to the CC flags possibly costs extra, this
11298 needs encoding in the cost tables. */
11299
11300 mode = GET_MODE (op0);
11301 /* ANDS. */
11302 if (GET_CODE (op0) == AND)
11303 {
11304 x = op0;
11305 goto cost_logic;
11306 }
11307
11308 if (GET_CODE (op0) == PLUS)
11309 {
11310 /* ADDS (and CMN alias). */
11311 x = op0;
11312 goto cost_plus;
11313 }
11314
11315 if (GET_CODE (op0) == MINUS)
11316 {
11317 /* SUBS. */
11318 x = op0;
11319 goto cost_minus;
11320 }
11321
11322 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11323 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11324 && CONST_INT_P (XEXP (op0, 2)))
11325 {
11326 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11327 Handle it here directly rather than going to cost_logic
11328 since we know the immediate generated for the TST is valid
11329 so we can avoid creating an intermediate rtx for it only
11330 for costing purposes. */
11331 if (speed)
11332 *cost += extra_cost->alu.logical;
11333
11334 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11335 ZERO_EXTRACT, 0, speed);
11336 return true;
11337 }
11338
11339 if (GET_CODE (op1) == NEG)
11340 {
11341 /* CMN. */
11342 if (speed)
11343 *cost += extra_cost->alu.arith;
11344
11345 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11346 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11347 return true;
11348 }
11349
11350 /* CMP.
11351
11352 Compare can freely swap the order of operands, and
11353 canonicalization puts the more complex operation first.
11354 But the integer MINUS logic expects the shift/extend
11355 operation in op1. */
11356 if (! (REG_P (op0)
11357 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11358 {
11359 op0 = XEXP (x, 1);
11360 op1 = XEXP (x, 0);
11361 }
11362 goto cost_minus;
11363 }
11364
11365 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11366 {
11367 /* FCMP. */
11368 if (speed)
11369 *cost += extra_cost->fp[mode == DFmode].compare;
11370
11371 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11372 {
11373 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11374 /* FCMP supports constant 0.0 for no extra cost. */
11375 return true;
11376 }
11377 return false;
11378 }
11379
11380 if (VECTOR_MODE_P (mode))
11381 {
11382 /* Vector compare. */
11383 if (speed)
11384 *cost += extra_cost->vect.alu;
11385
11386 if (aarch64_float_const_zero_rtx_p (op1))
11387 {
11388 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11389 cost. */
11390 return true;
11391 }
11392 return false;
11393 }
11394 return false;
11395
11396 case MINUS:
11397 {
11398 op0 = XEXP (x, 0);
11399 op1 = XEXP (x, 1);
11400
11401 cost_minus:
11402 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11403
11404 /* Detect valid immediates. */
11405 if ((GET_MODE_CLASS (mode) == MODE_INT
11406 || (GET_MODE_CLASS (mode) == MODE_CC
11407 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11408 && CONST_INT_P (op1)
11409 && aarch64_uimm12_shift (INTVAL (op1)))
11410 {
11411 if (speed)
11412 /* SUB(S) (immediate). */
11413 *cost += extra_cost->alu.arith;
11414 return true;
11415 }
11416
11417 /* Look for SUB (extended register). */
11418 if (is_a <scalar_int_mode> (mode, &int_mode)
11419 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11420 {
11421 if (speed)
11422 *cost += extra_cost->alu.extend_arith;
11423
11424 op1 = aarch64_strip_extend (op1, true);
11425 *cost += rtx_cost (op1, VOIDmode,
11426 (enum rtx_code) GET_CODE (op1), 0, speed);
11427 return true;
11428 }
11429
11430 rtx new_op1 = aarch64_strip_extend (op1, false);
11431
11432 /* Cost this as an FMA-alike operation. */
11433 if ((GET_CODE (new_op1) == MULT
11434 || aarch64_shift_p (GET_CODE (new_op1)))
11435 && code != COMPARE)
11436 {
11437 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11438 (enum rtx_code) code,
11439 speed);
11440 return true;
11441 }
11442
11443 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11444
11445 if (speed)
11446 {
11447 if (VECTOR_MODE_P (mode))
11448 {
11449 /* Vector SUB. */
11450 *cost += extra_cost->vect.alu;
11451 }
11452 else if (GET_MODE_CLASS (mode) == MODE_INT)
11453 {
11454 /* SUB(S). */
11455 *cost += extra_cost->alu.arith;
11456 }
11457 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11458 {
11459 /* FSUB. */
11460 *cost += extra_cost->fp[mode == DFmode].addsub;
11461 }
11462 }
11463 return true;
11464 }
11465
11466 case PLUS:
11467 {
11468 rtx new_op0;
11469
11470 op0 = XEXP (x, 0);
11471 op1 = XEXP (x, 1);
11472
11473 cost_plus:
11474 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11475 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11476 {
11477 /* CSINC. */
11478 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11479 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11480 return true;
11481 }
11482
11483 if (GET_MODE_CLASS (mode) == MODE_INT
11484 && (aarch64_plus_immediate (op1, mode)
11485 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11486 {
11487 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11488
11489 if (speed)
11490 /* ADD (immediate). */
11491 *cost += extra_cost->alu.arith;
11492 return true;
11493 }
11494
11495 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11496
11497 /* Look for ADD (extended register). */
11498 if (is_a <scalar_int_mode> (mode, &int_mode)
11499 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11500 {
11501 if (speed)
11502 *cost += extra_cost->alu.extend_arith;
11503
11504 op0 = aarch64_strip_extend (op0, true);
11505 *cost += rtx_cost (op0, VOIDmode,
11506 (enum rtx_code) GET_CODE (op0), 0, speed);
11507 return true;
11508 }
11509
11510 /* Strip any extend, leave shifts behind as we will
11511 cost them through mult_cost. */
11512 new_op0 = aarch64_strip_extend (op0, false);
11513
11514 if (GET_CODE (new_op0) == MULT
11515 || aarch64_shift_p (GET_CODE (new_op0)))
11516 {
11517 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11518 speed);
11519 return true;
11520 }
11521
11522 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11523
11524 if (speed)
11525 {
11526 if (VECTOR_MODE_P (mode))
11527 {
11528 /* Vector ADD. */
11529 *cost += extra_cost->vect.alu;
11530 }
11531 else if (GET_MODE_CLASS (mode) == MODE_INT)
11532 {
11533 /* ADD. */
11534 *cost += extra_cost->alu.arith;
11535 }
11536 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11537 {
11538 /* FADD. */
11539 *cost += extra_cost->fp[mode == DFmode].addsub;
11540 }
11541 }
11542 return true;
11543 }
11544
11545 case BSWAP:
11546 *cost = COSTS_N_INSNS (1);
11547
11548 if (speed)
11549 {
11550 if (VECTOR_MODE_P (mode))
11551 *cost += extra_cost->vect.alu;
11552 else
11553 *cost += extra_cost->alu.rev;
11554 }
11555 return false;
11556
11557 case IOR:
11558 if (aarch_rev16_p (x))
11559 {
11560 *cost = COSTS_N_INSNS (1);
11561
11562 if (speed)
11563 {
11564 if (VECTOR_MODE_P (mode))
11565 *cost += extra_cost->vect.alu;
11566 else
11567 *cost += extra_cost->alu.rev;
11568 }
11569 return true;
11570 }
11571
11572 if (aarch64_extr_rtx_p (x, &op0, &op1))
11573 {
11574 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11575 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11576 if (speed)
11577 *cost += extra_cost->alu.shift;
11578
11579 return true;
11580 }
11581 /* Fall through. */
11582 case XOR:
11583 case AND:
11584 cost_logic:
11585 op0 = XEXP (x, 0);
11586 op1 = XEXP (x, 1);
11587
11588 if (VECTOR_MODE_P (mode))
11589 {
11590 if (speed)
11591 *cost += extra_cost->vect.alu;
11592 return true;
11593 }
11594
11595 if (code == AND
11596 && GET_CODE (op0) == MULT
11597 && CONST_INT_P (XEXP (op0, 1))
11598 && CONST_INT_P (op1)
11599 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11600 INTVAL (op1)) != 0)
11601 {
11602 /* This is a UBFM/SBFM. */
11603 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11604 if (speed)
11605 *cost += extra_cost->alu.bfx;
11606 return true;
11607 }
11608
11609 if (is_int_mode (mode, &int_mode))
11610 {
11611 if (CONST_INT_P (op1))
11612 {
11613 /* We have a mask + shift version of a UBFIZ
11614 i.e. the *andim_ashift<mode>_bfiz pattern. */
11615 if (GET_CODE (op0) == ASHIFT
11616 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11617 XEXP (op0, 1)))
11618 {
11619 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11620 (enum rtx_code) code, 0, speed);
11621 if (speed)
11622 *cost += extra_cost->alu.bfx;
11623
11624 return true;
11625 }
11626 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11627 {
11628 /* We possibly get the immediate for free, this is not
11629 modelled. */
11630 *cost += rtx_cost (op0, int_mode,
11631 (enum rtx_code) code, 0, speed);
11632 if (speed)
11633 *cost += extra_cost->alu.logical;
11634
11635 return true;
11636 }
11637 }
11638 else
11639 {
11640 rtx new_op0 = op0;
11641
11642 /* Handle ORN, EON, or BIC. */
11643 if (GET_CODE (op0) == NOT)
11644 op0 = XEXP (op0, 0);
11645
11646 new_op0 = aarch64_strip_shift (op0);
11647
11648 /* If we had a shift on op0 then this is a logical-shift-
11649 by-register/immediate operation. Otherwise, this is just
11650 a logical operation. */
11651 if (speed)
11652 {
11653 if (new_op0 != op0)
11654 {
11655 /* Shift by immediate. */
11656 if (CONST_INT_P (XEXP (op0, 1)))
11657 *cost += extra_cost->alu.log_shift;
11658 else
11659 *cost += extra_cost->alu.log_shift_reg;
11660 }
11661 else
11662 *cost += extra_cost->alu.logical;
11663 }
11664
11665 /* In both cases we want to cost both operands. */
11666 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11667 0, speed);
11668 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11669 1, speed);
11670
11671 return true;
11672 }
11673 }
11674 return false;
11675
11676 case NOT:
11677 x = XEXP (x, 0);
11678 op0 = aarch64_strip_shift (x);
11679
11680 if (VECTOR_MODE_P (mode))
11681 {
11682 /* Vector NOT. */
11683 *cost += extra_cost->vect.alu;
11684 return false;
11685 }
11686
11687 /* MVN-shifted-reg. */
11688 if (op0 != x)
11689 {
11690 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11691
11692 if (speed)
11693 *cost += extra_cost->alu.log_shift;
11694
11695 return true;
11696 }
11697 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
11698 Handle the second form here, taking care that 'a' in the above
11699 can be a shift. */
11700 else if (GET_CODE (op0) == XOR)
11701 {
11702 rtx newop0 = XEXP (op0, 0);
11703 rtx newop1 = XEXP (op0, 1);
11704 rtx op0_stripped = aarch64_strip_shift (newop0);
11705
11706 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11707 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11708
11709 if (speed)
11710 {
11711 if (op0_stripped != newop0)
11712 *cost += extra_cost->alu.log_shift;
11713 else
11714 *cost += extra_cost->alu.logical;
11715 }
11716
11717 return true;
11718 }
11719 /* MVN. */
11720 if (speed)
11721 *cost += extra_cost->alu.logical;
11722
11723 return false;
11724
11725 case ZERO_EXTEND:
11726
11727 op0 = XEXP (x, 0);
11728 /* If a value is written in SI mode, then zero extended to DI
11729 mode, the operation will in general be free as a write to
11730 a 'w' register implicitly zeroes the upper bits of an 'x'
11731 register. However, if this is
11732
11733 (set (reg) (zero_extend (reg)))
11734
11735 we must cost the explicit register move. */
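/* Editor's note (assumed example): (zero_extend:DI (plus:SI ...)) is
   free here because the 32-bit ADD that writes the 'w' register
   already zeroes bits 63:32; only a bare (zero_extend:DI (reg:SI ...))
   inside a SET has to pay for the MOV.  */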
11736 if (mode == DImode
11737 && GET_MODE (op0) == SImode
11738 && outer == SET)
11739 {
11740 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11741
11742 /* If OP_COST is non-zero, then the cost of the zero extend
11743 is effectively the cost of the inner operation. Otherwise
11744 we have a MOV instruction and we take the cost from the MOV
11745 itself. This is true independently of whether we are
11746 optimizing for space or time. */
11747 if (op_cost)
11748 *cost = op_cost;
11749
11750 return true;
11751 }
11752 else if (MEM_P (op0))
11753 {
11754 /* All loads can zero extend to any size for free. */
11755 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11756 return true;
11757 }
11758
11759 op0 = aarch64_extend_bitfield_pattern_p (x);
11760 if (op0)
11761 {
11762 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11763 if (speed)
11764 *cost += extra_cost->alu.bfx;
11765 return true;
11766 }
11767
11768 if (speed)
11769 {
11770 if (VECTOR_MODE_P (mode))
11771 {
11772 /* UMOV. */
11773 *cost += extra_cost->vect.alu;
11774 }
11775 else
11776 {
11777 /* We generate an AND instead of UXTB/UXTH. */
11778 *cost += extra_cost->alu.logical;
11779 }
11780 }
11781 return false;
11782
11783 case SIGN_EXTEND:
11784 if (MEM_P (XEXP (x, 0)))
11785 {
11786 /* LDRSH. */
11787 if (speed)
11788 {
11789 rtx address = XEXP (XEXP (x, 0), 0);
11790 *cost += extra_cost->ldst.load_sign_extend;
11791
11792 *cost +=
11793 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11794 0, speed));
11795 }
11796 return true;
11797 }
11798
11799 op0 = aarch64_extend_bitfield_pattern_p (x);
11800 if (op0)
11801 {
11802 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11803 if (speed)
11804 *cost += extra_cost->alu.bfx;
11805 return true;
11806 }
11807
11808 if (speed)
11809 {
11810 if (VECTOR_MODE_P (mode))
11811 *cost += extra_cost->vect.alu;
11812 else
11813 *cost += extra_cost->alu.extend;
11814 }
11815 return false;
11816
11817 case ASHIFT:
11818 op0 = XEXP (x, 0);
11819 op1 = XEXP (x, 1);
11820
11821 if (CONST_INT_P (op1))
11822 {
11823 if (speed)
11824 {
11825 if (VECTOR_MODE_P (mode))
11826 {
11827 /* Vector shift (immediate). */
11828 *cost += extra_cost->vect.alu;
11829 }
11830 else
11831 {
11832 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11833 aliases. */
11834 *cost += extra_cost->alu.shift;
11835 }
11836 }
11837
11838 /* We can incorporate zero/sign extend for free. */
11839 if (GET_CODE (op0) == ZERO_EXTEND
11840 || GET_CODE (op0) == SIGN_EXTEND)
11841 op0 = XEXP (op0, 0);
11842
11843 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11844 return true;
11845 }
11846 else
11847 {
11848 if (VECTOR_MODE_P (mode))
11849 {
11850 if (speed)
11851 /* Vector shift (register). */
11852 *cost += extra_cost->vect.alu;
11853 }
11854 else
11855 {
11856 if (speed)
11857 /* LSLV. */
11858 *cost += extra_cost->alu.shift_reg;
11859
11860 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11861 && CONST_INT_P (XEXP (op1, 1))
11862 && known_eq (INTVAL (XEXP (op1, 1)),
11863 GET_MODE_BITSIZE (mode) - 1))
11864 {
11865 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11866 /* We already required XEXP (op1, 0) to be REG_P, so
11867 don't recurse into it. */
11868 return true;
11869 }
11870 }
11871 return false; /* All arguments need to be in registers. */
11872 }
11873
11874 case ROTATE:
11875 case ROTATERT:
11876 case LSHIFTRT:
11877 case ASHIFTRT:
11878 op0 = XEXP (x, 0);
11879 op1 = XEXP (x, 1);
11880
11881 if (CONST_INT_P (op1))
11882 {
11883 /* ASR (immediate) and friends. */
11884 if (speed)
11885 {
11886 if (VECTOR_MODE_P (mode))
11887 *cost += extra_cost->vect.alu;
11888 else
11889 *cost += extra_cost->alu.shift;
11890 }
11891
11892 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11893 return true;
11894 }
11895 else
11896 {
11897 if (VECTOR_MODE_P (mode))
11898 {
11899 if (speed)
11900 /* Vector shift (register). */
11901 *cost += extra_cost->vect.alu;
11902 }
11903 else
11904 {
11905 if (speed)
11906 /* ASR (register) and friends. */
11907 *cost += extra_cost->alu.shift_reg;
11908
11909 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11910 && CONST_INT_P (XEXP (op1, 1))
11911 && known_eq (INTVAL (XEXP (op1, 1)),
11912 GET_MODE_BITSIZE (mode) - 1))
11913 {
11914 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11915 /* We already required XEXP (op1, 0) to be REG_P, so
11916 don't recurse into it. */
11917 return true;
11918 }
11919 }
11920 return false; /* All arguments need to be in registers. */
11921 }
11922
11923 case SYMBOL_REF:
11924
11925 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11926 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11927 {
11928 /* LDR. */
11929 if (speed)
11930 *cost += extra_cost->ldst.load;
11931 }
11932 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11933 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11934 {
11935 /* ADRP, followed by ADD. */
11936 *cost += COSTS_N_INSNS (1);
11937 if (speed)
11938 *cost += 2 * extra_cost->alu.arith;
11939 }
11940 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11941 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11942 {
11943 /* ADR. */
11944 if (speed)
11945 *cost += extra_cost->alu.arith;
11946 }
11947
11948 if (flag_pic)
11949 {
11950 /* One extra load instruction, after accessing the GOT. */
11951 *cost += COSTS_N_INSNS (1);
11952 if (speed)
11953 *cost += extra_cost->ldst.load;
11954 }
11955 return true;
11956
11957 case HIGH:
11958 case LO_SUM:
11959 /* ADRP/ADD (immediate). */
11960 if (speed)
11961 *cost += extra_cost->alu.arith;
11962 return true;
11963
11964 case ZERO_EXTRACT:
11965 case SIGN_EXTRACT:
11966 /* UBFX/SBFX. */
11967 if (speed)
11968 {
11969 if (VECTOR_MODE_P (mode))
11970 *cost += extra_cost->vect.alu;
11971 else
11972 *cost += extra_cost->alu.bfx;
11973 }
11974
11975 /* We can trust that the immediates used will be correct (there
11976 are no by-register forms), so we need only cost op0. */
11977 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11978 return true;
11979
11980 case MULT:
11981 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11982 /* aarch64_rtx_mult_cost always handles recursion to its
11983 operands. */
11984 return true;
11985
11986 case MOD:
11987 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11988 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the
11989 same as that of an unconditional negate. This case should only
11990 ever be reached through the set_smod_pow2_cheap check in expmed.c. */
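/* Editor's illustration (assumed; the precise sequence is chosen by
   the expander in expmed.c): for SImode "x % 16" the expansion
   computes, roughly,
     t = -x                 NEGS, also sets the flags
     a = x & 15             AND
     b = t & 15             AND, independent of the previous one
     r = x < 0 ? -b : a     CSNEG
   which is why the baseline below is reset to COSTS_N_INSNS (4).  */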
11991 if (CONST_INT_P (XEXP (x, 1))
11992 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11993 && (mode == SImode || mode == DImode))
11994 {
11995 /* We expand to 4 instructions. Reset the baseline. */
11996 *cost = COSTS_N_INSNS (4);
11997
11998 if (speed)
11999 *cost += 2 * extra_cost->alu.logical
12000 + 2 * extra_cost->alu.arith;
12001
12002 return true;
12003 }
12004
12005 /* Fall-through. */
12006 case UMOD:
12007 if (speed)
12008 {
12009 /* Slightly prefer UMOD over SMOD. */
12010 if (VECTOR_MODE_P (mode))
12011 *cost += extra_cost->vect.alu;
12012 else if (GET_MODE_CLASS (mode) == MODE_INT)
12013 *cost += (extra_cost->mult[mode == DImode].add
12014 + extra_cost->mult[mode == DImode].idiv
12015 + (code == MOD ? 1 : 0));
12016 }
12017 return false; /* All arguments need to be in registers. */
12018
12019 case DIV:
12020 case UDIV:
12021 case SQRT:
12022 if (speed)
12023 {
12024 if (VECTOR_MODE_P (mode))
12025 *cost += extra_cost->vect.alu;
12026 else if (GET_MODE_CLASS (mode) == MODE_INT)
12027 /* There is no integer SQRT, so only DIV and UDIV can get
12028 here. */
12029 *cost += (extra_cost->mult[mode == DImode].idiv
12030 /* Slightly prefer UDIV over SDIV. */
12031 + (code == DIV ? 1 : 0));
12032 else
12033 *cost += extra_cost->fp[mode == DFmode].div;
12034 }
12035 return false; /* All arguments need to be in registers. */
12036
12037 case IF_THEN_ELSE:
12038 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12039 XEXP (x, 2), cost, speed);
12040
12041 case EQ:
12042 case NE:
12043 case GT:
12044 case GTU:
12045 case LT:
12046 case LTU:
12047 case GE:
12048 case GEU:
12049 case LE:
12050 case LEU:
12051
12052 return false; /* All arguments must be in registers. */
12053
12054 case FMA:
12055 op0 = XEXP (x, 0);
12056 op1 = XEXP (x, 1);
12057 op2 = XEXP (x, 2);
12058
12059 if (speed)
12060 {
12061 if (VECTOR_MODE_P (mode))
12062 *cost += extra_cost->vect.alu;
12063 else
12064 *cost += extra_cost->fp[mode == DFmode].fma;
12065 }
12066
12067 /* FMSUB, FNMADD, and FNMSUB are free. */
12068 if (GET_CODE (op0) == NEG)
12069 op0 = XEXP (op0, 0);
12070
12071 if (GET_CODE (op2) == NEG)
12072 op2 = XEXP (op2, 0);
12073
12074 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12075 and the by-element operand as operand 0. */
12076 if (GET_CODE (op1) == NEG)
12077 op1 = XEXP (op1, 0);
12078
12079 /* Catch vector-by-element operations. The by-element operand can
12080 either be (vec_duplicate (vec_select (x))) or just
12081 (vec_select (x)), depending on whether we are multiplying by
12082 a vector or a scalar.
12083
12084 Canonicalization is not very good in these cases: FMA4 puts the
12085 by-element operand as operand 0, while FNMA4 has it as operand 1. */
12086 if (GET_CODE (op0) == VEC_DUPLICATE)
12087 op0 = XEXP (op0, 0);
12088 else if (GET_CODE (op1) == VEC_DUPLICATE)
12089 op1 = XEXP (op1, 0);
12090
12091 if (GET_CODE (op0) == VEC_SELECT)
12092 op0 = XEXP (op0, 0);
12093 else if (GET_CODE (op1) == VEC_SELECT)
12094 op1 = XEXP (op1, 0);
12095
12096 /* If the remaining parameters are not registers,
12097 get the cost to put them into registers. */
12098 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12099 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12100 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12101 return true;
12102
12103 case FLOAT:
12104 case UNSIGNED_FLOAT:
12105 if (speed)
12106 *cost += extra_cost->fp[mode == DFmode].fromint;
12107 return false;
12108
12109 case FLOAT_EXTEND:
12110 if (speed)
12111 {
12112 if (VECTOR_MODE_P (mode))
12113 {
12114 /* Vector widening conversion. */
12115 *cost += extra_cost->vect.alu;
12116 }
12117 else
12118 *cost += extra_cost->fp[mode == DFmode].widen;
12119 }
12120 return false;
12121
12122 case FLOAT_TRUNCATE:
12123 if (speed)
12124 {
12125 if (VECTOR_MODE_P (mode))
12126 {
12127 /* Vector narrowing conversion. */
12128 *cost += extra_cost->vect.alu;
12129 }
12130 else
12131 *cost += extra_cost->fp[mode == DFmode].narrow;
12132 }
12133 return false;
12134
12135 case FIX:
12136 case UNSIGNED_FIX:
12137 x = XEXP (x, 0);
12138 /* Strip the rounding part. They will all be implemented
12139 by the fcvt* family of instructions anyway. */
12140 if (GET_CODE (x) == UNSPEC)
12141 {
12142 unsigned int uns_code = XINT (x, 1);
12143
12144 if (uns_code == UNSPEC_FRINTA
12145 || uns_code == UNSPEC_FRINTM
12146 || uns_code == UNSPEC_FRINTN
12147 || uns_code == UNSPEC_FRINTP
12148 || uns_code == UNSPEC_FRINTZ)
12149 x = XVECEXP (x, 0, 0);
12150 }
12151
12152 if (speed)
12153 {
12154 if (VECTOR_MODE_P (mode))
12155 *cost += extra_cost->vect.alu;
12156 else
12157 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12158 }
12159
12160 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12161 fixed-point fcvt. */
12162 if (GET_CODE (x) == MULT
12163 && ((VECTOR_MODE_P (mode)
12164 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12165 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12166 {
12167 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12168 0, speed);
12169 return true;
12170 }
12171
12172 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12173 return true;
12174
12175 case ABS:
12176 if (VECTOR_MODE_P (mode))
12177 {
12178 /* ABS (vector). */
12179 if (speed)
12180 *cost += extra_cost->vect.alu;
12181 }
12182 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12183 {
12184 op0 = XEXP (x, 0);
12185
12186 /* FABD, which is analogous to FADD. */
12187 if (GET_CODE (op0) == MINUS)
12188 {
12189 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12190 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12191 if (speed)
12192 *cost += extra_cost->fp[mode == DFmode].addsub;
12193
12194 return true;
12195 }
12196 /* Simple FABS is analogous to FNEG. */
12197 if (speed)
12198 *cost += extra_cost->fp[mode == DFmode].neg;
12199 }
12200 else
12201 {
12202 /* Integer ABS will either be split into
12203 two arithmetic instructions, or will be an ABS
12204 (scalar), which we don't model. */
12205 *cost = COSTS_N_INSNS (2);
12206 if (speed)
12207 *cost += 2 * extra_cost->alu.arith;
12208 }
12209 return false;
12210
12211 case SMAX:
12212 case SMIN:
12213 if (speed)
12214 {
12215 if (VECTOR_MODE_P (mode))
12216 *cost += extra_cost->vect.alu;
12217 else
12218 {
12219 /* FMAXNM/FMINNM/FMAX/FMIN.
12220 TODO: This may not be accurate for all implementations, but
12221 we do not model this in the cost tables. */
12222 *cost += extra_cost->fp[mode == DFmode].addsub;
12223 }
12224 }
12225 return false;
12226
12227 case UNSPEC:
12228 /* The floating point round to integer frint* instructions. */
12229 if (aarch64_frint_unspec_p (XINT (x, 1)))
12230 {
12231 if (speed)
12232 *cost += extra_cost->fp[mode == DFmode].roundint;
12233
12234 return false;
12235 }
12236
12237 if (XINT (x, 1) == UNSPEC_RBIT)
12238 {
12239 if (speed)
12240 *cost += extra_cost->alu.rev;
12241
12242 return false;
12243 }
12244 break;
12245
12246 case TRUNCATE:
12247
12248 /* Decompose <su>muldi3_highpart. */
12249 if (/* (truncate:DI */
12250 mode == DImode
12251 /* (lshiftrt:TI */
12252 && GET_MODE (XEXP (x, 0)) == TImode
12253 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12254 /* (mult:TI */
12255 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12256 /* (ANY_EXTEND:TI (reg:DI))
12257 (ANY_EXTEND:TI (reg:DI))) */
12258 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12259 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12260 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12261 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12262 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12263 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12264 /* (const_int 64) */
12265 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12266 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12267 {
12268 /* UMULH/SMULH. */
12269 if (speed)
12270 *cost += extra_cost->mult[mode == DImode].extend;
12271 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12272 mode, MULT, 0, speed);
12273 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12274 mode, MULT, 1, speed);
12275 return true;
12276 }
12277
12278 /* Fall through. */
12279 default:
12280 break;
12281 }
12282
12283 if (dump_file
12284 && flag_aarch64_verbose_cost)
12285 fprintf (dump_file,
12286 "\nFailed to cost RTX. Assuming default cost.\n");
12287
12288 return true;
12289 }
12290
12291 /* Wrapper around aarch64_rtx_costs that dumps the partial or total
12292 cost calculated for X. The cost is stored in *COST. Returns true
12293 if the total cost of X was calculated. */
12294 static bool
12295 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12296 int param, int *cost, bool speed)
12297 {
12298 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12299
12300 if (dump_file
12301 && flag_aarch64_verbose_cost)
12302 {
12303 print_rtl_single (dump_file, x);
12304 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12305 speed ? "Hot" : "Cold",
12306 *cost, result ? "final" : "partial");
12307 }
12308
12309 return result;
12310 }
12311
12312 static int
12313 aarch64_register_move_cost (machine_mode mode,
12314 reg_class_t from_i, reg_class_t to_i)
12315 {
12316 enum reg_class from = (enum reg_class) from_i;
12317 enum reg_class to = (enum reg_class) to_i;
12318 const struct cpu_regmove_cost *regmove_cost
12319 = aarch64_tune_params.regmove_cost;
12320
12321 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12322 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12323 to = GENERAL_REGS;
12324
12325 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12326 from = GENERAL_REGS;
12327
12328 /* Make RDFFR very expensive. In particular, if we know that the FFR
12329 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12330 as a way of obtaining a PTRUE. */
12331 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12332 && hard_reg_set_subset_p (reg_class_contents[from_i],
12333 reg_class_contents[FFR_REGS]))
12334 return 80;
12335
12336 /* Moving between a GPR and the stack register costs the same as GP2GP. */
12337 if ((from == GENERAL_REGS && to == STACK_REG)
12338 || (to == GENERAL_REGS && from == STACK_REG))
12339 return regmove_cost->GP2GP;
12340
12341 /* To/From the stack register, we move via the gprs. */
12342 if (to == STACK_REG || from == STACK_REG)
12343 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12344 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12345
12346 if (known_eq (GET_MODE_SIZE (mode), 16))
12347 {
12348 /* 128-bit operations on general registers require 2 instructions. */
12349 if (from == GENERAL_REGS && to == GENERAL_REGS)
12350 return regmove_cost->GP2GP * 2;
12351 else if (from == GENERAL_REGS)
12352 return regmove_cost->GP2FP * 2;
12353 else if (to == GENERAL_REGS)
12354 return regmove_cost->FP2GP * 2;
12355
12356 /* When AdvSIMD instructions are disabled it is not possible to move
12357 a 128-bit value directly between Q registers. This is handled in
12358 secondary reload. A general register is used as a scratch to move
12359 the upper DI value and the lower DI value is moved directly,
12360 hence the cost is the sum of three moves. */
12361 if (! TARGET_SIMD)
12362 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12363
12364 return regmove_cost->FP2FP;
12365 }
12366
12367 if (from == GENERAL_REGS && to == GENERAL_REGS)
12368 return regmove_cost->GP2GP;
12369 else if (from == GENERAL_REGS)
12370 return regmove_cost->GP2FP;
12371 else if (to == GENERAL_REGS)
12372 return regmove_cost->FP2GP;
12373
12374 return regmove_cost->FP2FP;
12375 }
12376
12377 static int
12378 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12379 reg_class_t rclass ATTRIBUTE_UNUSED,
12380 bool in ATTRIBUTE_UNUSED)
12381 {
12382 return aarch64_tune_params.memmov_cost;
12383 }
12384
12385 /* Implement TARGET_INIT_BUILTINS. */
12386 static void
12387 aarch64_init_builtins ()
12388 {
12389 aarch64_general_init_builtins ();
12390 aarch64_sve::init_builtins ();
12391 }
12392
12393 /* Implement TARGET_FOLD_BUILTIN. */
12394 static tree
12395 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12396 {
12397 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12398 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12399 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12400 switch (code & AARCH64_BUILTIN_CLASS)
12401 {
12402 case AARCH64_BUILTIN_GENERAL:
12403 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12404
12405 case AARCH64_BUILTIN_SVE:
12406 return NULL_TREE;
12407 }
12408 gcc_unreachable ();
12409 }
12410
12411 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12412 static bool
12413 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12414 {
12415 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12416 tree fndecl = gimple_call_fndecl (stmt);
12417 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12418 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12419 gimple *new_stmt = NULL;
12420 switch (code & AARCH64_BUILTIN_CLASS)
12421 {
12422 case AARCH64_BUILTIN_GENERAL:
12423 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12424 break;
12425
12426 case AARCH64_BUILTIN_SVE:
12427 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12428 break;
12429 }
12430
12431 if (!new_stmt)
12432 return false;
12433
12434 gsi_replace (gsi, new_stmt, true);
12435 return true;
12436 }
12437
12438 /* Implement TARGET_EXPAND_BUILTIN. */
12439 static rtx
12440 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12441 {
12442 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12443 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12444 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12445 switch (code & AARCH64_BUILTIN_CLASS)
12446 {
12447 case AARCH64_BUILTIN_GENERAL:
12448 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12449
12450 case AARCH64_BUILTIN_SVE:
12451 return aarch64_sve::expand_builtin (subcode, exp, target);
12452 }
12453 gcc_unreachable ();
12454 }
12455
12456 /* Implement TARGET_BUILTIN_DECL. */
12457 static tree
12458 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12459 {
12460 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12461 switch (code & AARCH64_BUILTIN_CLASS)
12462 {
12463 case AARCH64_BUILTIN_GENERAL:
12464 return aarch64_general_builtin_decl (subcode, initialize_p);
12465
12466 case AARCH64_BUILTIN_SVE:
12467 return aarch64_sve::builtin_decl (subcode, initialize_p);
12468 }
12469 gcc_unreachable ();
12470 }
12471
12472 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12473 to optimize 1.0/sqrt. */
12474
12475 static bool
12476 use_rsqrt_p (machine_mode mode)
12477 {
12478 return (!flag_trapping_math
12479 && flag_unsafe_math_optimizations
12480 && ((aarch64_tune_params.approx_modes->recip_sqrt
12481 & AARCH64_APPROX_MODE (mode))
12482 || flag_mrecip_low_precision_sqrt));
12483 }
12484
12485 /* Function to decide when to use the approximate reciprocal square root
12486 builtin. */
12487
12488 static tree
12489 aarch64_builtin_reciprocal (tree fndecl)
12490 {
12491 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12492
12493 if (!use_rsqrt_p (mode))
12494 return NULL_TREE;
12495 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12496 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12497 switch (code & AARCH64_BUILTIN_CLASS)
12498 {
12499 case AARCH64_BUILTIN_GENERAL:
12500 return aarch64_general_builtin_rsqrt (subcode);
12501
12502 case AARCH64_BUILTIN_SVE:
12503 return NULL_TREE;
12504 }
12505 gcc_unreachable ();
12506 }
12507
12508 /* Emit instruction sequence to compute either the approximate square root
12509 or its approximate reciprocal, depending on the flag RECP, and return
12510 whether the sequence was emitted or not. */
12511
12512 bool
12513 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12514 {
12515 machine_mode mode = GET_MODE (dst);
12516
12517 if (GET_MODE_INNER (mode) == HFmode)
12518 {
12519 gcc_assert (!recp);
12520 return false;
12521 }
12522
12523 if (!recp)
12524 {
12525 if (!(flag_mlow_precision_sqrt
12526 || (aarch64_tune_params.approx_modes->sqrt
12527 & AARCH64_APPROX_MODE (mode))))
12528 return false;
12529
12530 if (flag_finite_math_only
12531 || flag_trapping_math
12532 || !flag_unsafe_math_optimizations
12533 || optimize_function_for_size_p (cfun))
12534 return false;
12535 }
12536 else
12537 /* Caller assumes we cannot fail. */
12538 gcc_assert (use_rsqrt_p (mode));
12539
12540 machine_mode mmsk = (VECTOR_MODE_P (mode)
12541 ? mode_for_int_vector (mode).require ()
12542 : int_mode_for_mode (mode).require ());
12543 rtx xmsk = gen_reg_rtx (mmsk);
12544 if (!recp)
12545 /* When calculating the approximate square root, compare the
12546 argument with 0.0 and create a mask. */
12547 emit_insn (gen_rtx_SET (xmsk,
12548 gen_rtx_NEG (mmsk,
12549 gen_rtx_EQ (mmsk, src,
12550 CONST0_RTX (mode)))));
12551
12552 /* Estimate the approximate reciprocal square root. */
12553 rtx xdst = gen_reg_rtx (mode);
12554 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12555
12556 /* Iterate over the series twice for SF and thrice for DF. */
12557 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12558
12559 /* Optionally run one fewer iteration of the series, trading some
12560 accuracy for speed. */
12561 if ((recp && flag_mrecip_low_precision_sqrt)
12562 || (!recp && flag_mlow_precision_sqrt))
12563 iterations--;
12564
12565 /* Iterate over the series to calculate the approximate reciprocal square
12566 root. */
12567 rtx x1 = gen_reg_rtx (mode);
12568 while (iterations--)
12569 {
12570 rtx x2 = gen_reg_rtx (mode);
12571 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
12572
12573 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12574
12575 if (iterations > 0)
12576 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
12577 }
12578
12579 if (!recp)
12580 {
12581 /* Qualify the approximate reciprocal square root when the argument is
12582 0.0 by squashing the intermediary result to 0.0. */
12583 rtx xtmp = gen_reg_rtx (mmsk);
12584 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12585 gen_rtx_SUBREG (mmsk, xdst, 0)));
12586 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12587
12588 /* Calculate the approximate square root. */
12589 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
12590 }
12591
12592 /* Finalize the approximation. */
12593 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
12594
12595 return true;
12596 }
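
/* Illustrative sketch of the recurrence implemented above (assuming
   FRSQRTS computes (3 - a * b) / 2 as documented in the Arm ARM):
   starting from the FRSQRTE estimate x0 ~= 1/sqrt(d), each pass computes

     x2 = x_n * x_n
     x1 = (3 - d * x2) / 2	-- gen_aarch64_rsqrts
     x_{n+1} = x_n * x1

   and, in the non-reciprocal case, sqrt(d) is recovered at the end as
   d * (1/sqrt(d)), with the earlier compare-against-0.0 mask forcing a
   0.0 result for a zero input.  */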
12597
12598 /* Emit the instruction sequence to compute the approximation for the division
12599 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
12600
12601 bool
12602 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12603 {
12604 machine_mode mode = GET_MODE (quo);
12605
12606 if (GET_MODE_INNER (mode) == HFmode)
12607 return false;
12608
12609 bool use_approx_division_p = (flag_mlow_precision_div
12610 || (aarch64_tune_params.approx_modes->division
12611 & AARCH64_APPROX_MODE (mode)));
12612
12613 if (!flag_finite_math_only
12614 || flag_trapping_math
12615 || !flag_unsafe_math_optimizations
12616 || optimize_function_for_size_p (cfun)
12617 || !use_approx_division_p)
12618 return false;
12619
12620 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12621 return false;
12622
12623 /* Estimate the approximate reciprocal. */
12624 rtx xrcp = gen_reg_rtx (mode);
12625 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12626
12627 /* Iterate over the series twice for SF and thrice for DF. */
12628 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12629
12630 /* Optionally run one fewer iteration of the series, trading some
12631 accuracy for speed. */
12632 if (flag_mlow_precision_div)
12633 iterations--;
12634
12635 /* Iterate over the series to calculate the approximate reciprocal. */
12636 rtx xtmp = gen_reg_rtx (mode);
12637 while (iterations--)
12638 {
12639 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12640
12641 if (iterations > 0)
12642 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
12643 }
12644
12645 if (num != CONST1_RTX (mode))
12646 {
12647 /* As the approximate reciprocal of DEN is already calculated, only
12648 calculate the approximate division when NUM is not 1.0. */
12649 rtx xnum = force_reg (mode, num);
12650 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
12651 }
12652
12653 /* Finalize the approximation. */
12654 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
12655 return true;
12656 }
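
/* Illustrative sketch of the recurrence implemented above (assuming
   FRECPS computes (2 - a * b) as documented in the Arm ARM): starting
   from the FRECPE estimate x0 ~= 1/den, each pass computes

     t = 2 - den * x_n		-- gen_aarch64_frecps
     x_{n+1} = x_n * t

   and the quotient NUM / DEN is then formed as num * x_n * t, with the
   last correction factor folded into the final multiply into QUO.  */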
12657
12658 /* Return the number of instructions that can be issued per cycle. */
12659 static int
12660 aarch64_sched_issue_rate (void)
12661 {
12662 return aarch64_tune_params.issue_rate;
12663 }
12664
12665 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12666 static int
12667 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12668 {
12669 if (DEBUG_INSN_P (insn))
12670 return more;
12671
12672 rtx_code code = GET_CODE (PATTERN (insn));
12673 if (code == USE || code == CLOBBER)
12674 return more;
12675
12676 if (get_attr_type (insn) == TYPE_NO_INSN)
12677 return more;
12678
12679 return more - 1;
12680 }
12681
12682 static int
12683 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12684 {
12685 int issue_rate = aarch64_sched_issue_rate ();
12686
12687 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12688 }
12689
12690
12691 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12692 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12693 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12694
12695 static int
12696 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
12697 int ready_index)
12698 {
12699 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
12700 }
12701
12702
12703 /* Vectorizer cost model target hooks. */
12704
12705 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12706 static int
12707 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
12708 tree vectype,
12709 int misalign ATTRIBUTE_UNUSED)
12710 {
12711 unsigned elements;
12712 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
12713 bool fp = false;
12714
12715 if (vectype != NULL)
12716 fp = FLOAT_TYPE_P (vectype);
12717
12718 switch (type_of_cost)
12719 {
12720 case scalar_stmt:
12721 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
12722
12723 case scalar_load:
12724 return costs->scalar_load_cost;
12725
12726 case scalar_store:
12727 return costs->scalar_store_cost;
12728
12729 case vector_stmt:
12730 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12731
12732 case vector_load:
12733 return costs->vec_align_load_cost;
12734
12735 case vector_store:
12736 return costs->vec_store_cost;
12737
12738 case vec_to_scalar:
12739 return costs->vec_to_scalar_cost;
12740
12741 case scalar_to_vec:
12742 return costs->scalar_to_vec_cost;
12743
12744 case unaligned_load:
12745 case vector_gather_load:
12746 return costs->vec_unalign_load_cost;
12747
12748 case unaligned_store:
12749 case vector_scatter_store:
12750 return costs->vec_unalign_store_cost;
12751
12752 case cond_branch_taken:
12753 return costs->cond_taken_branch_cost;
12754
12755 case cond_branch_not_taken:
12756 return costs->cond_not_taken_branch_cost;
12757
12758 case vec_perm:
12759 return costs->vec_permute_cost;
12760
12761 case vec_promote_demote:
12762 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12763
12764 case vec_construct:
12765 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12766 return elements / 2 + 1;
12767
12768 default:
12769 gcc_unreachable ();
12770 }
12771 }
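
/* For example, a vec_construct of a 4-element vector is costed above as
   4 / 2 + 1 = 3, independently of the per-core cost table, whereas the
   other entries come straight from aarch64_tune_params.vec_costs.  */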
12772
12773 /* Implement targetm.vectorize.add_stmt_cost. */
12774 static unsigned
12775 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
12776 struct _stmt_vec_info *stmt_info, int misalign,
12777 enum vect_cost_model_location where)
12778 {
12779 unsigned *cost = (unsigned *) data;
12780 unsigned retval = 0;
12781
12782 if (flag_vect_cost_model)
12783 {
12784 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
12785 int stmt_cost =
12786 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
12787
12788 /* Statements in an inner loop relative to the loop being
12789 vectorized are weighted more heavily. The value here is
12790 arbitrary and could potentially be improved with analysis. */
12791 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
12792 count *= 50; /* FIXME */
12793
12794 retval = (unsigned) (count * stmt_cost);
12795 cost[where] += retval;
12796 }
12797
12798 return retval;
12799 }
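
/* For example (illustrative only), a single vector_load statement that
   sits in an inner loop relative to the loop being vectorized is
   accumulated into the vect_body bucket as 50 * vec_align_load_cost,
   reflecting the arbitrary weighting noted above.  */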
12800
12801 static void initialize_aarch64_code_model (struct gcc_options *);
12802
12803 /* Parse the TO_PARSE string and put the architecture struct that it
12804 selects into RES and the architectural features into ISA_FLAGS.
12805 Return an aarch64_parse_opt_result describing the parse result.
12806 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12807 When the TO_PARSE string contains an invalid extension,
12808 a copy of the string is created and stored to INVALID_EXTENSION. */
12809
12810 static enum aarch64_parse_opt_result
12811 aarch64_parse_arch (const char *to_parse, const struct processor **res,
12812 uint64_t *isa_flags, std::string *invalid_extension)
12813 {
12814 const char *ext;
12815 const struct processor *arch;
12816 size_t len;
12817
12818 ext = strchr (to_parse, '+');
12819
12820 if (ext != NULL)
12821 len = ext - to_parse;
12822 else
12823 len = strlen (to_parse);
12824
12825 if (len == 0)
12826 return AARCH64_PARSE_MISSING_ARG;
12827
12828
12829 /* Loop through the list of supported ARCHes to find a match. */
12830 for (arch = all_architectures; arch->name != NULL; arch++)
12831 {
12832 if (strlen (arch->name) == len
12833 && strncmp (arch->name, to_parse, len) == 0)
12834 {
12835 uint64_t isa_temp = arch->flags;
12836
12837 if (ext != NULL)
12838 {
12839 /* TO_PARSE string contains at least one extension. */
12840 enum aarch64_parse_opt_result ext_res
12841 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12842
12843 if (ext_res != AARCH64_PARSE_OK)
12844 return ext_res;
12845 }
12846 /* Extension parsing was successful. Confirm the result
12847 arch and ISA flags. */
12848 *res = arch;
12849 *isa_flags = isa_temp;
12850 return AARCH64_PARSE_OK;
12851 }
12852 }
12853
12854 /* ARCH name not found in list. */
12855 return AARCH64_PARSE_INVALID_ARG;
12856 }
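
/* For example (illustrative only), the string "armv8.2-a+sve" is split at
   the first '+': "armv8.2-a" is looked up in all_architectures and the
   remainder "+sve" is handed to aarch64_parse_extension, which folds the
   extension's feature bits into the returned ISA flags.
   aarch64_parse_cpu below follows the same scheme for CPU names.  */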
12857
12858 /* Parse the TO_PARSE string and put the result tuning in RES and the
12859 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12860 describing the parse result. If there is an error parsing, RES and
12861 ISA_FLAGS are left unchanged.
12862 When the TO_PARSE string contains an invalid extension,
12863 a copy of the string is created and stored to INVALID_EXTENSION. */
12864
12865 static enum aarch64_parse_opt_result
12866 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12867 uint64_t *isa_flags, std::string *invalid_extension)
12868 {
12869 const char *ext;
12870 const struct processor *cpu;
12871 size_t len;
12872
12873 ext = strchr (to_parse, '+');
12874
12875 if (ext != NULL)
12876 len = ext - to_parse;
12877 else
12878 len = strlen (to_parse);
12879
12880 if (len == 0)
12881 return AARCH64_PARSE_MISSING_ARG;
12882
12883
12884 /* Loop through the list of supported CPUs to find a match. */
12885 for (cpu = all_cores; cpu->name != NULL; cpu++)
12886 {
12887 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12888 {
12889 uint64_t isa_temp = cpu->flags;
12890
12891
12892 if (ext != NULL)
12893 {
12894 /* TO_PARSE string contains at least one extension. */
12895 enum aarch64_parse_opt_result ext_res
12896 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12897
12898 if (ext_res != AARCH64_PARSE_OK)
12899 return ext_res;
12900 }
12901 /* Extension parsing was successful. Confirm the result
12902 cpu and ISA flags. */
12903 *res = cpu;
12904 *isa_flags = isa_temp;
12905 return AARCH64_PARSE_OK;
12906 }
12907 }
12908
12909 /* CPU name not found in list. */
12910 return AARCH64_PARSE_INVALID_ARG;
12911 }
12912
12913 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12914 Return an aarch64_parse_opt_result describing the parse result.
12915 If the parsing fails, RES does not change. */
12916
12917 static enum aarch64_parse_opt_result
12918 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12919 {
12920 const struct processor *cpu;
12921
12922 /* Loop through the list of supported CPUs to find a match. */
12923 for (cpu = all_cores; cpu->name != NULL; cpu++)
12924 {
12925 if (strcmp (cpu->name, to_parse) == 0)
12926 {
12927 *res = cpu;
12928 return AARCH64_PARSE_OK;
12929 }
12930 }
12931
12932 /* CPU name not found in list. */
12933 return AARCH64_PARSE_INVALID_ARG;
12934 }
12935
12936 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12937 described in FLAG. If it is, return the index bit for that fusion type.
12938 If not, error (printing OPTION_NAME) and return zero. */
12939
12940 static unsigned int
12941 aarch64_parse_one_option_token (const char *token,
12942 size_t length,
12943 const struct aarch64_flag_desc *flag,
12944 const char *option_name)
12945 {
12946 for (; flag->name != NULL; flag++)
12947 {
12948 if (length == strlen (flag->name)
12949 && !strncmp (flag->name, token, length))
12950 return flag->flag;
12951 }
12952
12953 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12954 return 0;
12955 }
12956
12957 /* Parse OPTION, which is a '.'-separated list of flags to enable.
12958 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12959 default state we inherit from the CPU tuning structures. OPTION_NAME
12960 gives the top-level option we are parsing in the -moverride string,
12961 for use in error messages. */
12962
12963 static unsigned int
12964 aarch64_parse_boolean_options (const char *option,
12965 const struct aarch64_flag_desc *flags,
12966 unsigned int initial_state,
12967 const char *option_name)
12968 {
12969 const char separator = '.';
12970 const char* specs = option;
12971 const char* ntoken = option;
12972 unsigned int found_flags = initial_state;
12973
12974 while ((ntoken = strchr (specs, separator)))
12975 {
12976 size_t token_length = ntoken - specs;
12977 unsigned token_ops = aarch64_parse_one_option_token (specs,
12978 token_length,
12979 flags,
12980 option_name);
12981 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12982 in the token stream, reset the supported operations. So:
12983
12984 adrp+add.cmp+branch.none.adrp+add
12985
12986 would have the result of turning on only adrp+add fusion. */
12987 if (!token_ops)
12988 found_flags = 0;
12989
12990 found_flags |= token_ops;
12991 specs = ++ntoken;
12992 }
12993
12994 /* The string ended with a trailing separator; diagnose it as ill-formed. */
12995 if (!(*specs))
12996 {
12997 error ("%s string ill-formed\n", option_name);
12998 return 0;
12999 }
13000
13001 /* We still have one more token to parse. */
13002 size_t token_length = strlen (specs);
13003 unsigned token_ops = aarch64_parse_one_option_token (specs,
13004 token_length,
13005 flags,
13006 option_name);
13007 if (!token_ops)
13008 found_flags = 0;
13009
13010 found_flags |= token_ops;
13011 return found_flags;
13012 }
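
/* For example (illustrative only), -moverride=fuse=adrp+add.cmp+branch
   reaches this parser as the string "adrp+add.cmp+branch" and enables
   those two fusion pairs on top of whatever INITIAL_STATE the CPU's
   tuning structure supplied.  */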
13013
13014 /* Support for overriding instruction fusion. */
13015
13016 static void
13017 aarch64_parse_fuse_string (const char *fuse_string,
13018 struct tune_params *tune)
13019 {
13020 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13021 aarch64_fusible_pairs,
13022 tune->fusible_ops,
13023 "fuse=");
13024 }
13025
13026 /* Support for overriding other tuning flags. */
13027
13028 static void
13029 aarch64_parse_tune_string (const char *tune_string,
13030 struct tune_params *tune)
13031 {
13032 tune->extra_tuning_flags
13033 = aarch64_parse_boolean_options (tune_string,
13034 aarch64_tuning_flags,
13035 tune->extra_tuning_flags,
13036 "tune=");
13037 }
13038
13039 /* Parse the sve_width -moverride substring in TUNE_STRING.
13040 Accept the valid SVE vector widths allowed by
13041 aarch64_sve_vector_bits_enum and use it to override sve_width
13042 in TUNE. */
13043
13044 static void
13045 aarch64_parse_sve_width_string (const char *tune_string,
13046 struct tune_params *tune)
13047 {
13048 int width = -1;
13049
13050 int n = sscanf (tune_string, "%d", &width);
13051 if (n == EOF)
13052 {
13053 error ("invalid format for sve_width");
13054 return;
13055 }
13056 switch (width)
13057 {
13058 case SVE_128:
13059 case SVE_256:
13060 case SVE_512:
13061 case SVE_1024:
13062 case SVE_2048:
13063 break;
13064 default:
13065 error ("invalid sve_width value: %d", width);
13066 }
13067 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13068 }
13069
13070 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13071 we understand. If it is, extract the option string and hand it off to
13072 the appropriate function. */
13073
13074 void
13075 aarch64_parse_one_override_token (const char* token,
13076 size_t length,
13077 struct tune_params *tune)
13078 {
13079 const struct aarch64_tuning_override_function *fn
13080 = aarch64_tuning_override_functions;
13081
13082 const char *option_part = strchr (token, '=');
13083 if (!option_part)
13084 {
13085 error ("tuning string missing in option (%s)", token);
13086 return;
13087 }
13088
13089 /* Get the length of the option name. */
13090 length = option_part - token;
13091 /* Skip the '=' to get to the option string. */
13092 option_part++;
13093
13094 for (; fn->name != NULL; fn++)
13095 {
13096 if (!strncmp (fn->name, token, length))
13097 {
13098 fn->parse_override (option_part, tune);
13099 return;
13100 }
13101 }
13102
13103 error ("unknown tuning option (%s)",token);
13104 return;
13105 }
13106
13107 /* Validate and clamp the TLS size for the selected code model. */
13108
13109 static void
13110 initialize_aarch64_tls_size (struct gcc_options *opts)
13111 {
13112 if (aarch64_tls_size == 0)
13113 aarch64_tls_size = 24;
13114
13115 switch (opts->x_aarch64_cmodel_var)
13116 {
13117 case AARCH64_CMODEL_TINY:
13118 /* Both the default and the maximum TLS size allowed under tiny are 1M,
13119 which needs two instructions to address, so we clamp the size to 24. */
13120 if (aarch64_tls_size > 24)
13121 aarch64_tls_size = 24;
13122 break;
13123 case AARCH64_CMODEL_SMALL:
13124 /* The maximum TLS size allowed under small is 4G. */
13125 if (aarch64_tls_size > 32)
13126 aarch64_tls_size = 32;
13127 break;
13128 case AARCH64_CMODEL_LARGE:
13129 /* The maximum TLS size allowed under large is 16E.
13130 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13131 if (aarch64_tls_size > 48)
13132 aarch64_tls_size = 48;
13133 break;
13134 default:
13135 gcc_unreachable ();
13136 }
13137
13138 return;
13139 }
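
/* For example, combining -mcmodel=tiny with -mtls-size=32 is quietly
   clamped here to a 24-bit TLS size, matching the 1M limit described in
   the tiny case above.  */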
13140
13141 /* Parse STRING looking for options in the format:
13142 string :: option:string
13143 option :: name=substring
13144 name :: {a-z}
13145 substring :: defined by option. */
13146
13147 static void
13148 aarch64_parse_override_string (const char* input_string,
13149 struct tune_params* tune)
13150 {
13151 const char separator = ':';
13152 size_t string_length = strlen (input_string) + 1;
13153 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13154 char *string = string_root;
13155 strncpy (string, input_string, string_length);
13156 string[string_length - 1] = '\0';
13157
13158 char* ntoken = string;
13159
13160 while ((ntoken = strchr (string, separator)))
13161 {
13162 size_t token_length = ntoken - string;
13163 /* Make this substring look like a string. */
13164 *ntoken = '\0';
13165 aarch64_parse_one_override_token (string, token_length, tune);
13166 string = ++ntoken;
13167 }
13168
13169 /* One last option to parse. */
13170 aarch64_parse_one_override_token (string, strlen (string), tune);
13171 free (string_root);
13172 }
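
/* For example (illustrative only), -moverride=sve_width=256:fuse=adrp+add
   is split at the ':' separators here, and each name=substring token is
   then dispatched by aarch64_parse_one_override_token to the matching
   handler (aarch64_parse_sve_width_string and aarch64_parse_fuse_string
   in this case).  */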
13173
13174
13175 static void
13176 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13177 {
13178 if (accepted_branch_protection_string)
13179 {
13180 opts->x_aarch64_branch_protection_string
13181 = xstrdup (accepted_branch_protection_string);
13182 }
13183
13184 /* PR 70044: We have to be careful about being called multiple times for the
13185 same function. This means all changes should be repeatable. */
13186
13187 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13188 Disable the frame pointer flag so the mid-end will not use a frame
13189 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13190 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13191 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13192 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13193 if (opts->x_flag_omit_frame_pointer == 0)
13194 opts->x_flag_omit_frame_pointer = 2;
13195
13196 /* If not optimizing for size, set the default
13197 alignment to what the target wants. */
13198 if (!opts->x_optimize_size)
13199 {
13200 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13201 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13202 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13203 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13204 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13205 opts->x_str_align_functions = aarch64_tune_params.function_align;
13206 }
13207
13208 /* We default to no pc-relative literal loads. */
13209
13210 aarch64_pcrelative_literal_loads = false;
13211
13212 /* If -mpc-relative-literal-loads is set on the command line, this
13213 implies that the user asked for PC relative literal loads. */
13214 if (opts->x_pcrelative_literal_loads == 1)
13215 aarch64_pcrelative_literal_loads = true;
13216
13217 /* In the tiny memory model it makes no sense to disallow PC relative
13218 literal pool loads. */
13219 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13220 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13221 aarch64_pcrelative_literal_loads = true;
13222
13223 /* When enabling the lower precision Newton series for the square root, also
13224 enable it for the reciprocal square root, since the latter is an
13225 intermediary step for the former. */
13226 if (flag_mlow_precision_sqrt)
13227 flag_mrecip_low_precision_sqrt = true;
13228 }
13229
13230 /* 'Unpack' the internal tuning structs and update the options
13231 in OPTS. The caller must have set up selected_tune and selected_arch
13232 as all the other target-specific codegen decisions are
13233 derived from them. */
13234
13235 void
13236 aarch64_override_options_internal (struct gcc_options *opts)
13237 {
13238 aarch64_tune_flags = selected_tune->flags;
13239 aarch64_tune = selected_tune->sched_core;
13240 /* Make a copy of the tuning parameters attached to the core, which
13241 we may later overwrite. */
13242 aarch64_tune_params = *(selected_tune->tune);
13243 aarch64_architecture_version = selected_arch->architecture_version;
13244
13245 if (opts->x_aarch64_override_tune_string)
13246 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13247 &aarch64_tune_params);
13248
13249 /* This target defaults to strict volatile bitfields. */
13250 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13251 opts->x_flag_strict_volatile_bitfields = 1;
13252
13253 if (aarch64_stack_protector_guard == SSP_GLOBAL
13254 && opts->x_aarch64_stack_protector_guard_offset_str)
13255 {
13256 error ("incompatible options %<-mstack-protector-guard=global%> and "
13257 "%<-mstack-protector-guard-offset=%s%>",
13258 aarch64_stack_protector_guard_offset_str);
13259 }
13260
13261 if (aarch64_stack_protector_guard == SSP_SYSREG
13262 && !(opts->x_aarch64_stack_protector_guard_offset_str
13263 && opts->x_aarch64_stack_protector_guard_reg_str))
13264 {
13265 error ("both %<-mstack-protector-guard-offset%> and "
13266 "%<-mstack-protector-guard-reg%> must be used "
13267 "with %<-mstack-protector-guard=sysreg%>");
13268 }
13269
13270 if (opts->x_aarch64_stack_protector_guard_reg_str)
13271 {
13272 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13273 error ("specify a system register with a small string length.");
13274 }
13275
13276 if (opts->x_aarch64_stack_protector_guard_offset_str)
13277 {
13278 char *end;
13279 const char *str = aarch64_stack_protector_guard_offset_str;
13280 errno = 0;
13281 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13282 if (!*str || *end || errno)
13283 error ("%qs is not a valid offset in %qs", str,
13284 "-mstack-protector-guard-offset=");
13285 aarch64_stack_protector_guard_offset = offs;
13286 }
13287
13288 initialize_aarch64_code_model (opts);
13289 initialize_aarch64_tls_size (opts);
13290
13291 int queue_depth = 0;
13292 switch (aarch64_tune_params.autoprefetcher_model)
13293 {
13294 case tune_params::AUTOPREFETCHER_OFF:
13295 queue_depth = -1;
13296 break;
13297 case tune_params::AUTOPREFETCHER_WEAK:
13298 queue_depth = 0;
13299 break;
13300 case tune_params::AUTOPREFETCHER_STRONG:
13301 queue_depth = max_insn_queue_index + 1;
13302 break;
13303 default:
13304 gcc_unreachable ();
13305 }
13306
13307 /* We don't mind passing in global_options_set here as we don't use
13308 the *options_set structs anyway. */
13309 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
13310 queue_depth,
13311 opts->x_param_values,
13312 global_options_set.x_param_values);
13313
13314 /* Set up parameters to be used in prefetching algorithm. Do not
13315 override the defaults unless we are tuning for a core we have
13316 researched values for. */
13317 if (aarch64_tune_params.prefetch->num_slots > 0)
13318 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
13319 aarch64_tune_params.prefetch->num_slots,
13320 opts->x_param_values,
13321 global_options_set.x_param_values);
13322 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13323 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
13324 aarch64_tune_params.prefetch->l1_cache_size,
13325 opts->x_param_values,
13326 global_options_set.x_param_values);
13327 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13328 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
13329 aarch64_tune_params.prefetch->l1_cache_line_size,
13330 opts->x_param_values,
13331 global_options_set.x_param_values);
13332 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13333 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
13334 aarch64_tune_params.prefetch->l2_cache_size,
13335 opts->x_param_values,
13336 global_options_set.x_param_values);
13337 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13338 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
13339 0,
13340 opts->x_param_values,
13341 global_options_set.x_param_values);
13342 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13343 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
13344 aarch64_tune_params.prefetch->minimum_stride,
13345 opts->x_param_values,
13346 global_options_set.x_param_values);
13347
13348 /* Use the alternative scheduling-pressure algorithm by default. */
13349 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
13350 opts->x_param_values,
13351 global_options_set.x_param_values);
13352
13353 /* If the user hasn't changed it via configure then set the default to 64 KB
13354 for the backend. */
13355 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
13356 DEFAULT_STK_CLASH_GUARD_SIZE == 0
13357 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
13358 opts->x_param_values,
13359 global_options_set.x_param_values);
13360
13361 /* Validate the guard size. */
13362 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
13363
13364 /* Enforce that the probing interval equals the guard size so the mid-end
13365 does the right thing. */
13366 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
13367 guard_size,
13368 opts->x_param_values,
13369 global_options_set.x_param_values);
13370
13371 /* The maybe_set calls won't update the value if the user has explicitly set
13372 one. Which means we need to validate that probing interval and guard size
13373 are equal. */
13374 int probe_interval
13375 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
13376 if (guard_size != probe_interval)
13377 error ("stack clash guard size %<%d%> must be equal to probing interval "
13378 "%<%d%>", guard_size, probe_interval);
13379
13380 /* Enable software prefetching at the specified optimization level for
13381 CPUs that have prefetch tuning data. Lower the optimization-level
13382 threshold by 1 when profiling is enabled. */
13383 if (opts->x_flag_prefetch_loop_arrays < 0
13384 && !opts->x_optimize_size
13385 && aarch64_tune_params.prefetch->default_opt_level >= 0
13386 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13387 opts->x_flag_prefetch_loop_arrays = 1;
13388
13389 if (opts->x_aarch64_arch_string == NULL)
13390 opts->x_aarch64_arch_string = selected_arch->name;
13391 if (opts->x_aarch64_cpu_string == NULL)
13392 opts->x_aarch64_cpu_string = selected_cpu->name;
13393 if (opts->x_aarch64_tune_string == NULL)
13394 opts->x_aarch64_tune_string = selected_tune->name;
13395
13396 aarch64_override_options_after_change_1 (opts);
13397 }
13398
13399 /* Print a hint with a suggestion for a core or architecture name that
13400 most closely resembles what the user passed in STR. ARCH is true if
13401 the user is asking for an architecture name. ARCH is false if the user
13402 is asking for a core name. */
13403
13404 static void
13405 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13406 {
13407 auto_vec<const char *> candidates;
13408 const struct processor *entry = arch ? all_architectures : all_cores;
13409 for (; entry->name != NULL; entry++)
13410 candidates.safe_push (entry->name);
13411
13412 #ifdef HAVE_LOCAL_CPU_DETECT
13413 /* Add also "native" as possible value. */
13414 if (arch)
13415 candidates.safe_push ("native");
13416 #endif
13417
13418 char *s;
13419 const char *hint = candidates_list_and_hint (str, s, candidates);
13420 if (hint)
13421 inform (input_location, "valid arguments are: %s;"
13422 " did you mean %qs?", s, hint);
13423 else
13424 inform (input_location, "valid arguments are: %s", s);
13425
13426 XDELETEVEC (s);
13427 }
13428
13429 /* Print a hint with a suggestion for a core name that most closely resembles
13430 what the user passed in STR. */
13431
13432 inline static void
13433 aarch64_print_hint_for_core (const char *str)
13434 {
13435 aarch64_print_hint_for_core_or_arch (str, false);
13436 }
13437
13438 /* Print a hint with a suggestion for an architecture name that most closely
13439 resembles what the user passed in STR. */
13440
13441 inline static void
13442 aarch64_print_hint_for_arch (const char *str)
13443 {
13444 aarch64_print_hint_for_core_or_arch (str, true);
13445 }
13446
13447
13448 /* Print a hint with a suggestion for an extension name
13449 that most closely resembles what the user passed in STR. */
13450
13451 void
13452 aarch64_print_hint_for_extensions (const std::string &str)
13453 {
13454 auto_vec<const char *> candidates;
13455 aarch64_get_all_extension_candidates (&candidates);
13456 char *s;
13457 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13458 if (hint)
13459 inform (input_location, "valid arguments are: %s;"
13460 " did you mean %qs?", s, hint);
13461 else
13462 inform (input_location, "valid arguments are: %s;", s);
13463
13464 XDELETEVEC (s);
13465 }
13466
13467 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13468 specified in STR and throw errors if appropriate. Put the results,
13469 if they are valid, in RES and ISA_FLAGS. Return whether the option is
13470 valid. */
13471
13472 static bool
13473 aarch64_validate_mcpu (const char *str, const struct processor **res,
13474 uint64_t *isa_flags)
13475 {
13476 std::string invalid_extension;
13477 enum aarch64_parse_opt_result parse_res
13478 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13479
13480 if (parse_res == AARCH64_PARSE_OK)
13481 return true;
13482
13483 switch (parse_res)
13484 {
13485 case AARCH64_PARSE_MISSING_ARG:
13486 error ("missing cpu name in %<-mcpu=%s%>", str);
13487 break;
13488 case AARCH64_PARSE_INVALID_ARG:
13489 error ("unknown value %qs for %<-mcpu%>", str);
13490 aarch64_print_hint_for_core (str);
13491 break;
13492 case AARCH64_PARSE_INVALID_FEATURE:
13493 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13494 invalid_extension.c_str (), str);
13495 aarch64_print_hint_for_extensions (invalid_extension);
13496 break;
13497 default:
13498 gcc_unreachable ();
13499 }
13500
13501 return false;
13502 }
13503
13504 /* Parses CONST_STR for branch protection features specified in
13505 aarch64_branch_protect_types, and sets any required global variables. Returns
13506 the parsing result and assigns LAST_STR to the last processed token from
13507 CONST_STR so that it can be used for error reporting. */
13508
13509 static enum aarch64_parse_opt_result
13510 aarch64_parse_branch_protection (const char *const_str,
13511 char **last_str)
13512 {
13513 char *str_root = xstrdup (const_str);
13514 char* token_save = NULL;
13515 char *str = strtok_r (str_root, "+", &token_save);
13516 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13517 if (!str)
13518 res = AARCH64_PARSE_MISSING_ARG;
13519 else
13520 {
13521 char *next_str = strtok_r (NULL, "+", &token_save);
13522 /* Reset the branch protection features to their defaults. */
13523 aarch64_handle_no_branch_protection (NULL, NULL);
13524
13525 while (str && res == AARCH64_PARSE_OK)
13526 {
13527 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13528 bool found = false;
13529 /* Search for this type. */
13530 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13531 {
13532 if (strcmp (str, type->name) == 0)
13533 {
13534 found = true;
13535 res = type->handler (str, next_str);
13536 str = next_str;
13537 next_str = strtok_r (NULL, "+", &token_save);
13538 }
13539 else
13540 type++;
13541 }
13542 if (found && res == AARCH64_PARSE_OK)
13543 {
13544 bool found_subtype = true;
13545 /* Loop through each token until we find one that isn't a
13546 subtype. */
13547 while (found_subtype)
13548 {
13549 found_subtype = false;
13550 const aarch64_branch_protect_type *subtype = type->subtypes;
13551 /* Search for the subtype. */
13552 while (str && subtype && subtype->name && !found_subtype
13553 && res == AARCH64_PARSE_OK)
13554 {
13555 if (strcmp (str, subtype->name) == 0)
13556 {
13557 found_subtype = true;
13558 res = subtype->handler (str, next_str);
13559 str = next_str;
13560 next_str = strtok_r (NULL, "+", &token_save);
13561 }
13562 else
13563 subtype++;
13564 }
13565 }
13566 }
13567 else if (!found)
13568 res = AARCH64_PARSE_INVALID_ARG;
13569 }
13570 }
13571 /* Copy the last processed token into the argument to pass it back.
13572 Used by option and attribute validation to print the offending token. */
13573 if (last_str)
13574 {
13575 if (str) strcpy (*last_str, str);
13576 else *last_str = NULL;
13577 }
13578 if (res == AARCH64_PARSE_OK)
13579 {
13580 /* If needed, alloc the accepted string then copy in const_str.
13581 Used by aarch64_override_options_after_change_1. */
13582 if (!accepted_branch_protection_string)
13583 accepted_branch_protection_string = (char *) xmalloc (
13584 BRANCH_PROTECT_STR_MAX
13585 + 1);
13586 strncpy (accepted_branch_protection_string, const_str,
13587 BRANCH_PROTECT_STR_MAX + 1);
13588 /* Forcibly null-terminate. */
13589 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13590 }
13591 return res;
13592 }
13593
13594 static bool
13595 aarch64_validate_mbranch_protection (const char *const_str)
13596 {
13597 char *str = (char *) xmalloc (strlen (const_str) + 1);
13598 enum aarch64_parse_opt_result res =
13599 aarch64_parse_branch_protection (const_str, &str);
13600 if (res == AARCH64_PARSE_INVALID_ARG)
13601 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13602 else if (res == AARCH64_PARSE_MISSING_ARG)
13603 error ("missing argument for %<-mbranch-protection=%>");
13604 free (str);
13605 return res == AARCH64_PARSE_OK;
13606 }
13607
13608 /* Validate a command-line -march option. Parse the arch and extensions
13609 (if any) specified in STR and throw errors if appropriate. Put the
13610 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13611 option is valid. */
13612
13613 static bool
13614 aarch64_validate_march (const char *str, const struct processor **res,
13615 uint64_t *isa_flags)
13616 {
13617 std::string invalid_extension;
13618 enum aarch64_parse_opt_result parse_res
13619 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13620
13621 if (parse_res == AARCH64_PARSE_OK)
13622 return true;
13623
13624 switch (parse_res)
13625 {
13626 case AARCH64_PARSE_MISSING_ARG:
13627 error ("missing arch name in %<-march=%s%>", str);
13628 break;
13629 case AARCH64_PARSE_INVALID_ARG:
13630 error ("unknown value %qs for %<-march%>", str);
13631 aarch64_print_hint_for_arch (str);
13632 break;
13633 case AARCH64_PARSE_INVALID_FEATURE:
13634 error ("invalid feature modifier %qs in %<-march=%s%>",
13635 invalid_extension.c_str (), str);
13636 aarch64_print_hint_for_extensions (invalid_extension);
13637 break;
13638 default:
13639 gcc_unreachable ();
13640 }
13641
13642 return false;
13643 }
13644
13645 /* Validate a command-line -mtune option. Parse the cpu
13646 specified in STR and throw errors if appropriate. Put the
13647 result, if it is valid, in RES. Return whether the option is
13648 valid. */
13649
13650 static bool
13651 aarch64_validate_mtune (const char *str, const struct processor **res)
13652 {
13653 enum aarch64_parse_opt_result parse_res
13654 = aarch64_parse_tune (str, res);
13655
13656 if (parse_res == AARCH64_PARSE_OK)
13657 return true;
13658
13659 switch (parse_res)
13660 {
13661 case AARCH64_PARSE_MISSING_ARG:
13662 error ("missing cpu name in %<-mtune=%s%>", str);
13663 break;
13664 case AARCH64_PARSE_INVALID_ARG:
13665 error ("unknown value %qs for %<-mtune%>", str);
13666 aarch64_print_hint_for_core (str);
13667 break;
13668 default:
13669 gcc_unreachable ();
13670 }
13671 return false;
13672 }
13673
13674 /* Return the CPU corresponding to the enum CPU.
13675 If it doesn't specify a cpu, return the default. */
13676
13677 static const struct processor *
13678 aarch64_get_tune_cpu (enum aarch64_processor cpu)
13679 {
13680 if (cpu != aarch64_none)
13681 return &all_cores[cpu];
13682
13683 /* The & 0x3f is to extract the bottom 6 bits that encode the
13684 default cpu as selected by the --with-cpu GCC configure option
13685 in config.gcc.
13686 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13687 flags mechanism should be reworked to make it more sane. */
13688 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13689 }
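
/* For example, TARGET_CPU_DEFAULT packs the configure-time --with-cpu
   selection into its bottom 6 bits and that CPU's default ISA flags into
   the bits above them; aarch64_override_options below recovers the flags
   half with TARGET_CPU_DEFAULT >> 6.  */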
13690
13691 /* Return the architecture corresponding to the enum ARCH.
13692 If it doesn't specify a valid architecture, return the default. */
13693
13694 static const struct processor *
13695 aarch64_get_arch (enum aarch64_arch arch)
13696 {
13697 if (arch != aarch64_no_arch)
13698 return &all_architectures[arch];
13699
13700 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13701
13702 return &all_architectures[cpu->arch];
13703 }
13704
13705 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
13706
13707 static poly_uint16
13708 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
13709 {
13710 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
13711 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
13712 deciding which .md file patterns to use and when deciding whether
13713 something is a legitimate address or constant. */
13714 if (value == SVE_SCALABLE || value == SVE_128)
13715 return poly_uint16 (2, 2);
13716 else
13717 return (int) value / 64;
13718 }
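
/* For example, -msve-vector-bits=512 yields the constant VG 512 / 64 = 8,
   whereas both -msve-vector-bits=scalable and (for the reason given above)
   -msve-vector-bits=128 yield the runtime-variable poly_uint16 (2, 2),
   i.e. 2 + 2 * n granules for n additional 128-bit blocks.  */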
13719
13720 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13721 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13722 tuning structs. In particular it must set selected_tune and
13723 aarch64_isa_flags that define the available ISA features and tuning
13724 decisions. It must also set selected_arch as this will be used to
13725 output the .arch asm tags for each function. */
13726
13727 static void
13728 aarch64_override_options (void)
13729 {
13730 uint64_t cpu_isa = 0;
13731 uint64_t arch_isa = 0;
13732 aarch64_isa_flags = 0;
13733
13734 bool valid_cpu = true;
13735 bool valid_tune = true;
13736 bool valid_arch = true;
13737
13738 selected_cpu = NULL;
13739 selected_arch = NULL;
13740 selected_tune = NULL;
13741
13742 if (aarch64_branch_protection_string)
13743 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13744
13745 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13746 If either of -march or -mtune is given, they override their
13747 respective component of -mcpu. */
13748 if (aarch64_cpu_string)
13749 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13750 &cpu_isa);
13751
13752 if (aarch64_arch_string)
13753 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13754 &arch_isa);
13755
13756 if (aarch64_tune_string)
13757 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13758
13759 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13760 SUBTARGET_OVERRIDE_OPTIONS;
13761 #endif
13762
13763 /* If the user did not specify a processor, choose the default
13764 one for them. This will be the CPU set during configuration using
13765 --with-cpu, otherwise it is "generic". */
13766 if (!selected_cpu)
13767 {
13768 if (selected_arch)
13769 {
13770 selected_cpu = &all_cores[selected_arch->ident];
13771 aarch64_isa_flags = arch_isa;
13772 explicit_arch = selected_arch->arch;
13773 }
13774 else
13775 {
13776 /* Get default configure-time CPU. */
13777 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
13778 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
13779 }
13780
13781 if (selected_tune)
13782 explicit_tune_core = selected_tune->ident;
13783 }
13784 /* If both -mcpu and -march are specified check that they are architecturally
13785 compatible, warn if they're not and prefer the -march ISA flags. */
13786 else if (selected_arch)
13787 {
13788 if (selected_arch->arch != selected_cpu->arch)
13789 {
13790 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13791 all_architectures[selected_cpu->arch].name,
13792 selected_arch->name);
13793 }
13794 aarch64_isa_flags = arch_isa;
13795 explicit_arch = selected_arch->arch;
13796 explicit_tune_core = selected_tune ? selected_tune->ident
13797 : selected_cpu->ident;
13798 }
13799 else
13800 {
13801 /* -mcpu but no -march. */
13802 aarch64_isa_flags = cpu_isa;
13803 explicit_tune_core = selected_tune ? selected_tune->ident
13804 : selected_cpu->ident;
13805 gcc_assert (selected_cpu);
13806 selected_arch = &all_architectures[selected_cpu->arch];
13807 explicit_arch = selected_arch->arch;
13808 }
13809
13810 /* Set the arch as well, as we will need it when outputting
13811 the .arch directive in assembly. */
13812 if (!selected_arch)
13813 {
13814 gcc_assert (selected_cpu);
13815 selected_arch = &all_architectures[selected_cpu->arch];
13816 }
13817
13818 if (!selected_tune)
13819 selected_tune = selected_cpu;
13820
13821 if (aarch64_enable_bti == 2)
13822 {
13823 #ifdef TARGET_ENABLE_BTI
13824 aarch64_enable_bti = 1;
13825 #else
13826 aarch64_enable_bti = 0;
13827 #endif
13828 }
13829
13830 /* Return address signing is currently not supported for ILP32 targets. For
13831 LP64 targets use the configured option in the absence of a command-line
13832 option for -mbranch-protection. */
13833 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
13834 {
13835 #ifdef TARGET_ENABLE_PAC_RET
13836 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13837 #else
13838 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13839 #endif
13840 }
13841
13842 #ifndef HAVE_AS_MABI_OPTION
13843 /* The compiler may have been configured with 2.23.* binutils, which does
13844 not have support for ILP32. */
13845 if (TARGET_ILP32)
13846 error ("assembler does not support %<-mabi=ilp32%>");
13847 #endif
13848
13849 /* Convert -msve-vector-bits to a VG count. */
13850 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13851
13852 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13853 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13854
13855 /* Make sure we properly set up the explicit options. */
13856 if ((aarch64_cpu_string && valid_cpu)
13857 || (aarch64_tune_string && valid_tune))
13858 gcc_assert (explicit_tune_core != aarch64_none);
13859
13860 if ((aarch64_cpu_string && valid_cpu)
13861 || (aarch64_arch_string && valid_arch))
13862 gcc_assert (explicit_arch != aarch64_no_arch);
13863
13864 /* The pass to insert speculation tracking runs before
13865 shrink-wrapping and the latter does not know how to update the
13866 tracking status. So disable it in this case. */
13867 if (aarch64_track_speculation)
13868 flag_shrink_wrap = 0;
13869
13870 aarch64_override_options_internal (&global_options);
13871
13872 /* Save these options as the default ones in case we push and pop them later
13873 while processing functions with potential target attributes. */
13874 target_option_default_node = target_option_current_node
13875 = build_target_option_node (&global_options);
13876 }
13877
13878 /* Implement targetm.override_options_after_change. */
13879
13880 static void
13881 aarch64_override_options_after_change (void)
13882 {
13883 aarch64_override_options_after_change_1 (&global_options);
13884 }
13885
13886 static struct machine_function *
13887 aarch64_init_machine_status (void)
13888 {
13889 struct machine_function *machine;
13890 machine = ggc_cleared_alloc<machine_function> ();
13891 return machine;
13892 }
13893
13894 void
13895 aarch64_init_expanders (void)
13896 {
13897 init_machine_status = aarch64_init_machine_status;
13898 }
13899
13900 /* Resolve the code model to use, taking PIC settings into account. */
13901 static void
13902 initialize_aarch64_code_model (struct gcc_options *opts)
13903 {
13904 if (opts->x_flag_pic)
13905 {
13906 switch (opts->x_aarch64_cmodel_var)
13907 {
13908 case AARCH64_CMODEL_TINY:
13909 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13910 break;
13911 case AARCH64_CMODEL_SMALL:
13912 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13913 aarch64_cmodel = (flag_pic == 2
13914 ? AARCH64_CMODEL_SMALL_PIC
13915 : AARCH64_CMODEL_SMALL_SPIC);
13916 #else
13917 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13918 #endif
13919 break;
13920 case AARCH64_CMODEL_LARGE:
13921 sorry ("code model %qs with %<-f%s%>", "large",
13922 opts->x_flag_pic > 1 ? "PIC" : "pic");
13923 break;
13924 default:
13925 gcc_unreachable ();
13926 }
13927 }
13928 else
13929 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13930 }
13931
13932 /* Implement TARGET_OPTION_SAVE. */
13933
13934 static void
13935 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13936 {
13937 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13938 ptr->x_aarch64_branch_protection_string
13939 = opts->x_aarch64_branch_protection_string;
13940 }
13941
13942 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13943 using the information saved in PTR. */
13944
13945 static void
13946 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13947 {
13948 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13949 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13950 opts->x_explicit_arch = ptr->x_explicit_arch;
13951 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13952 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13953 opts->x_aarch64_branch_protection_string
13954 = ptr->x_aarch64_branch_protection_string;
13955 if (opts->x_aarch64_branch_protection_string)
13956 {
13957 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13958 NULL);
13959 }
13960
13961 aarch64_override_options_internal (opts);
13962 }
13963
13964 /* Implement TARGET_OPTION_PRINT. */
13965
13966 static void
13967 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13968 {
13969 const struct processor *cpu
13970 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13971 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13972 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13973 std::string extension
13974 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13975
13976 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13977 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13978 arch->name, extension.c_str ());
13979 }
13980
13981 static GTY(()) tree aarch64_previous_fndecl;
13982
13983 void
13984 aarch64_reset_previous_fndecl (void)
13985 {
13986 aarch64_previous_fndecl = NULL;
13987 }
13988
13989 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13990 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13991 make sure optab availability predicates are recomputed when necessary. */
13992
13993 void
13994 aarch64_save_restore_target_globals (tree new_tree)
13995 {
13996 if (TREE_TARGET_GLOBALS (new_tree))
13997 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13998 else if (new_tree == target_option_default_node)
13999 restore_target_globals (&default_target_globals);
14000 else
14001 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14002 }
14003
14004 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14005 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14006 of the function, if such exists. This function may be called multiple
14007 times on a single function so use aarch64_previous_fndecl to avoid
14008 setting up identical state. */
14009
14010 static void
14011 aarch64_set_current_function (tree fndecl)
14012 {
14013 if (!fndecl || fndecl == aarch64_previous_fndecl)
14014 return;
14015
14016 tree old_tree = (aarch64_previous_fndecl
14017 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14018 : NULL_TREE);
14019
14020 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14021
14022 /* If current function has no attributes but the previous one did,
14023 use the default node. */
14024 if (!new_tree && old_tree)
14025 new_tree = target_option_default_node;
14026
14027 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14028 the default have been handled by aarch64_save_restore_target_globals from
14029 aarch64_pragma_target_parse. */
14030 if (old_tree == new_tree)
14031 return;
14032
14033 aarch64_previous_fndecl = fndecl;
14034
14035 /* First set the target options. */
14036 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14037
14038 aarch64_save_restore_target_globals (new_tree);
14039 }
14040
14041 /* Enum describing the various ways we can handle attributes.
14042 In many cases we can reuse the generic option handling machinery. */
14043
14044 enum aarch64_attr_opt_type
14045 {
14046 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14047 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14048 aarch64_attr_enum, /* Attribute sets an enum variable. */
14049 aarch64_attr_custom /* Attribute requires a custom handling function. */
14050 };
14051
14052 /* All the information needed to handle a target attribute.
14053 NAME is the name of the attribute.
14054 ATTR_TYPE specifies the type of behavior of the attribute as described
14055 in the definition of enum aarch64_attr_opt_type.
14056 ALLOW_NEG is true if the attribute supports a "no-" form.
14057 HANDLER is the function that takes the attribute string as an argument.
14058 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14059 OPT_NUM is the enum specifying the option that the attribute modifies.
14060 This is needed for attributes that mirror the behavior of a command-line
14061 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14062 aarch64_attr_enum. */
14063
14064 struct aarch64_attribute_info
14065 {
14066 const char *name;
14067 enum aarch64_attr_opt_type attr_type;
14068 bool allow_neg;
14069 bool (*handler) (const char *);
14070 enum opt_code opt_num;
14071 };
14072
14073 /* Handle the ARCH_STR argument to the arch= target attribute. */
14074
14075 static bool
14076 aarch64_handle_attr_arch (const char *str)
14077 {
14078 const struct processor *tmp_arch = NULL;
14079 std::string invalid_extension;
14080 enum aarch64_parse_opt_result parse_res
14081 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14082
14083 if (parse_res == AARCH64_PARSE_OK)
14084 {
14085 gcc_assert (tmp_arch);
14086 selected_arch = tmp_arch;
14087 explicit_arch = selected_arch->arch;
14088 return true;
14089 }
14090
14091 switch (parse_res)
14092 {
14093 case AARCH64_PARSE_MISSING_ARG:
14094 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14095 break;
14096 case AARCH64_PARSE_INVALID_ARG:
14097 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14098 aarch64_print_hint_for_arch (str);
14099 break;
14100 case AARCH64_PARSE_INVALID_FEATURE:
14101 error ("invalid feature modifier %s of value (\"%s\") in "
14102 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14103 aarch64_print_hint_for_extensions (invalid_extension);
14104 break;
14105 default:
14106 gcc_unreachable ();
14107 }
14108
14109 return false;
14110 }
14111
14112 /* Handle the argument CPU_STR to the cpu= target attribute. */
14113
14114 static bool
14115 aarch64_handle_attr_cpu (const char *str)
14116 {
14117 const struct processor *tmp_cpu = NULL;
14118 std::string invalid_extension;
14119 enum aarch64_parse_opt_result parse_res
14120 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14121
14122 if (parse_res == AARCH64_PARSE_OK)
14123 {
14124 gcc_assert (tmp_cpu);
14125 selected_tune = tmp_cpu;
14126 explicit_tune_core = selected_tune->ident;
14127
14128 selected_arch = &all_architectures[tmp_cpu->arch];
14129 explicit_arch = selected_arch->arch;
14130 return true;
14131 }
14132
14133 switch (parse_res)
14134 {
14135 case AARCH64_PARSE_MISSING_ARG:
14136 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14137 break;
14138 case AARCH64_PARSE_INVALID_ARG:
14139 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14140 aarch64_print_hint_for_core (str);
14141 break;
14142 case AARCH64_PARSE_INVALID_FEATURE:
14143 error ("invalid feature modifier %s of value (\"%s\") in "
14144 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14145 aarch64_print_hint_for_extensions (invalid_extension);
14146 break;
14147 default:
14148 gcc_unreachable ();
14149 }
14150
14151 return false;
14152 }
14153
14154 /* Handle the argument STR to the branch-protection= attribute. */
14155
14156 static bool
14157 aarch64_handle_attr_branch_protection (const char* str)
14158 {
14159 char *err_str = (char *) xmalloc (strlen (str) + 1);
14160 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14161 &err_str);
14162 bool success = false;
14163 switch (res)
14164 {
14165 case AARCH64_PARSE_MISSING_ARG:
14166 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14167 " attribute");
14168 break;
14169 case AARCH64_PARSE_INVALID_ARG:
14170 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14171 "=\")%> pragma or attribute", err_str);
14172 break;
14173 case AARCH64_PARSE_OK:
14174 success = true;
14175 /* Fall through. */
14176 case AARCH64_PARSE_INVALID_FEATURE:
14177 break;
14178 default:
14179 gcc_unreachable ();
14180 }
14181 free (err_str);
14182 return success;
14183 }
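
/* For illustration only (the accepted strings are defined by the
   -mbranch-protection= option handling, not by this file): typical values
   that reach the handler above look like

     __attribute__ ((target ("branch-protection=standard")))
     __attribute__ ((target ("branch-protection=pac-ret+leaf")))
     __attribute__ ((target ("branch-protection=bti")))

   while an unrecognised protection type such as "branch-protection=foo"
   takes the AARCH64_PARSE_INVALID_ARG path.  */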
14184
14185 /* Handle the argument STR to the tune= target attribute. */
14186
14187 static bool
14188 aarch64_handle_attr_tune (const char *str)
14189 {
14190 const struct processor *tmp_tune = NULL;
14191 enum aarch64_parse_opt_result parse_res
14192 = aarch64_parse_tune (str, &tmp_tune);
14193
14194 if (parse_res == AARCH64_PARSE_OK)
14195 {
14196 gcc_assert (tmp_tune);
14197 selected_tune = tmp_tune;
14198 explicit_tune_core = selected_tune->ident;
14199 return true;
14200 }
14201
14202 switch (parse_res)
14203 {
14204 case AARCH64_PARSE_INVALID_ARG:
14205 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14206 aarch64_print_hint_for_core (str);
14207 break;
14208 default:
14209 gcc_unreachable ();
14210 }
14211
14212 return false;
14213 }
14214
14215 /* Parse an architecture extensions target attribute string specified in STR.
14216 For example "+fp+nosimd". Show any errors if needed. Return TRUE
14217 if successful. Update aarch64_isa_flags to reflect the ISA features
14218 modified. */
14219
14220 static bool
14221 aarch64_handle_attr_isa_flags (char *str)
14222 {
14223 enum aarch64_parse_opt_result parse_res;
14224 uint64_t isa_flags = aarch64_isa_flags;
14225
14226 /* We allow "+nothing" in the beginning to clear out all architectural
14227 features if the user wants to handpick specific features. */
14228 if (strncmp ("+nothing", str, 8) == 0)
14229 {
14230 isa_flags = 0;
14231 str += 8;
14232 }
14233
14234 std::string invalid_extension;
14235 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14236
14237 if (parse_res == AARCH64_PARSE_OK)
14238 {
14239 aarch64_isa_flags = isa_flags;
14240 return true;
14241 }
14242
14243 switch (parse_res)
14244 {
14245 case AARCH64_PARSE_MISSING_ARG:
14246 error ("missing value in %<target()%> pragma or attribute");
14247 break;
14248
14249 case AARCH64_PARSE_INVALID_FEATURE:
14250 error ("invalid feature modifier %s of value (\"%s\") in "
14251 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14252 break;
14253
14254 default:
14255 gcc_unreachable ();
14256 }
14257
14258 return false;
14259 }
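
/* A minimal worked example of the strings handled above (illustrative
   only): "+nothing+simd" first clears every feature bit because of the
   "+nothing" prefix and then lets aarch64_parse_extension turn "+simd"
   (and whatever features it implies) back on, whereas a bare "+bogus"
   reports AARCH64_PARSE_INVALID_FEATURE with "bogus" as the invalid
   extension.  */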
14260
14261 /* The target attributes that we support. On top of these we also support just
14262 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14263 handled explicitly in aarch64_process_one_target_attr. */
14264
14265 static const struct aarch64_attribute_info aarch64_attributes[] =
14266 {
14267 { "general-regs-only", aarch64_attr_mask, false, NULL,
14268 OPT_mgeneral_regs_only },
14269 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14270 OPT_mfix_cortex_a53_835769 },
14271 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14272 OPT_mfix_cortex_a53_843419 },
14273 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14274 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14275 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14276 OPT_momit_leaf_frame_pointer },
14277 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14278 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14279 OPT_march_ },
14280 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14281 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14282 OPT_mtune_ },
14283 { "branch-protection", aarch64_attr_custom, false,
14284 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14285 { "sign-return-address", aarch64_attr_enum, false, NULL,
14286 OPT_msign_return_address_ },
14287 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14288 };
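
/* Illustrative (not exhaustive) examples of how the entries above appear
   in user code:

     __attribute__ ((target ("arch=armv8.2-a+crc")))          (custom handler)
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))  (negated boolean)
     __attribute__ ((target ("strict-align")))                (target_flags mask)
     __attribute__ ((target ("tls-dialect=trad")))            (enum option)

   The bare "+ext" form, e.g. __attribute__ ((target ("+crc"))), bypasses
   this table entirely, as noted in the comment above.  */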
14289
14290 /* Parse ARG_STR which contains the definition of one target attribute.
14291 Show appropriate errors if any or return true if the attribute is valid. */
14292
14293 static bool
14294 aarch64_process_one_target_attr (char *arg_str)
14295 {
14296 bool invert = false;
14297
14298 size_t len = strlen (arg_str);
14299
14300 if (len == 0)
14301 {
14302 error ("malformed %<target()%> pragma or attribute");
14303 return false;
14304 }
14305
14306 char *str_to_check = (char *) alloca (len + 1);
14307 strcpy (str_to_check, arg_str);
14308
14309 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14310 It is easier to detect and handle it explicitly here rather than going
14311 through the machinery for the rest of the target attributes in this
14312 function. */
14313 if (*str_to_check == '+')
14314 return aarch64_handle_attr_isa_flags (str_to_check);
14315
14316 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14317 {
14318 invert = true;
14319 str_to_check += 3;
14320 }
14321 char *arg = strchr (str_to_check, '=');
14322
14323 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14324 and point ARG to "foo". */
14325 if (arg)
14326 {
14327 *arg = '\0';
14328 arg++;
14329 }
14330 const struct aarch64_attribute_info *p_attr;
14331 bool found = false;
14332 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14333 {
14334 /* If the names don't match up, or the user has given an argument
14335 to an attribute that doesn't accept one, or didn't give an argument
14336 to an attribute that expects one, fail to match. */
14337 if (strcmp (str_to_check, p_attr->name) != 0)
14338 continue;
14339
14340 found = true;
14341 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14342 || p_attr->attr_type == aarch64_attr_enum;
14343
14344 if (attr_need_arg_p ^ (arg != NULL))
14345 {
14346 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14347 return false;
14348 }
14349
14350 /* If the name matches but the attribute does not allow "no-" versions
14351 then we can't match. */
14352 if (invert && !p_attr->allow_neg)
14353 {
14354 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14355 return false;
14356 }
14357
14358 switch (p_attr->attr_type)
14359 {
14360 /* Has a custom handler registered.
14361 For example, cpu=, arch=, tune=. */
14362 case aarch64_attr_custom:
14363 gcc_assert (p_attr->handler);
14364 if (!p_attr->handler (arg))
14365 return false;
14366 break;
14367
14368 /* Either set or unset a boolean option. */
14369 case aarch64_attr_bool:
14370 {
14371 struct cl_decoded_option decoded;
14372
14373 generate_option (p_attr->opt_num, NULL, !invert,
14374 CL_TARGET, &decoded);
14375 aarch64_handle_option (&global_options, &global_options_set,
14376 &decoded, input_location);
14377 break;
14378 }
14379 /* Set or unset a bit in the target_flags. aarch64_handle_option
14380 should know what mask to apply given the option number. */
14381 case aarch64_attr_mask:
14382 {
14383 struct cl_decoded_option decoded;
14384 /* We only need to specify the option number.
14385 aarch64_handle_option will know which mask to apply. */
14386 decoded.opt_index = p_attr->opt_num;
14387 decoded.value = !invert;
14388 aarch64_handle_option (&global_options, &global_options_set,
14389 &decoded, input_location);
14390 break;
14391 }
14392 /* Use the option setting machinery to set an option to an enum. */
14393 case aarch64_attr_enum:
14394 {
14395 gcc_assert (arg);
14396 bool valid;
14397 int value;
14398 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14399 &value, CL_TARGET);
14400 if (valid)
14401 {
14402 set_option (&global_options, NULL, p_attr->opt_num, value,
14403 NULL, DK_UNSPECIFIED, input_location,
14404 global_dc);
14405 }
14406 else
14407 {
14408 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14409 }
14410 break;
14411 }
14412 default:
14413 gcc_unreachable ();
14414 }
14415 }
14416
14417 /* If we reached here we either have found an attribute and validated
14418 it or didn't match any. If we matched an attribute but its arguments
14419 were malformed we will have returned false already. */
14420 return found;
14421 }
14422
14423 /* Count how many times the character C appears in
14424 NULL-terminated string STR. */
14425
14426 static unsigned int
14427 num_occurences_in_str (char c, char *str)
14428 {
14429 unsigned int res = 0;
14430 while (*str != '\0')
14431 {
14432 if (*str == c)
14433 res++;
14434
14435 str++;
14436 }
14437
14438 return res;
14439 }
14440
14441 /* Parse the tree in ARGS that contains the target attribute information
14442 and update the global target options space. */
14443
14444 bool
14445 aarch64_process_target_attr (tree args)
14446 {
14447 if (TREE_CODE (args) == TREE_LIST)
14448 {
14449 do
14450 {
14451 tree head = TREE_VALUE (args);
14452 if (head)
14453 {
14454 if (!aarch64_process_target_attr (head))
14455 return false;
14456 }
14457 args = TREE_CHAIN (args);
14458 } while (args);
14459
14460 return true;
14461 }
14462
14463 if (TREE_CODE (args) != STRING_CST)
14464 {
14465 error ("attribute %<target%> argument not a string");
14466 return false;
14467 }
14468
14469 size_t len = strlen (TREE_STRING_POINTER (args));
14470 char *str_to_check = (char *) alloca (len + 1);
14471 strcpy (str_to_check, TREE_STRING_POINTER (args));
14472
14473 if (len == 0)
14474 {
14475 error ("malformed %<target()%> pragma or attribute");
14476 return false;
14477 }
14478
14479 /* Used to catch empty strings between commas, i.e.
14480 attribute ((target ("attr1,,attr2"))). */
14481 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14482
14483 /* Handle multiple target attributes separated by ','. */
14484 char *token = strtok_r (str_to_check, ",", &str_to_check);
14485
14486 unsigned int num_attrs = 0;
14487 while (token)
14488 {
14489 num_attrs++;
14490 if (!aarch64_process_one_target_attr (token))
14491 {
14492 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14493 return false;
14494 }
14495
14496 token = strtok_r (NULL, ",", &str_to_check);
14497 }
14498
14499 if (num_attrs != num_commas + 1)
14500 {
14501 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14502 return false;
14503 }
14504
14505 return true;
14506 }
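
/* Worked example of the splitting logic above (illustrative only):
   target ("tune=cortex-a72,no-strict-align") contains one comma and is
   split into two tokens, each handed to aarch64_process_one_target_attr,
   so num_attrs == num_commas + 1 and the string is accepted.  By contrast,
   target ("tune=cortex-a72,,no-strict-align") produces two commas but
   still only two tokens (strtok_r skips the empty field), so the final
   check rejects it as malformed.  */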
14507
14508 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14509 process attribute ((target ("..."))). */
14510
14511 static bool
14512 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14513 {
14514 struct cl_target_option cur_target;
14515 bool ret;
14516 tree old_optimize;
14517 tree new_target, new_optimize;
14518 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14519
14520 /* If what we're processing is the current pragma string then the
14521 target option node is already stored in target_option_current_node
14522 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14523 having to re-parse the string. This is especially useful to keep
14524 arm_neon.h compile times down since that header contains a lot
14525 of intrinsics enclosed in pragmas. */
14526 if (!existing_target && args == current_target_pragma)
14527 {
14528 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14529 return true;
14530 }
14531 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14532
14533 old_optimize = build_optimization_node (&global_options);
14534 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14535
14536 /* If the function changed the optimization levels as well as setting
14537 target options, start with the optimizations specified. */
14538 if (func_optimize && func_optimize != old_optimize)
14539 cl_optimization_restore (&global_options,
14540 TREE_OPTIMIZATION (func_optimize));
14541
14542 /* Save the current target options to restore at the end. */
14543 cl_target_option_save (&cur_target, &global_options);
14544
14545 /* If fndecl already has some target attributes applied to it, unpack
14546 them so that we add this attribute on top of them, rather than
14547 overwriting them. */
14548 if (existing_target)
14549 {
14550 struct cl_target_option *existing_options
14551 = TREE_TARGET_OPTION (existing_target);
14552
14553 if (existing_options)
14554 cl_target_option_restore (&global_options, existing_options);
14555 }
14556 else
14557 cl_target_option_restore (&global_options,
14558 TREE_TARGET_OPTION (target_option_current_node));
14559
14560 ret = aarch64_process_target_attr (args);
14561
14562 /* Set up any additional state. */
14563 if (ret)
14564 {
14565 aarch64_override_options_internal (&global_options);
14566 /* Initialize SIMD builtins if we haven't already.
14567 Set current_target_pragma to NULL for the duration so that
14568 the builtin initialization code doesn't try to tag the functions
14569 being built with the attributes specified by any current pragma, thus
14570 going into an infinite recursion. */
14571 if (TARGET_SIMD)
14572 {
14573 tree saved_current_target_pragma = current_target_pragma;
14574 current_target_pragma = NULL;
14575 aarch64_init_simd_builtins ();
14576 current_target_pragma = saved_current_target_pragma;
14577 }
14578 new_target = build_target_option_node (&global_options);
14579 }
14580 else
14581 new_target = NULL;
14582
14583 new_optimize = build_optimization_node (&global_options);
14584
14585 if (fndecl && ret)
14586 {
14587 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14588
14589 if (old_optimize != new_optimize)
14590 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14591 }
14592
14593 cl_target_option_restore (&global_options, &cur_target);
14594
14595 if (old_optimize != new_optimize)
14596 cl_optimization_restore (&global_options,
14597 TREE_OPTIMIZATION (old_optimize));
14598 return ret;
14599 }
14600
14601 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14602 the values of tri-bool options (yes, no, DONT_CARE) and the default value
14603 is DEF, determine whether to reject inlining. */
14604
14605 static bool
14606 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14607 int dont_care, int def)
14608 {
14609 /* If the callee doesn't care, always allow inlining. */
14610 if (callee == dont_care)
14611 return true;
14612
14613 /* If the caller doesn't care, always allow inlining. */
14614 if (caller == dont_care)
14615 return true;
14616
14617 /* Otherwise, allow inlining if either the callee and caller values
14618 agree, or if the callee is using the default value. */
14619 return (callee == caller || callee == def);
14620 }
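
/* A worked example, using the DONT_CARE/DEF values that the
   -momit-leaf-frame-pointer check below passes (2 and 1): a caller
   compiled with an explicit -momit-leaf-frame-pointer (1) may not inline
   a callee compiled with an explicit -mno-omit-leaf-frame-pointer (0),
   since 0 matches neither the caller nor the default; but if either side
   left the option at its "don't care" setting (2), inlining is allowed.  */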
14621
14622 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14623 to inline CALLEE into CALLER based on target-specific info.
14624 Make sure that the caller and callee have compatible architectural
14625 features. Then go through the other possible target attributes
14626 and see if they can block inlining. Try not to reject always_inline
14627 callees unless they are incompatible architecturally. */
14628
14629 static bool
14630 aarch64_can_inline_p (tree caller, tree callee)
14631 {
14632 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14633 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14634
14635 struct cl_target_option *caller_opts
14636 = TREE_TARGET_OPTION (caller_tree ? caller_tree
14637 : target_option_default_node);
14638
14639 struct cl_target_option *callee_opts
14640 = TREE_TARGET_OPTION (callee_tree ? callee_tree
14641 : target_option_default_node);
14642
14643 /* Callee's ISA flags should be a subset of the caller's. */
14644 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
14645 != callee_opts->x_aarch64_isa_flags)
14646 return false;
14647
14648 /* Allow non-strict aligned functions inlining into strict
14649 aligned ones. */
14650 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
14651 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
14652 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
14653 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
14654 return false;
14655
14656 bool always_inline = lookup_attribute ("always_inline",
14657 DECL_ATTRIBUTES (callee));
14658
14659 /* If the architectural features match up and the callee is always_inline
14660 then the other attributes don't matter. */
14661 if (always_inline)
14662 return true;
14663
14664 if (caller_opts->x_aarch64_cmodel_var
14665 != callee_opts->x_aarch64_cmodel_var)
14666 return false;
14667
14668 if (caller_opts->x_aarch64_tls_dialect
14669 != callee_opts->x_aarch64_tls_dialect)
14670 return false;
14671
14672 /* Honour explicit requests to workaround errata. */
14673 if (!aarch64_tribools_ok_for_inlining_p (
14674 caller_opts->x_aarch64_fix_a53_err835769,
14675 callee_opts->x_aarch64_fix_a53_err835769,
14676 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
14677 return false;
14678
14679 if (!aarch64_tribools_ok_for_inlining_p (
14680 caller_opts->x_aarch64_fix_a53_err843419,
14681 callee_opts->x_aarch64_fix_a53_err843419,
14682 2, TARGET_FIX_ERR_A53_843419))
14683 return false;
14684
14685 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14686 caller and callee and they don't match up, reject inlining. */
14687 if (!aarch64_tribools_ok_for_inlining_p (
14688 caller_opts->x_flag_omit_leaf_frame_pointer,
14689 callee_opts->x_flag_omit_leaf_frame_pointer,
14690 2, 1))
14691 return false;
14692
14693 /* If the callee has specific tuning overrides, respect them. */
14694 if (callee_opts->x_aarch64_override_tune_string != NULL
14695 && caller_opts->x_aarch64_override_tune_string == NULL)
14696 return false;
14697
14698 /* If the user specified tuning override strings for the
14699 caller and callee and they don't match up, reject inlining.
14700 We just do a string compare here, we don't analyze the meaning
14701 of the string, as it would be too costly for little gain. */
14702 if (callee_opts->x_aarch64_override_tune_string
14703 && caller_opts->x_aarch64_override_tune_string
14704 && (strcmp (callee_opts->x_aarch64_override_tune_string,
14705 caller_opts->x_aarch64_override_tune_string) != 0))
14706 return false;
14707
14708 return true;
14709 }
14710
14711 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
14712 hasn't been initialized already. */
14713
14714 unsigned int
14715 aarch64_tlsdesc_abi_id ()
14716 {
14717 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
14718 if (!tlsdesc_abi.initialized_p ())
14719 {
14720 HARD_REG_SET full_reg_clobbers;
14721 CLEAR_HARD_REG_SET (full_reg_clobbers);
14722 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
14723 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
14724 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
14725 SET_HARD_REG_BIT (full_reg_clobbers, regno);
14726 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
14727 }
14728 return tlsdesc_abi.id ();
14729 }
14730
14731 /* Return true if SYMBOL_REF X binds locally. */
14732
14733 static bool
14734 aarch64_symbol_binds_local_p (const_rtx x)
14735 {
14736 return (SYMBOL_REF_DECL (x)
14737 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
14738 : SYMBOL_REF_LOCAL_P (x));
14739 }
14740
14741 /* Return true if SYMBOL_REF X is thread local. */
14742 static bool
14743 aarch64_tls_symbol_p (rtx x)
14744 {
14745 if (! TARGET_HAVE_TLS)
14746 return false;
14747
14748 if (GET_CODE (x) != SYMBOL_REF)
14749 return false;
14750
14751 return SYMBOL_REF_TLS_MODEL (x) != 0;
14752 }
14753
14754 /* Classify a TLS symbol into one of the TLS kinds. */
14755 enum aarch64_symbol_type
14756 aarch64_classify_tls_symbol (rtx x)
14757 {
14758 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14759
14760 switch (tls_kind)
14761 {
14762 case TLS_MODEL_GLOBAL_DYNAMIC:
14763 case TLS_MODEL_LOCAL_DYNAMIC:
14764 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
14765
14766 case TLS_MODEL_INITIAL_EXEC:
14767 switch (aarch64_cmodel)
14768 {
14769 case AARCH64_CMODEL_TINY:
14770 case AARCH64_CMODEL_TINY_PIC:
14771 return SYMBOL_TINY_TLSIE;
14772 default:
14773 return SYMBOL_SMALL_TLSIE;
14774 }
14775
14776 case TLS_MODEL_LOCAL_EXEC:
14777 if (aarch64_tls_size == 12)
14778 return SYMBOL_TLSLE12;
14779 else if (aarch64_tls_size == 24)
14780 return SYMBOL_TLSLE24;
14781 else if (aarch64_tls_size == 32)
14782 return SYMBOL_TLSLE32;
14783 else if (aarch64_tls_size == 48)
14784 return SYMBOL_TLSLE48;
14785 else
14786 gcc_unreachable ();
14787
14788 case TLS_MODEL_EMULATED:
14789 case TLS_MODEL_NONE:
14790 return SYMBOL_FORCE_TO_MEM;
14791
14792 default:
14793 gcc_unreachable ();
14794 }
14795 }
14796
14797 /* Return the correct method for accessing X + OFFSET, where X is either
14798 a SYMBOL_REF or LABEL_REF. */
14799
14800 enum aarch64_symbol_type
14801 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
14802 {
14803 if (GET_CODE (x) == LABEL_REF)
14804 {
14805 switch (aarch64_cmodel)
14806 {
14807 case AARCH64_CMODEL_LARGE:
14808 return SYMBOL_FORCE_TO_MEM;
14809
14810 case AARCH64_CMODEL_TINY_PIC:
14811 case AARCH64_CMODEL_TINY:
14812 return SYMBOL_TINY_ABSOLUTE;
14813
14814 case AARCH64_CMODEL_SMALL_SPIC:
14815 case AARCH64_CMODEL_SMALL_PIC:
14816 case AARCH64_CMODEL_SMALL:
14817 return SYMBOL_SMALL_ABSOLUTE;
14818
14819 default:
14820 gcc_unreachable ();
14821 }
14822 }
14823
14824 if (GET_CODE (x) == SYMBOL_REF)
14825 {
14826 if (aarch64_tls_symbol_p (x))
14827 return aarch64_classify_tls_symbol (x);
14828
14829 switch (aarch64_cmodel)
14830 {
14831 case AARCH64_CMODEL_TINY:
14832 /* When we retrieve symbol + offset address, we have to make sure
14833 the offset does not cause overflow of the final address. But
14834 we have no way of knowing the address of symbol at compile time
14835 so we can't accurately say if the distance between the PC and
14836 symbol + offset is outside the addressable range of +/-1MB in the
14837 TINY code model. So we limit the maximum offset to +/-64KB and
14838 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
14839 If offset_within_block_p is true we allow larger offsets.
14840 Furthermore force to memory if the symbol is a weak reference to
14841 something that doesn't resolve to a symbol in this module. */
14842
14843 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
14844 return SYMBOL_FORCE_TO_MEM;
14845 if (!(IN_RANGE (offset, -0x10000, 0x10000)
14846 || offset_within_block_p (x, offset)))
14847 return SYMBOL_FORCE_TO_MEM;
14848
14849 return SYMBOL_TINY_ABSOLUTE;
14850
14851 case AARCH64_CMODEL_SMALL:
14852 /* Same reasoning as the tiny code model, but the offset cap here is
14853 1MB, allowing +/-3.9GB for the offset to the symbol. */
14854
14855 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
14856 return SYMBOL_FORCE_TO_MEM;
14857 if (!(IN_RANGE (offset, -0x100000, 0x100000)
14858 || offset_within_block_p (x, offset)))
14859 return SYMBOL_FORCE_TO_MEM;
14860
14861 return SYMBOL_SMALL_ABSOLUTE;
14862
14863 case AARCH64_CMODEL_TINY_PIC:
14864 if (!aarch64_symbol_binds_local_p (x))
14865 return SYMBOL_TINY_GOT;
14866 return SYMBOL_TINY_ABSOLUTE;
14867
14868 case AARCH64_CMODEL_SMALL_SPIC:
14869 case AARCH64_CMODEL_SMALL_PIC:
14870 if (!aarch64_symbol_binds_local_p (x))
14871 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14872 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14873 return SYMBOL_SMALL_ABSOLUTE;
14874
14875 case AARCH64_CMODEL_LARGE:
14876 /* This is alright even in PIC code as the constant
14877 pool reference is always PC relative and within
14878 the same translation unit. */
14879 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14880 return SYMBOL_SMALL_ABSOLUTE;
14881 else
14882 return SYMBOL_FORCE_TO_MEM;
14883
14884 default:
14885 gcc_unreachable ();
14886 }
14887 }
14888
14889 /* By default push everything into the constant pool. */
14890 return SYMBOL_FORCE_TO_MEM;
14891 }
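
/* Worked examples for the offset checks above (illustrative only, for a
   non-weak symbol): under -mcmodel=tiny a reference to sym + 0x8000 stays
   SYMBOL_TINY_ABSOLUTE because 0x8000 is within the +/-64KB cap, while
   sym + 0x20000 is forced to memory unless offset_within_block_p says the
   offset is known to stay inside sym's own block.  Under -mcmodel=small
   the cap is +/-1MB (0x100000) instead.  */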
14892
14893 bool
14894 aarch64_constant_address_p (rtx x)
14895 {
14896 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14897 }
14898
14899 bool
14900 aarch64_legitimate_pic_operand_p (rtx x)
14901 {
14902 if (GET_CODE (x) == SYMBOL_REF
14903 || (GET_CODE (x) == CONST
14904 && GET_CODE (XEXP (x, 0)) == PLUS
14905 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14906 return false;
14907
14908 return true;
14909 }
14910
14911 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14912 that should be rematerialized rather than spilled. */
14913
14914 static bool
14915 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14916 {
14917 /* Support CSE and rematerialization of common constants. */
14918 if (CONST_INT_P (x)
14919 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14920 || GET_CODE (x) == CONST_VECTOR)
14921 return true;
14922
14923 /* Do not allow vector struct mode constants for Advanced SIMD.
14924 We could support 0 and -1 easily, but they need support in
14925 aarch64-simd.md. */
14926 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14927 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14928 return false;
14929
14930 /* Only accept variable-length vector constants if they can be
14931 handled directly.
14932
14933 ??? It would be possible to handle rematerialization of other
14934 constants via secondary reloads. */
14935 if (vec_flags & VEC_ANY_SVE)
14936 return aarch64_simd_valid_immediate (x, NULL);
14937
14938 if (GET_CODE (x) == HIGH)
14939 x = XEXP (x, 0);
14940
14941 /* Accept polynomial constants that can be calculated by using the
14942 destination of a move as the sole temporary. Constants that
14943 require a second temporary cannot be rematerialized (they can't be
14944 forced to memory and also aren't legitimate constants). */
14945 poly_int64 offset;
14946 if (poly_int_rtx_p (x, &offset))
14947 return aarch64_offset_temporaries (false, offset) <= 1;
14948
14949 /* If an offset is being added to something else, we need to allow the
14950 base to be moved into the destination register, meaning that there
14951 are no free temporaries for the offset. */
14952 x = strip_offset (x, &offset);
14953 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14954 return false;
14955
14956 /* Do not allow const (plus (anchor_symbol, const_int)). */
14957 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14958 return false;
14959
14960 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14961 so spilling them is better than rematerialization. */
14962 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14963 return true;
14964
14965 /* Label references are always constant. */
14966 if (GET_CODE (x) == LABEL_REF)
14967 return true;
14968
14969 return false;
14970 }
14971
14972 rtx
14973 aarch64_load_tp (rtx target)
14974 {
14975 if (!target
14976 || GET_MODE (target) != Pmode
14977 || !register_operand (target, Pmode))
14978 target = gen_reg_rtx (Pmode);
14979
14980 /* Can return in any reg. */
14981 emit_insn (gen_aarch64_load_tp_hard (target));
14982 return target;
14983 }
14984
14985 /* On AAPCS systems, this is the "struct __va_list". */
14986 static GTY(()) tree va_list_type;
14987
14988 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14989 Return the type to use as __builtin_va_list.
14990
14991 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14992
14993 struct __va_list
14994 {
14995 void *__stack;
14996 void *__gr_top;
14997 void *__vr_top;
14998 int __gr_offs;
14999 int __vr_offs;
15000 }; */
15001
15002 static tree
15003 aarch64_build_builtin_va_list (void)
15004 {
15005 tree va_list_name;
15006 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15007
15008 /* Create the type. */
15009 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15010 /* Give it the required name. */
15011 va_list_name = build_decl (BUILTINS_LOCATION,
15012 TYPE_DECL,
15013 get_identifier ("__va_list"),
15014 va_list_type);
15015 DECL_ARTIFICIAL (va_list_name) = 1;
15016 TYPE_NAME (va_list_type) = va_list_name;
15017 TYPE_STUB_DECL (va_list_type) = va_list_name;
15018
15019 /* Create the fields. */
15020 f_stack = build_decl (BUILTINS_LOCATION,
15021 FIELD_DECL, get_identifier ("__stack"),
15022 ptr_type_node);
15023 f_grtop = build_decl (BUILTINS_LOCATION,
15024 FIELD_DECL, get_identifier ("__gr_top"),
15025 ptr_type_node);
15026 f_vrtop = build_decl (BUILTINS_LOCATION,
15027 FIELD_DECL, get_identifier ("__vr_top"),
15028 ptr_type_node);
15029 f_groff = build_decl (BUILTINS_LOCATION,
15030 FIELD_DECL, get_identifier ("__gr_offs"),
15031 integer_type_node);
15032 f_vroff = build_decl (BUILTINS_LOCATION,
15033 FIELD_DECL, get_identifier ("__vr_offs"),
15034 integer_type_node);
15035
15036 /* Tell tree-stdarg pass about our internal offset fields.
15037 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15038 purposes, to identify whether the code is updating the va_list internal
15039 offset fields in an irregular way. */
15040 va_list_gpr_counter_field = f_groff;
15041 va_list_fpr_counter_field = f_vroff;
15042
15043 DECL_ARTIFICIAL (f_stack) = 1;
15044 DECL_ARTIFICIAL (f_grtop) = 1;
15045 DECL_ARTIFICIAL (f_vrtop) = 1;
15046 DECL_ARTIFICIAL (f_groff) = 1;
15047 DECL_ARTIFICIAL (f_vroff) = 1;
15048
15049 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15050 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15051 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15052 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15053 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15054
15055 TYPE_FIELDS (va_list_type) = f_stack;
15056 DECL_CHAIN (f_stack) = f_grtop;
15057 DECL_CHAIN (f_grtop) = f_vrtop;
15058 DECL_CHAIN (f_vrtop) = f_groff;
15059 DECL_CHAIN (f_groff) = f_vroff;
15060
15061 /* Compute its layout. */
15062 layout_type (va_list_type);
15063
15064 return va_list_type;
15065 }
15066
15067 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15068 static void
15069 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15070 {
15071 const CUMULATIVE_ARGS *cum;
15072 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15073 tree stack, grtop, vrtop, groff, vroff;
15074 tree t;
15075 int gr_save_area_size = cfun->va_list_gpr_size;
15076 int vr_save_area_size = cfun->va_list_fpr_size;
15077 int vr_offset;
15078
15079 cum = &crtl->args.info;
15080 if (cfun->va_list_gpr_size)
15081 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15082 cfun->va_list_gpr_size);
15083 if (cfun->va_list_fpr_size)
15084 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15085 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15086
15087 if (!TARGET_FLOAT)
15088 {
15089 gcc_assert (cum->aapcs_nvrn == 0);
15090 vr_save_area_size = 0;
15091 }
15092
15093 f_stack = TYPE_FIELDS (va_list_type_node);
15094 f_grtop = DECL_CHAIN (f_stack);
15095 f_vrtop = DECL_CHAIN (f_grtop);
15096 f_groff = DECL_CHAIN (f_vrtop);
15097 f_vroff = DECL_CHAIN (f_groff);
15098
15099 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15100 NULL_TREE);
15101 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15102 NULL_TREE);
15103 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15104 NULL_TREE);
15105 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15106 NULL_TREE);
15107 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15108 NULL_TREE);
15109
15110 /* Emit code to initialize STACK, which points to the next varargs stack
15111 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15112 by named arguments. STACK is 8-byte aligned. */
15113 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15114 if (cum->aapcs_stack_size > 0)
15115 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15116 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15117 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15118
15119 /* Emit code to initialize GRTOP, the top of the GR save area.
15120 virtual_incoming_args_rtx should have been 16 byte aligned. */
15121 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15122 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15123 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15124
15125 /* Emit code to initialize VRTOP, the top of the VR save area.
15126 This address is gr_save_area_bytes below GRTOP, rounded
15127 down to the next 16-byte boundary. */
15128 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15129 vr_offset = ROUND_UP (gr_save_area_size,
15130 STACK_BOUNDARY / BITS_PER_UNIT);
15131
15132 if (vr_offset)
15133 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15134 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15135 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15136
15137 /* Emit code to initialize GROFF, the offset from GRTOP of the
15138 next GPR argument. */
15139 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15140 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15141 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15142
15143 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15144 of the next VR argument. */
15145 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15146 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15147 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15148 }
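
/* A rough worked example of the layout set up above, assuming the default
   of saving every unused argument register (i.e. ignoring the tree-stdarg
   shrinking) and a variadic function with one named general-register
   argument and no named stack arguments: gr_save_area_size is
   7 * 8 == 56 bytes and vr_save_area_size is 8 * 16 == 128 bytes, so
   __stack points at the incoming argument area, __gr_top equals that same
   address, __vr_top sits ROUND_UP (56, 16) == 64 bytes below it,
   __gr_offs is -56 and __vr_offs is -128.  */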
15149
15150 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15151
15152 static tree
15153 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15154 gimple_seq *post_p ATTRIBUTE_UNUSED)
15155 {
15156 tree addr;
15157 bool indirect_p;
15158 bool is_ha; /* is HFA or HVA. */
15159 bool dw_align; /* double-word align. */
15160 machine_mode ag_mode = VOIDmode;
15161 int nregs;
15162 machine_mode mode;
15163
15164 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15165 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15166 HOST_WIDE_INT size, rsize, adjust, align;
15167 tree t, u, cond1, cond2;
15168
15169 indirect_p = pass_va_arg_by_reference (type);
15170 if (indirect_p)
15171 type = build_pointer_type (type);
15172
15173 mode = TYPE_MODE (type);
15174
15175 f_stack = TYPE_FIELDS (va_list_type_node);
15176 f_grtop = DECL_CHAIN (f_stack);
15177 f_vrtop = DECL_CHAIN (f_grtop);
15178 f_groff = DECL_CHAIN (f_vrtop);
15179 f_vroff = DECL_CHAIN (f_groff);
15180
15181 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15182 f_stack, NULL_TREE);
15183 size = int_size_in_bytes (type);
15184
15185 bool abi_break;
15186 align
15187 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15188
15189 dw_align = false;
15190 adjust = 0;
15191 if (aarch64_vfp_is_call_or_return_candidate (mode,
15192 type,
15193 &ag_mode,
15194 &nregs,
15195 &is_ha))
15196 {
15197 /* No frontends can create types with variable-sized modes, so we
15198 shouldn't be asked to pass or return them. */
15199 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15200
15201 /* TYPE passed in fp/simd registers. */
15202 if (!TARGET_FLOAT)
15203 aarch64_err_no_fpadvsimd (mode);
15204
15205 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15206 unshare_expr (valist), f_vrtop, NULL_TREE);
15207 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15208 unshare_expr (valist), f_vroff, NULL_TREE);
15209
15210 rsize = nregs * UNITS_PER_VREG;
15211
15212 if (is_ha)
15213 {
15214 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15215 adjust = UNITS_PER_VREG - ag_size;
15216 }
15217 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15218 && size < UNITS_PER_VREG)
15219 {
15220 adjust = UNITS_PER_VREG - size;
15221 }
15222 }
15223 else
15224 {
15225 /* TYPE passed in general registers. */
15226 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15227 unshare_expr (valist), f_grtop, NULL_TREE);
15228 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15229 unshare_expr (valist), f_groff, NULL_TREE);
15230 rsize = ROUND_UP (size, UNITS_PER_WORD);
15231 nregs = rsize / UNITS_PER_WORD;
15232
15233 if (align > 8)
15234 {
15235 if (abi_break && warn_psabi)
15236 inform (input_location, "parameter passing for argument of type "
15237 "%qT changed in GCC 9.1", type);
15238 dw_align = true;
15239 }
15240
15241 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15242 && size < UNITS_PER_WORD)
15243 {
15244 adjust = UNITS_PER_WORD - size;
15245 }
15246 }
15247
15248 /* Get a local temporary for the field value. */
15249 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15250
15251 /* Emit code to branch if off >= 0. */
15252 t = build2 (GE_EXPR, boolean_type_node, off,
15253 build_int_cst (TREE_TYPE (off), 0));
15254 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15255
15256 if (dw_align)
15257 {
15258 /* Emit: offs = (offs + 15) & -16. */
15259 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15260 build_int_cst (TREE_TYPE (off), 15));
15261 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15262 build_int_cst (TREE_TYPE (off), -16));
15263 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15264 }
15265 else
15266 roundup = NULL;
15267
15268 /* Update ap.__[g|v]r_offs */
15269 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15270 build_int_cst (TREE_TYPE (off), rsize));
15271 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15272
15273 /* String up. */
15274 if (roundup)
15275 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15276
15277 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15278 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15279 build_int_cst (TREE_TYPE (f_off), 0));
15280 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15281
15282 /* String up: make sure the assignment happens before the use. */
15283 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15284 COND_EXPR_ELSE (cond1) = t;
15285
15286 /* Prepare the trees handling the argument that is passed on the stack;
15287 the top-level node will be stored in ON_STACK. */
15288 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15289 if (align > 8)
15290 {
15291 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15292 t = fold_build_pointer_plus_hwi (arg, 15);
15293 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15294 build_int_cst (TREE_TYPE (t), -16));
15295 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15296 }
15297 else
15298 roundup = NULL;
15299 /* Advance ap.__stack */
15300 t = fold_build_pointer_plus_hwi (arg, size + 7);
15301 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15302 build_int_cst (TREE_TYPE (t), -8));
15303 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15304 /* String up roundup and advance. */
15305 if (roundup)
15306 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15307 /* String up with arg */
15308 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15309 /* Big-endianness related address adjustment. */
15310 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15311 && size < UNITS_PER_WORD)
15312 {
15313 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15314 size_int (UNITS_PER_WORD - size));
15315 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15316 }
15317
15318 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15319 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15320
15321 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15322 t = off;
15323 if (adjust)
15324 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15325 build_int_cst (TREE_TYPE (off), adjust));
15326
15327 t = fold_convert (sizetype, t);
15328 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15329
15330 if (is_ha)
15331 {
15332 /* type ha; // treat as "struct {ftype field[n];}"
15333 ... [computing offs]
15334 for (i = 0; i < nregs; ++i, offs += 16)
15335 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15336 return ha; */
15337 int i;
15338 tree tmp_ha, field_t, field_ptr_t;
15339
15340 /* Declare a local variable. */
15341 tmp_ha = create_tmp_var_raw (type, "ha");
15342 gimple_add_tmp_var (tmp_ha);
15343
15344 /* Establish the base type. */
15345 switch (ag_mode)
15346 {
15347 case E_SFmode:
15348 field_t = float_type_node;
15349 field_ptr_t = float_ptr_type_node;
15350 break;
15351 case E_DFmode:
15352 field_t = double_type_node;
15353 field_ptr_t = double_ptr_type_node;
15354 break;
15355 case E_TFmode:
15356 field_t = long_double_type_node;
15357 field_ptr_t = long_double_ptr_type_node;
15358 break;
15359 case E_HFmode:
15360 field_t = aarch64_fp16_type_node;
15361 field_ptr_t = aarch64_fp16_ptr_type_node;
15362 break;
15363 case E_V2SImode:
15364 case E_V4SImode:
15365 {
15366 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15367 field_t = build_vector_type_for_mode (innertype, ag_mode);
15368 field_ptr_t = build_pointer_type (field_t);
15369 }
15370 break;
15371 default:
15372 gcc_assert (0);
15373 }
15374
15375 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15376 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15377 addr = t;
15378 t = fold_convert (field_ptr_t, addr);
15379 t = build2 (MODIFY_EXPR, field_t,
15380 build1 (INDIRECT_REF, field_t, tmp_ha),
15381 build1 (INDIRECT_REF, field_t, t));
15382
15383 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15384 for (i = 1; i < nregs; ++i)
15385 {
15386 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15387 u = fold_convert (field_ptr_t, addr);
15388 u = build2 (MODIFY_EXPR, field_t,
15389 build2 (MEM_REF, field_t, tmp_ha,
15390 build_int_cst (field_ptr_t,
15391 (i *
15392 int_size_in_bytes (field_t)))),
15393 build1 (INDIRECT_REF, field_t, u));
15394 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15395 }
15396
15397 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15398 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15399 }
15400
15401 COND_EXPR_ELSE (cond2) = t;
15402 addr = fold_convert (build_pointer_type (type), cond1);
15403 addr = build_va_arg_indirect_ref (addr);
15404
15405 if (indirect_p)
15406 addr = build_va_arg_indirect_ref (addr);
15407
15408 return addr;
15409 }
15410
15411 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15412
15413 static void
15414 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15415 const function_arg_info &arg,
15416 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15417 {
15418 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15419 CUMULATIVE_ARGS local_cum;
15420 int gr_saved = cfun->va_list_gpr_size;
15421 int vr_saved = cfun->va_list_fpr_size;
15422
15423 /* The caller has advanced CUM up to, but not beyond, the last named
15424 argument. Advance a local copy of CUM past the last "real" named
15425 argument, to find out how many registers are left over. */
15426 local_cum = *cum;
15427 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
15428
15429 /* Find out how many registers we need to save.
15430 Honor tree-stdarg analysis results. */
15431 if (cfun->va_list_gpr_size)
15432 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15433 cfun->va_list_gpr_size / UNITS_PER_WORD);
15434 if (cfun->va_list_fpr_size)
15435 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15436 cfun->va_list_fpr_size / UNITS_PER_VREG);
15437
15438 if (!TARGET_FLOAT)
15439 {
15440 gcc_assert (local_cum.aapcs_nvrn == 0);
15441 vr_saved = 0;
15442 }
15443
15444 if (!no_rtl)
15445 {
15446 if (gr_saved > 0)
15447 {
15448 rtx ptr, mem;
15449
15450 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15451 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15452 - gr_saved * UNITS_PER_WORD);
15453 mem = gen_frame_mem (BLKmode, ptr);
15454 set_mem_alias_set (mem, get_varargs_alias_set ());
15455
15456 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15457 mem, gr_saved);
15458 }
15459 if (vr_saved > 0)
15460 {
15461 /* We can't use move_block_from_reg, because it will use
15462 the wrong mode, storing D regs only. */
15463 machine_mode mode = TImode;
15464 int off, i, vr_start;
15465
15466 /* Set OFF to the offset from virtual_incoming_args_rtx of
15467 the first vector register. The VR save area lies below
15468 the GR one, and is aligned to 16 bytes. */
15469 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15470 STACK_BOUNDARY / BITS_PER_UNIT);
15471 off -= vr_saved * UNITS_PER_VREG;
15472
15473 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15474 for (i = 0; i < vr_saved; ++i)
15475 {
15476 rtx ptr, mem;
15477
15478 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15479 mem = gen_frame_mem (mode, ptr);
15480 set_mem_alias_set (mem, get_varargs_alias_set ());
15481 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15482 off += UNITS_PER_VREG;
15483 }
15484 }
15485 }
15486
15487 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15488 any complication of having crtl->args.pretend_args_size changed. */
15489 cfun->machine->frame.saved_varargs_size
15490 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15491 STACK_BOUNDARY / BITS_PER_UNIT)
15492 + vr_saved * UNITS_PER_VREG);
15493 }
15494
15495 static void
15496 aarch64_conditional_register_usage (void)
15497 {
15498 int i;
15499 if (!TARGET_FLOAT)
15500 {
15501 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15502 {
15503 fixed_regs[i] = 1;
15504 call_used_regs[i] = 1;
15505 }
15506 }
15507 if (!TARGET_SVE)
15508 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15509 {
15510 fixed_regs[i] = 1;
15511 call_used_regs[i] = 1;
15512 }
15513
15514 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15515 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15516 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15517
15518 /* When tracking speculation, we need a couple of call-clobbered registers
15519 to track the speculation state. It would be nice to just use
15520 IP0 and IP1, but currently there are numerous places that just
15521 assume these registers are free for other uses (e.g. pointer
15522 authentication). */
15523 if (aarch64_track_speculation)
15524 {
15525 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15526 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15527 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15528 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15529 }
15530 }
15531
15532 /* Walk down the type tree of TYPE counting consecutive base elements.
15533 If *MODEP is VOIDmode, then set it to the first valid floating point
15534 type. If a non-floating point type is found, or if a floating point
15535 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15536 otherwise return the count in the sub-tree. */
15537 static int
15538 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15539 {
15540 machine_mode mode;
15541 HOST_WIDE_INT size;
15542
15543 /* SVE types (and types containing SVE types) must be handled
15544 before calling this function. */
15545 gcc_assert (!aarch64_sve::builtin_type_p (type));
15546
15547 switch (TREE_CODE (type))
15548 {
15549 case REAL_TYPE:
15550 mode = TYPE_MODE (type);
15551 if (mode != DFmode && mode != SFmode
15552 && mode != TFmode && mode != HFmode)
15553 return -1;
15554
15555 if (*modep == VOIDmode)
15556 *modep = mode;
15557
15558 if (*modep == mode)
15559 return 1;
15560
15561 break;
15562
15563 case COMPLEX_TYPE:
15564 mode = TYPE_MODE (TREE_TYPE (type));
15565 if (mode != DFmode && mode != SFmode
15566 && mode != TFmode && mode != HFmode)
15567 return -1;
15568
15569 if (*modep == VOIDmode)
15570 *modep = mode;
15571
15572 if (*modep == mode)
15573 return 2;
15574
15575 break;
15576
15577 case VECTOR_TYPE:
15578 /* Use V2SImode and V4SImode as representatives of all 64-bit
15579 and 128-bit vector types. */
15580 size = int_size_in_bytes (type);
15581 switch (size)
15582 {
15583 case 8:
15584 mode = V2SImode;
15585 break;
15586 case 16:
15587 mode = V4SImode;
15588 break;
15589 default:
15590 return -1;
15591 }
15592
15593 if (*modep == VOIDmode)
15594 *modep = mode;
15595
15596 /* Vector modes are considered to be opaque: two vectors are
15597 equivalent for the purposes of being homogeneous aggregates
15598 if they are the same size. */
15599 if (*modep == mode)
15600 return 1;
15601
15602 break;
15603
15604 case ARRAY_TYPE:
15605 {
15606 int count;
15607 tree index = TYPE_DOMAIN (type);
15608
15609 /* Can't handle incomplete types nor sizes that are not
15610 fixed. */
15611 if (!COMPLETE_TYPE_P (type)
15612 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15613 return -1;
15614
15615 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15616 if (count == -1
15617 || !index
15618 || !TYPE_MAX_VALUE (index)
15619 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15620 || !TYPE_MIN_VALUE (index)
15621 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15622 || count < 0)
15623 return -1;
15624
15625 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15626 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15627
15628 /* There must be no padding. */
15629 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15630 count * GET_MODE_BITSIZE (*modep)))
15631 return -1;
15632
15633 return count;
15634 }
15635
15636 case RECORD_TYPE:
15637 {
15638 int count = 0;
15639 int sub_count;
15640 tree field;
15641
15642 /* Can't handle incomplete types nor sizes that are not
15643 fixed. */
15644 if (!COMPLETE_TYPE_P (type)
15645 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15646 return -1;
15647
15648 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15649 {
15650 if (TREE_CODE (field) != FIELD_DECL)
15651 continue;
15652
15653 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15654 if (sub_count < 0)
15655 return -1;
15656 count += sub_count;
15657 }
15658
15659 /* There must be no padding. */
15660 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15661 count * GET_MODE_BITSIZE (*modep)))
15662 return -1;
15663
15664 return count;
15665 }
15666
15667 case UNION_TYPE:
15668 case QUAL_UNION_TYPE:
15669 {
15670 /* These aren't very interesting except in a degenerate case. */
15671 int count = 0;
15672 int sub_count;
15673 tree field;
15674
15675 /* Can't handle incomplete types nor sizes that are not
15676 fixed. */
15677 if (!COMPLETE_TYPE_P (type)
15678 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15679 return -1;
15680
15681 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15682 {
15683 if (TREE_CODE (field) != FIELD_DECL)
15684 continue;
15685
15686 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15687 if (sub_count < 0)
15688 return -1;
15689 count = count > sub_count ? count : sub_count;
15690 }
15691
15692 /* There must be no padding. */
15693 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15694 count * GET_MODE_BITSIZE (*modep)))
15695 return -1;
15696
15697 return count;
15698 }
15699
15700 default:
15701 break;
15702 }
15703
15704 return -1;
15705 }
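
/* Worked examples of the walk above (illustrative only):
   "struct { float x, y, z; }" returns 3 with *MODEP == SFmode, i.e. a
   homogeneous floating-point aggregate; "struct { float f; double d; }"
   returns -1 because DFmode does not match the SFmode already recorded;
   and "struct { float32x4_t a, b; }" returns 2 with *MODEP == V4SImode,
   the representative mode for 128-bit vectors, making it a homogeneous
   short-vector aggregate.  */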
15706
15707 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15708 type as described in AAPCS64 \S 4.1.2.
15709
15710 See the comment above aarch64_composite_type_p for the notes on MODE. */
15711
15712 static bool
15713 aarch64_short_vector_p (const_tree type,
15714 machine_mode mode)
15715 {
15716 poly_int64 size = -1;
15717
15718 if (type && aarch64_sve::builtin_type_p (type))
15719 return false;
15720
15721 if (type && TREE_CODE (type) == VECTOR_TYPE)
15722 size = int_size_in_bytes (type);
15723 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
15724 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
15725 size = GET_MODE_SIZE (mode);
15726
15727 return known_eq (size, 8) || known_eq (size, 16);
15728 }
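
/* For example (illustrative only): a 64-bit "float32x2_t" or a 128-bit
   "int32x4_t" is a short vector in the AAPCS64 sense, while an SVE
   builtin type, a scalar, or a 256-bit GNU vector
   (e.g. "int __attribute__ ((vector_size (32)))") is not.  */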
15729
15730 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15731 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15732 array types. The C99 floating-point complex types are also considered
15733 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15734 types, which are GCC extensions and out of the scope of AAPCS64, are
15735 treated as composite types here as well.
15736
15737 Note that MODE itself is not sufficient in determining whether a type
15738 is such a composite type or not. This is because
15739 stor-layout.c:compute_record_mode may have already changed the MODE
15740 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15741 structure with only one field may have its MODE set to the mode of the
15742 field. Also an integer mode whose size matches the size of the
15743 RECORD_TYPE type may be used to substitute the original mode
15744 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15745 solely relied on. */
15746
15747 static bool
15748 aarch64_composite_type_p (const_tree type,
15749 machine_mode mode)
15750 {
15751 if (aarch64_short_vector_p (type, mode))
15752 return false;
15753
15754 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
15755 return true;
15756
15757 if (mode == BLKmode
15758 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
15759 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
15760 return true;
15761
15762 return false;
15763 }
15764
15765 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
15766 shall be passed or returned in simd/fp register(s) (providing these
15767 parameter passing registers are available).
15768
15769 Upon successful return, *COUNT returns the number of needed registers,
15770 *BASE_MODE returns the mode of the individual register and, when IS_HA
15771 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
15772 floating-point aggregate or a homogeneous short-vector aggregate. */
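/* For example, "struct { double a, b; }" is a homogeneous floating-point
   aggregate that succeeds with *COUNT == 2 and *BASE_MODE == DFmode, while
   "struct { double a; int b; }" is not a candidate and falls back to the
   normal rules for composite types.  */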
15773
15774 static bool
15775 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
15776 const_tree type,
15777 machine_mode *base_mode,
15778 int *count,
15779 bool *is_ha)
15780 {
15781 if (is_ha != NULL) *is_ha = false;
15782
15783 if (type && aarch64_sve::builtin_type_p (type))
15784 return false;
15785
15786 machine_mode new_mode = VOIDmode;
15787 bool composite_p = aarch64_composite_type_p (type, mode);
15788
15789 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
15790 || aarch64_short_vector_p (type, mode))
15791 {
15792 *count = 1;
15793 new_mode = mode;
15794 }
15795 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
15796 {
15797 if (is_ha != NULL) *is_ha = true;
15798 *count = 2;
15799 new_mode = GET_MODE_INNER (mode);
15800 }
15801 else if (type && composite_p)
15802 {
15803 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
15804
15805 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
15806 {
15807 if (is_ha != NULL) *is_ha = true;
15808 *count = ag_count;
15809 }
15810 else
15811 return false;
15812 }
15813 else
15814 return false;
15815
15816 *base_mode = new_mode;
15817 return true;
15818 }
15819
15820 /* Implement TARGET_STRUCT_VALUE_RTX. */
15821
15822 static rtx
15823 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
15824 int incoming ATTRIBUTE_UNUSED)
15825 {
15826 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
15827 }
15828
15829 /* Implements target hook vector_mode_supported_p. */
15830 static bool
15831 aarch64_vector_mode_supported_p (machine_mode mode)
15832 {
15833 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15834 return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0;
15835 }
15836
15837 /* Return the full-width SVE vector mode for element mode MODE, if one
15838 exists. */
15839 opt_machine_mode
15840 aarch64_full_sve_mode (scalar_mode mode)
15841 {
15842 switch (mode)
15843 {
15844 case E_DFmode:
15845 return VNx2DFmode;
15846 case E_SFmode:
15847 return VNx4SFmode;
15848 case E_HFmode:
15849 return VNx8HFmode;
15850 case E_DImode:
15851 return VNx2DImode;
15852 case E_SImode:
15853 return VNx4SImode;
15854 case E_HImode:
15855 return VNx8HImode;
15856 case E_QImode:
15857 return VNx16QImode;
15858 default:
15859 return opt_machine_mode ();
15860 }
15861 }
15862
15863 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15864 if it exists. */
15865 opt_machine_mode
15866 aarch64_vq_mode (scalar_mode mode)
15867 {
15868 switch (mode)
15869 {
15870 case E_DFmode:
15871 return V2DFmode;
15872 case E_SFmode:
15873 return V4SFmode;
15874 case E_HFmode:
15875 return V8HFmode;
15876 case E_SImode:
15877 return V4SImode;
15878 case E_HImode:
15879 return V8HImode;
15880 case E_QImode:
15881 return V16QImode;
15882 case E_DImode:
15883 return V2DImode;
15884 default:
15885 return opt_machine_mode ();
15886 }
15887 }
15888
15889 /* Return appropriate SIMD container
15890 for MODE within a vector of WIDTH bits. */
15891 static machine_mode
15892 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15893 {
15894 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15895 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15896
15897 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15898 if (TARGET_SIMD)
15899 {
15900 if (known_eq (width, 128))
15901 return aarch64_vq_mode (mode).else_mode (word_mode);
15902 else
15903 switch (mode)
15904 {
15905 case E_SFmode:
15906 return V2SFmode;
15907 case E_HFmode:
15908 return V4HFmode;
15909 case E_SImode:
15910 return V2SImode;
15911 case E_HImode:
15912 return V4HImode;
15913 case E_QImode:
15914 return V8QImode;
15915 default:
15916 break;
15917 }
15918 }
15919 return word_mode;
15920 }
15921
15922 /* Return the preferred SIMD container mode for MODE: a full SVE vector if SVE is enabled, otherwise a 128-bit Advanced SIMD container. */
15923 static machine_mode
15924 aarch64_preferred_simd_mode (scalar_mode mode)
15925 {
15926 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15927 return aarch64_simd_container_mode (mode, bits);
15928 }
15929
15930 /* Return a list of possible vector sizes for the vectorizer
15931 to iterate over. */
15932 static void
15933 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15934 {
15935 if (TARGET_SVE)
15936 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15937 sizes->safe_push (16);
15938 sizes->safe_push (8);
15939 }
15940
15941 /* Implement TARGET_MANGLE_TYPE. */
15942
15943 static const char *
15944 aarch64_mangle_type (const_tree type)
15945 {
15946 /* The AArch64 ABI documents say that "__va_list" has to be
15947 mangled as if it is in the "std" namespace. */
15948 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15949 return "St9__va_list";
15950
15951 /* Half-precision float. */
15952 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15953 return "Dh";
15954
15955 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15956 builtin types. */
15957 if (TYPE_NAME (type) != NULL)
15958 {
15959 const char *res;
15960 if ((res = aarch64_general_mangle_builtin_type (type))
15961 || (res = aarch64_sve::mangle_builtin_type (type)))
15962 return res;
15963 }
15964
15965 /* Use the default mangling. */
15966 return NULL;
15967 }
15968
15969 /* Find the first rtx_insn before insn that will generate an assembly
15970 instruction. */
15971
15972 static rtx_insn *
15973 aarch64_prev_real_insn (rtx_insn *insn)
15974 {
15975 if (!insn)
15976 return NULL;
15977
15978 do
15979 {
15980 insn = prev_real_insn (insn);
15981 }
15982 while (insn && recog_memoized (insn) < 0);
15983
15984 return insn;
15985 }
15986
15987 static bool
15988 is_madd_op (enum attr_type t1)
15989 {
15990 unsigned int i;
15991 /* A number of these may be AArch32 only. */
15992 enum attr_type mlatypes[] = {
15993 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15994 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15995 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15996 };
15997
15998 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15999 {
16000 if (t1 == mlatypes[i])
16001 return true;
16002 }
16003
16004 return false;
16005 }
16006
16007 /* Check if there is a register dependency between a load and the insn
16008 for which we hold recog_data. */
16009
16010 static bool
16011 dep_between_memop_and_curr (rtx memop)
16012 {
16013 rtx load_reg;
16014 int opno;
16015
16016 gcc_assert (GET_CODE (memop) == SET);
16017
16018 if (!REG_P (SET_DEST (memop)))
16019 return false;
16020
16021 load_reg = SET_DEST (memop);
16022 for (opno = 1; opno < recog_data.n_operands; opno++)
16023 {
16024 rtx operand = recog_data.operand[opno];
16025 if (REG_P (operand)
16026 && reg_overlap_mentioned_p (load_reg, operand))
16027 return true;
16028
16029 }
16030 return false;
16031 }
16032
16033
16034 /* When working around the Cortex-A53 erratum 835769,
16035 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16036 instruction and has a preceding memory instruction such that a NOP
16037 should be inserted between them. */
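/* For instance, under -mfix-cortex-a53-835769 a sequence such as
     ldr x10, [sp, 16]
     madd x0, x1, x2, x3
   has a nop emitted between the two instructions by
   aarch64_final_prescan_insn below, because the load and the 64-bit
   multiply-accumulate have no register dependency.  */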
16038
16039 bool
16040 aarch64_madd_needs_nop (rtx_insn* insn)
16041 {
16042 enum attr_type attr_type;
16043 rtx_insn *prev;
16044 rtx body;
16045
16046 if (!TARGET_FIX_ERR_A53_835769)
16047 return false;
16048
16049 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16050 return false;
16051
16052 attr_type = get_attr_type (insn);
16053 if (!is_madd_op (attr_type))
16054 return false;
16055
16056 prev = aarch64_prev_real_insn (insn);
16057 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16058 Restore recog state to INSN to avoid state corruption. */
16059 extract_constrain_insn_cached (insn);
16060
16061 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16062 return false;
16063
16064 body = single_set (prev);
16065
16066 /* If the previous insn is a memory op and there is no dependency between
16067 it and the DImode madd, emit a NOP between them. If body is NULL then we
16068 have a complex memory operation, probably a load/store pair.
16069 Be conservative for now and emit a NOP. */
16070 if (GET_MODE (recog_data.operand[0]) == DImode
16071 && (!body || !dep_between_memop_and_curr (body)))
16072 return true;
16073
16074 return false;
16075
16076 }
16077
16078
16079 /* Implement FINAL_PRESCAN_INSN. */
16080
16081 void
16082 aarch64_final_prescan_insn (rtx_insn *insn)
16083 {
16084 if (aarch64_madd_needs_nop (insn))
16085 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16086 }
16087
16088
16089 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16090 instruction. */
16091
16092 bool
16093 aarch64_sve_index_immediate_p (rtx base_or_step)
16094 {
16095 return (CONST_INT_P (base_or_step)
16096 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16097 }
16098
16099 /* Return true if X is a valid immediate for the SVE ADD and SUB
16100 instructions. Negate X first if NEGATE_P is true. */
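/* After masking to the element width, the accepted immediates are 0..255
   or a multiple of 256 no greater than 0xff00, i.e. the unshifted and
   LSL #8 forms of the 8-bit immediate.  */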
16101
16102 bool
16103 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
16104 {
16105 rtx elt;
16106
16107 if (!const_vec_duplicate_p (x, &elt)
16108 || !CONST_INT_P (elt))
16109 return false;
16110
16111 HOST_WIDE_INT val = INTVAL (elt);
16112 if (negate_p)
16113 val = -val;
16114 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
16115
16116 if (val & 0xff)
16117 return IN_RANGE (val, 0, 0xff);
16118 return IN_RANGE (val, 0, 0xff00);
16119 }
16120
16121 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16122 instructions. Negate X first if NEGATE_P is true. */
16123
16124 bool
16125 aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p)
16126 {
16127 rtx elt;
16128
16129 if (!const_vec_duplicate_p (x, &elt)
16130 || !CONST_INT_P (elt))
16131 return false;
16132
16133 if (!aarch64_sve_arith_immediate_p (x, negate_p))
16134 return false;
16135
16136 /* After the optional negation, the immediate must be nonnegative.
16137 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16138 instead of SQADD Zn.B, Zn.B, #129. */
16139 return negate_p == (INTVAL (elt) < 0);
16140 }
16141
16142 /* Return true if X is a valid immediate operand for an SVE logical
16143 instruction such as AND. */
16144
16145 bool
16146 aarch64_sve_bitmask_immediate_p (rtx x)
16147 {
16148 rtx elt;
16149
16150 return (const_vec_duplicate_p (x, &elt)
16151 && CONST_INT_P (elt)
16152 && aarch64_bitmask_imm (INTVAL (elt),
16153 GET_MODE_INNER (GET_MODE (x))));
16154 }
16155
16156 /* Return true if X is a valid immediate for the SVE DUP and CPY
16157 instructions. */
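/* That is, a signed 8-bit value, either unshifted (-128..127) or shifted
   left by 8 (multiples of 256 in the range -32768..32512).  */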
16158
16159 bool
16160 aarch64_sve_dup_immediate_p (rtx x)
16161 {
16162 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16163 if (!CONST_INT_P (x))
16164 return false;
16165
16166 HOST_WIDE_INT val = INTVAL (x);
16167 if (val & 0xff)
16168 return IN_RANGE (val, -0x80, 0x7f);
16169 return IN_RANGE (val, -0x8000, 0x7f00);
16170 }
16171
16172 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16173 SIGNED_P says whether the operand is signed rather than unsigned. */
16174
16175 bool
16176 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16177 {
16178 x = unwrap_const_vec_duplicate (x);
16179 return (CONST_INT_P (x)
16180 && (signed_p
16181 ? IN_RANGE (INTVAL (x), -16, 15)
16182 : IN_RANGE (INTVAL (x), 0, 127)));
16183 }
16184
16185 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16186 instruction. Negate X first if NEGATE_P is true. */
16187
16188 bool
16189 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16190 {
16191 rtx elt;
16192 REAL_VALUE_TYPE r;
16193
16194 if (!const_vec_duplicate_p (x, &elt)
16195 || GET_CODE (elt) != CONST_DOUBLE)
16196 return false;
16197
16198 r = *CONST_DOUBLE_REAL_VALUE (elt);
16199
16200 if (negate_p)
16201 r = real_value_negate (&r);
16202
16203 if (real_equal (&r, &dconst1))
16204 return true;
16205 if (real_equal (&r, &dconsthalf))
16206 return true;
16207 return false;
16208 }
16209
16210 /* Return true if X is a valid immediate operand for an SVE FMUL
16211 instruction. */
16212
16213 bool
16214 aarch64_sve_float_mul_immediate_p (rtx x)
16215 {
16216 rtx elt;
16217
16218 return (const_vec_duplicate_p (x, &elt)
16219 && GET_CODE (elt) == CONST_DOUBLE
16220 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16221 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16222 }
16223
16224 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16225 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16226 is nonnull, use it to describe valid immediates. */
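/* For example, replicating 0x0000ab00 matches the 4-byte LSL case below
   (value 0xab, shift 8); replicating 0x00ab00ab matches the 2-byte case
   with no shift; and 0x00abffff, whose low 16 bits are all ones, matches
   the MSL case with shift 16.  */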
16227 static bool
16228 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16229 simd_immediate_info *info,
16230 enum simd_immediate_check which,
16231 simd_immediate_info::insn_type insn)
16232 {
16233 /* Try a 4-byte immediate with LSL. */
16234 for (unsigned int shift = 0; shift < 32; shift += 8)
16235 if ((val32 & (0xff << shift)) == val32)
16236 {
16237 if (info)
16238 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16239 simd_immediate_info::LSL, shift);
16240 return true;
16241 }
16242
16243 /* Try a 2-byte immediate with LSL. */
16244 unsigned int imm16 = val32 & 0xffff;
16245 if (imm16 == (val32 >> 16))
16246 for (unsigned int shift = 0; shift < 16; shift += 8)
16247 if ((imm16 & (0xff << shift)) == imm16)
16248 {
16249 if (info)
16250 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16251 simd_immediate_info::LSL, shift);
16252 return true;
16253 }
16254
16255 /* Try a 4-byte immediate with MSL, except for cases that MVN
16256 can handle. */
16257 if (which == AARCH64_CHECK_MOV)
16258 for (unsigned int shift = 8; shift < 24; shift += 8)
16259 {
16260 unsigned int low = (1 << shift) - 1;
16261 if (((val32 & (0xff << shift)) | low) == val32)
16262 {
16263 if (info)
16264 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16265 simd_immediate_info::MSL, shift);
16266 return true;
16267 }
16268 }
16269
16270 return false;
16271 }
16272
16273 /* Return true if replicating VAL64 is a valid immediate for the
16274 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16275 use it to describe valid immediates. */
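/* For example, 0xff0000ffff0000ff cannot be built from a shifted 8-bit
   MOVI or MVNI, but every byte is either 0x00 or 0xff, so the
   bit-to-bytemask test below accepts it (the 64-bit MOVI form).  */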
16276 static bool
16277 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16278 simd_immediate_info *info,
16279 enum simd_immediate_check which)
16280 {
16281 unsigned int val32 = val64 & 0xffffffff;
16282 unsigned int val16 = val64 & 0xffff;
16283 unsigned int val8 = val64 & 0xff;
16284
16285 if (val32 == (val64 >> 32))
16286 {
16287 if ((which & AARCH64_CHECK_ORR) != 0
16288 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16289 simd_immediate_info::MOV))
16290 return true;
16291
16292 if ((which & AARCH64_CHECK_BIC) != 0
16293 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16294 simd_immediate_info::MVN))
16295 return true;
16296
16297 /* Try using a replicated byte. */
16298 if (which == AARCH64_CHECK_MOV
16299 && val16 == (val32 >> 16)
16300 && val8 == (val16 >> 8))
16301 {
16302 if (info)
16303 *info = simd_immediate_info (QImode, val8);
16304 return true;
16305 }
16306 }
16307
16308 /* Try using a bit-to-bytemask. */
16309 if (which == AARCH64_CHECK_MOV)
16310 {
16311 unsigned int i;
16312 for (i = 0; i < 64; i += 8)
16313 {
16314 unsigned char byte = (val64 >> i) & 0xff;
16315 if (byte != 0 && byte != 0xff)
16316 break;
16317 }
16318 if (i == 64)
16319 {
16320 if (info)
16321 *info = simd_immediate_info (DImode, val64);
16322 return true;
16323 }
16324 }
16325 return false;
16326 }
16327
16328 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16329 instruction. If INFO is nonnull, use it to describe valid immediates. */
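/* A few concrete cases the cascade below accepts: a value with 0x01 in
   every byte is a QImode DUP of 1; replicating the 16-bit pattern 0x1200
   is a HImode DUP with LSL #8; and replicating the 16-bit pattern 0x00ff
   is accepted as a DUPM bitmask immediate.  */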
16330
16331 static bool
16332 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16333 simd_immediate_info *info)
16334 {
16335 scalar_int_mode mode = DImode;
16336 unsigned int val32 = val64 & 0xffffffff;
16337 if (val32 == (val64 >> 32))
16338 {
16339 mode = SImode;
16340 unsigned int val16 = val32 & 0xffff;
16341 if (val16 == (val32 >> 16))
16342 {
16343 mode = HImode;
16344 unsigned int val8 = val16 & 0xff;
16345 if (val8 == (val16 >> 8))
16346 mode = QImode;
16347 }
16348 }
16349 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16350 if (IN_RANGE (val, -0x80, 0x7f))
16351 {
16352 /* DUP with no shift. */
16353 if (info)
16354 *info = simd_immediate_info (mode, val);
16355 return true;
16356 }
16357 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16358 {
16359 /* DUP with LSL #8. */
16360 if (info)
16361 *info = simd_immediate_info (mode, val);
16362 return true;
16363 }
16364 if (aarch64_bitmask_imm (val64, mode))
16365 {
16366 /* DUPM. */
16367 if (info)
16368 *info = simd_immediate_info (mode, val);
16369 return true;
16370 }
16371 return false;
16372 }
16373
16374 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16375
16376 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16377
16378 where PATTERN is the svpattern as a CONST_INT and where ZERO
16379 is a zero constant of the required PTRUE mode (which can have
16380 fewer elements than X's mode, if zero bits are significant).
16381
16382 If so, and if INFO is nonnull, describe the immediate in INFO. */
16383 bool
16384 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16385 {
16386 if (GET_CODE (x) != CONST)
16387 return false;
16388
16389 x = XEXP (x, 0);
16390 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16391 return false;
16392
16393 if (info)
16394 {
16395 aarch64_svpattern pattern
16396 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16397 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16398 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16399 *info = simd_immediate_info (int_mode, pattern);
16400 }
16401 return true;
16402 }
16403
16404 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16405 it to describe valid immediates. */
16406
16407 static bool
16408 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16409 {
16410 if (aarch64_sve_ptrue_svpattern_p (x, info))
16411 return true;
16412
16413 if (x == CONST0_RTX (GET_MODE (x)))
16414 {
16415 if (info)
16416 *info = simd_immediate_info (DImode, 0);
16417 return true;
16418 }
16419
16420 /* Analyze the value as a VNx16BImode. This should be relatively
16421 efficient, since rtx_vector_builder has enough built-in capacity
16422 to store all VLA predicate constants without needing the heap. */
16423 rtx_vector_builder builder;
16424 if (!aarch64_get_sve_pred_bits (builder, x))
16425 return false;
16426
16427 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16428 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16429 {
16430 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16431 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16432 if (pattern != AARCH64_NUM_SVPATTERNS)
16433 {
16434 if (info)
16435 {
16436 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16437 *info = simd_immediate_info (int_mode, pattern);
16438 }
16439 return true;
16440 }
16441 }
16442 return false;
16443 }
16444
16445 /* Return true if OP is a valid SIMD immediate for the operation
16446 described by WHICH. If INFO is nonnull, use it to describe valid
16447 immediates. */
16448 bool
16449 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16450 enum simd_immediate_check which)
16451 {
16452 machine_mode mode = GET_MODE (op);
16453 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16454 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16455 return false;
16456
16457 if (vec_flags & VEC_SVE_PRED)
16458 return aarch64_sve_pred_valid_immediate (op, info);
16459
16460 scalar_mode elt_mode = GET_MODE_INNER (mode);
16461 rtx base, step;
16462 unsigned int n_elts;
16463 if (GET_CODE (op) == CONST_VECTOR
16464 && CONST_VECTOR_DUPLICATE_P (op))
16465 n_elts = CONST_VECTOR_NPATTERNS (op);
16466 else if ((vec_flags & VEC_SVE_DATA)
16467 && const_vec_series_p (op, &base, &step))
16468 {
16469 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16470 if (!aarch64_sve_index_immediate_p (base)
16471 || !aarch64_sve_index_immediate_p (step))
16472 return false;
16473
16474 if (info)
16475 *info = simd_immediate_info (elt_mode, base, step);
16476 return true;
16477 }
16478 else if (GET_CODE (op) == CONST_VECTOR
16479 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16480 /* N_ELTS set above. */;
16481 else
16482 return false;
16483
16484 scalar_float_mode elt_float_mode;
16485 if (n_elts == 1
16486 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16487 {
16488 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16489 if (aarch64_float_const_zero_rtx_p (elt)
16490 || aarch64_float_const_representable_p (elt))
16491 {
16492 if (info)
16493 *info = simd_immediate_info (elt_float_mode, elt);
16494 return true;
16495 }
16496 }
16497
16498 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
16499 if (elt_size > 8)
16500 return false;
16501
16502 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
16503
16504 /* Expand the vector constant out into a byte vector, with the least
16505 significant byte of the register first. */
16506 auto_vec<unsigned char, 16> bytes;
16507 bytes.reserve (n_elts * elt_size);
16508 for (unsigned int i = 0; i < n_elts; i++)
16509 {
16510 /* The vector is provided in gcc endian-neutral fashion.
16511 For aarch64_be Advanced SIMD, it must be laid out in the vector
16512 register in reverse order. */
16513 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16514 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16515
16516 if (elt_mode != elt_int_mode)
16517 elt = gen_lowpart (elt_int_mode, elt);
16518
16519 if (!CONST_INT_P (elt))
16520 return false;
16521
16522 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
16523 for (unsigned int byte = 0; byte < elt_size; byte++)
16524 {
16525 bytes.quick_push (elt_val & 0xff);
16526 elt_val >>= BITS_PER_UNIT;
16527 }
16528 }
16529
16530 /* The immediate must repeat every eight bytes. */
16531 unsigned int nbytes = bytes.length ();
16532 for (unsigned i = 8; i < nbytes; ++i)
16533 if (bytes[i] != bytes[i - 8])
16534 return false;
16535
16536 /* Get the repeating 8-byte value as an integer. No endian correction
16537 is needed here because bytes is already in lsb-first order. */
16538 unsigned HOST_WIDE_INT val64 = 0;
16539 for (unsigned int i = 0; i < 8; i++)
16540 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
16541 << (i * BITS_PER_UNIT));
16542
16543 if (vec_flags & VEC_SVE_DATA)
16544 return aarch64_sve_valid_immediate (val64, info);
16545 else
16546 return aarch64_advsimd_valid_immediate (val64, info, which);
16547 }
16548
16549 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16550 has a step in the immediate range of an SVE INDEX instruction. Return the
16551 step if so, otherwise return null. */
16552 rtx
16553 aarch64_check_zero_based_sve_index_immediate (rtx x)
16554 {
16555 rtx base, step;
16556 if (const_vec_series_p (x, &base, &step)
16557 && base == const0_rtx
16558 && aarch64_sve_index_immediate_p (step))
16559 return step;
16560 return NULL_RTX;
16561 }
16562
16563 /* Check that immediate shift constants are within range. */
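/* For example, for V4SImode the element width is 32 bits, so left shifts
   accept 0..31 and right shifts accept 1..32.  */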
16564 bool
16565 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
16566 {
16567 x = unwrap_const_vec_duplicate (x);
16568 if (!CONST_INT_P (x))
16569 return false;
16570 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
16571 if (left)
16572 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
16573 else
16574 return IN_RANGE (INTVAL (x), 1, bit_width);
16575 }
16576
16577 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16578 operation of width WIDTH at bit position POS. */
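/* For example, WIDTH = 8 and POS = 16 gives the mask 0xff0000.  */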
16579
16580 rtx
16581 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
16582 {
16583 gcc_assert (CONST_INT_P (width));
16584 gcc_assert (CONST_INT_P (pos));
16585
16586 unsigned HOST_WIDE_INT mask
16587 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
16588 return GEN_INT (mask << UINTVAL (pos));
16589 }
16590
16591 bool
16592 aarch64_mov_operand_p (rtx x, machine_mode mode)
16593 {
16594 if (GET_CODE (x) == HIGH
16595 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
16596 return true;
16597
16598 if (CONST_INT_P (x))
16599 return true;
16600
16601 if (VECTOR_MODE_P (GET_MODE (x)))
16602 {
16603 /* Require predicate constants to be VNx16BI before RA, so that we
16604 force everything to have a canonical form. */
16605 if (!lra_in_progress
16606 && !reload_completed
16607 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
16608 && GET_MODE (x) != VNx16BImode)
16609 return false;
16610
16611 return aarch64_simd_valid_immediate (x, NULL);
16612 }
16613
16614 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
16615 return true;
16616
16617 if (aarch64_sve_cnt_immediate_p (x))
16618 return true;
16619
16620 return aarch64_classify_symbolic_expression (x)
16621 == SYMBOL_TINY_ABSOLUTE;
16622 }
16623
16624 /* Return a const_int vector of VAL. */
16625 rtx
16626 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
16627 {
16628 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
16629 return gen_const_vec_duplicate (mode, c);
16630 }
16631
16632 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16633
16634 bool
16635 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
16636 {
16637 machine_mode vmode;
16638
16639 vmode = aarch64_simd_container_mode (mode, 64);
16640 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
16641 return aarch64_simd_valid_immediate (op_v, NULL);
16642 }
16643
16644 /* Construct and return a PARALLEL RTX vector with elements numbering the
16645 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16646 the vector - from the perspective of the architecture. This does not
16647 line up with GCC's perspective on lane numbers, so we end up with
16648 different masks depending on our target endian-ness. The diagram
16649 below may help. We must draw the distinction when building masks
16650 which select one half of the vector. An instruction selecting
16651 architectural low-lanes for a big-endian target, must be described using
16652 a mask selecting GCC high-lanes.
16653
16654 Big-Endian Little-Endian
16655
16656 GCC 0 1 2 3 3 2 1 0
16657 | x | x | x | x | | x | x | x | x |
16658 Architecture 3 2 1 0 3 2 1 0
16659
16660 Low Mask: { 2, 3 } { 0, 1 }
16661 High Mask: { 0, 1 } { 2, 3 }
16662
16663 MODE Is the mode of the vector and NUNITS is the number of units in it. */
16664
16665 rtx
16666 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
16667 {
16668 rtvec v = rtvec_alloc (nunits / 2);
16669 int high_base = nunits / 2;
16670 int low_base = 0;
16671 int base;
16672 rtx t1;
16673 int i;
16674
16675 if (BYTES_BIG_ENDIAN)
16676 base = high ? low_base : high_base;
16677 else
16678 base = high ? high_base : low_base;
16679
16680 for (i = 0; i < nunits / 2; i++)
16681 RTVEC_ELT (v, i) = GEN_INT (base + i);
16682
16683 t1 = gen_rtx_PARALLEL (mode, v);
16684 return t1;
16685 }
16686
16687 /* Check OP for validity as a PARALLEL RTX vector with elements
16688 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
16689 from the perspective of the architecture. See the diagram above
16690 aarch64_simd_vect_par_cnst_half for more details. */
16691
16692 bool
16693 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
16694 bool high)
16695 {
16696 int nelts;
16697 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
16698 return false;
16699
16700 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
16701 HOST_WIDE_INT count_op = XVECLEN (op, 0);
16702 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
16703 int i = 0;
16704
16705 if (count_op != count_ideal)
16706 return false;
16707
16708 for (i = 0; i < count_ideal; i++)
16709 {
16710 rtx elt_op = XVECEXP (op, 0, i);
16711 rtx elt_ideal = XVECEXP (ideal, 0, i);
16712
16713 if (!CONST_INT_P (elt_op)
16714 || INTVAL (elt_ideal) != INTVAL (elt_op))
16715 return false;
16716 }
16717 return true;
16718 }
16719
16720 /* Return a PARALLEL containing NELTS elements, with element I equal
16721 to BASE + I * STEP. */
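/* E.g. NELTS = 4, BASE = 1, STEP = 2 yields a PARALLEL of the constants
   1, 3, 5 and 7.  */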
16722
16723 rtx
16724 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
16725 {
16726 rtvec vec = rtvec_alloc (nelts);
16727 for (unsigned int i = 0; i < nelts; ++i)
16728 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
16729 return gen_rtx_PARALLEL (VOIDmode, vec);
16730 }
16731
16732 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
16733 series with step STEP. */
16734
16735 bool
16736 aarch64_stepped_int_parallel_p (rtx op, int step)
16737 {
16738 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
16739 return false;
16740
16741 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
16742 for (int i = 1; i < XVECLEN (op, 0); ++i)
16743 if (!CONST_INT_P (XVECEXP (op, 0, i))
16744 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
16745 return false;
16746
16747 return true;
16748 }
16749
16750 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
16751 HIGH (exclusive). */
16752 void
16753 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
16754 const_tree exp)
16755 {
16756 HOST_WIDE_INT lane;
16757 gcc_assert (CONST_INT_P (operand));
16758 lane = INTVAL (operand);
16759
16760 if (lane < low || lane >= high)
16761 {
16762 if (exp)
16763 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
16764 else
16765 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
16766 }
16767 }
16768
16769 /* Perform endian correction on lane number N, which indexes a vector
16770 of mode MODE, and return the result as an SImode rtx. */
16771
16772 rtx
16773 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
16774 {
16775 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
16776 }
16777
16778 /* Return TRUE if OP is a valid vector addressing mode. */
16779
16780 bool
16781 aarch64_simd_mem_operand_p (rtx op)
16782 {
16783 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
16784 || REG_P (XEXP (op, 0)));
16785 }
16786
16787 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
16788
16789 bool
16790 aarch64_sve_ld1r_operand_p (rtx op)
16791 {
16792 struct aarch64_address_info addr;
16793 scalar_mode mode;
16794
16795 return (MEM_P (op)
16796 && is_a <scalar_mode> (GET_MODE (op), &mode)
16797 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
16798 && addr.type == ADDRESS_REG_IMM
16799 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
16800 }
16801
16802 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
16803 bool
16804 aarch64_sve_ld1rq_operand_p (rtx op)
16805 {
16806 struct aarch64_address_info addr;
16807 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
16808 if (!MEM_P (op)
16809 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
16810 return false;
16811
16812 if (addr.type == ADDRESS_REG_IMM)
16813 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
16814
16815 if (addr.type == ADDRESS_REG_REG)
16816 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
16817
16818 return false;
16819 }
16820
16821 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
16822 bool
16823 aarch64_sve_ldff1_operand_p (rtx op)
16824 {
16825 if (!MEM_P (op))
16826 return false;
16827
16828 struct aarch64_address_info addr;
16829 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
16830 return false;
16831
16832 if (addr.type == ADDRESS_REG_IMM)
16833 return known_eq (addr.const_offset, 0);
16834
16835 return addr.type == ADDRESS_REG_REG;
16836 }
16837
16838 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
16839 bool
16840 aarch64_sve_ldnf1_operand_p (rtx op)
16841 {
16842 struct aarch64_address_info addr;
16843
16844 return (MEM_P (op)
16845 && aarch64_classify_address (&addr, XEXP (op, 0),
16846 GET_MODE (op), false)
16847 && addr.type == ADDRESS_REG_IMM);
16848 }
16849
16850 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
16851 The conditions for STR are the same. */
16852 bool
16853 aarch64_sve_ldr_operand_p (rtx op)
16854 {
16855 struct aarch64_address_info addr;
16856
16857 return (MEM_P (op)
16858 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
16859 false, ADDR_QUERY_ANY)
16860 && addr.type == ADDRESS_REG_IMM);
16861 }
16862
16863 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
16864 addressing memory of mode MODE. */
16865 bool
16866 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
16867 {
16868 struct aarch64_address_info addr;
16869 if (!aarch64_classify_address (&addr, op, mode, false))
16870 return false;
16871
16872 if (addr.type == ADDRESS_REG_IMM)
16873 return known_eq (addr.const_offset, 0);
16874
16875 return addr.type == ADDRESS_REG_REG;
16876 }
16877
16878 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
16879 We need to be able to access the individual pieces, so the range
16880 is different from LD[234] and ST[234]. */
16881 bool
16882 aarch64_sve_struct_memory_operand_p (rtx op)
16883 {
16884 if (!MEM_P (op))
16885 return false;
16886
16887 machine_mode mode = GET_MODE (op);
16888 struct aarch64_address_info addr;
16889 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
16890 ADDR_QUERY_ANY)
16891 || addr.type != ADDRESS_REG_IMM)
16892 return false;
16893
16894 poly_int64 first = addr.const_offset;
16895 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
16896 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
16897 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
16898 }
16899
16900 /* Emit a register copy from operand to operand, taking care not to
16901 early-clobber source registers in the process.
16902
16903 COUNT is the number of components into which the copy needs to be
16904 decomposed. */
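/* For example, when moving a two-register value from, say, V1-V2 to V2-V3,
   the loop below copies V3 from V2 first and then V2 from V1, so the
   overlapping source register is read before it is overwritten.  */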
16905 void
16906 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
16907 unsigned int count)
16908 {
16909 unsigned int i;
16910 int rdest = REGNO (operands[0]);
16911 int rsrc = REGNO (operands[1]);
16912
16913 if (!reg_overlap_mentioned_p (operands[0], operands[1])
16914 || rdest < rsrc)
16915 for (i = 0; i < count; i++)
16916 emit_move_insn (gen_rtx_REG (mode, rdest + i),
16917 gen_rtx_REG (mode, rsrc + i));
16918 else
16919 for (i = 0; i < count; i++)
16920 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
16921 gen_rtx_REG (mode, rsrc + count - i - 1));
16922 }
16923
16924 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16925 one of VSTRUCT modes: OI, CI, or XI. */
16926 int
16927 aarch64_simd_attr_length_rglist (machine_mode mode)
16928 {
16929 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16930 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16931 }
16932
16933 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16934 alignment of a vector to 128 bits. SVE predicates have an alignment of
16935 16 bits. */
16936 static HOST_WIDE_INT
16937 aarch64_simd_vector_alignment (const_tree type)
16938 {
16939 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16940 be set for non-predicate vectors of booleans. Modes are the most
16941 direct way we have of identifying real SVE predicate types. */
16942 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
16943 return 16;
16944 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16945 return 128;
16946 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
16947 }
16948
16949 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16950 static poly_uint64
16951 aarch64_vectorize_preferred_vector_alignment (const_tree type)
16952 {
16953 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
16954 {
16955 /* If the length of the vector is fixed, try to align to that length,
16956 otherwise don't try to align at all. */
16957 HOST_WIDE_INT result;
16958 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
16959 result = TYPE_ALIGN (TREE_TYPE (type));
16960 return result;
16961 }
16962 return TYPE_ALIGN (type);
16963 }
16964
16965 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16966 static bool
16967 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
16968 {
16969 if (is_packed)
16970 return false;
16971
16972 /* For fixed-length vectors, check that the vectorizer will aim for
16973 full-vector alignment. This isn't true for generic GCC vectors
16974 that are wider than the ABI maximum of 128 bits. */
16975 poly_uint64 preferred_alignment =
16976 aarch64_vectorize_preferred_vector_alignment (type);
16977 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16978 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16979 preferred_alignment))
16980 return false;
16981
16982 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16983 return true;
16984 }
16985
16986 /* Return true if the vector misalignment factor is supported by the
16987 target. */
16988 static bool
16989 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16990 const_tree type, int misalignment,
16991 bool is_packed)
16992 {
16993 if (TARGET_SIMD && STRICT_ALIGNMENT)
16994 {
16995 /* Return false if the movmisalign pattern is not supported for this mode. */
16996 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16997 return false;
16998
16999 /* Misalignment factor is unknown at compile time. */
17000 if (misalignment == -1)
17001 return false;
17002 }
17003 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17004 is_packed);
17005 }
17006
17007 /* If VALS is a vector constant that can be loaded into a register
17008 using DUP, generate instructions to do so and return an RTX to
17009 assign to the register. Otherwise return NULL_RTX. */
17010 static rtx
17011 aarch64_simd_dup_constant (rtx vals)
17012 {
17013 machine_mode mode = GET_MODE (vals);
17014 machine_mode inner_mode = GET_MODE_INNER (mode);
17015 rtx x;
17016
17017 if (!const_vec_duplicate_p (vals, &x))
17018 return NULL_RTX;
17019
17020 /* We can load this constant by using DUP and a constant in a
17021 single general-purpose register. This will be cheaper than a vector
17022 load. */
17023 x = copy_to_mode_reg (inner_mode, x);
17024 return gen_vec_duplicate (mode, x);
17025 }
17026
17027
17028 /* Generate code to load VALS, which is a PARALLEL containing only
17029 constants (for vec_init) or CONST_VECTOR, efficiently into a
17030 register. Returns an RTX to copy into the register, or NULL_RTX
17031 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17032 static rtx
17033 aarch64_simd_make_constant (rtx vals)
17034 {
17035 machine_mode mode = GET_MODE (vals);
17036 rtx const_dup;
17037 rtx const_vec = NULL_RTX;
17038 int n_const = 0;
17039 int i;
17040
17041 if (GET_CODE (vals) == CONST_VECTOR)
17042 const_vec = vals;
17043 else if (GET_CODE (vals) == PARALLEL)
17044 {
17045 /* A CONST_VECTOR must contain only CONST_INTs and
17046 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17047 Only store valid constants in a CONST_VECTOR. */
17048 int n_elts = XVECLEN (vals, 0);
17049 for (i = 0; i < n_elts; ++i)
17050 {
17051 rtx x = XVECEXP (vals, 0, i);
17052 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17053 n_const++;
17054 }
17055 if (n_const == n_elts)
17056 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17057 }
17058 else
17059 gcc_unreachable ();
17060
17061 if (const_vec != NULL_RTX
17062 && aarch64_simd_valid_immediate (const_vec, NULL))
17063 /* Load using MOVI/MVNI. */
17064 return const_vec;
17065 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17066 /* Loaded using DUP. */
17067 return const_dup;
17068 else if (const_vec != NULL_RTX)
17069 /* Load from constant pool. We cannot take advantage of single-cycle
17070 LD1 because we need a PC-relative addressing mode. */
17071 return const_vec;
17072 else
17073 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17074 We cannot construct an initializer. */
17075 return NULL_RTX;
17076 }
17077
17078 /* Expand a vector initialisation sequence, such that TARGET is
17079 initialised to contain VALS. */
17080
17081 void
17082 aarch64_expand_vector_init (rtx target, rtx vals)
17083 {
17084 machine_mode mode = GET_MODE (target);
17085 scalar_mode inner_mode = GET_MODE_INNER (mode);
17086 /* The number of vector elements. */
17087 int n_elts = XVECLEN (vals, 0);
17088 /* The number of vector elements which are not constant. */
17089 int n_var = 0;
17090 rtx any_const = NULL_RTX;
17091 /* The first element of vals. */
17092 rtx v0 = XVECEXP (vals, 0, 0);
17093 bool all_same = true;
17094
17095 /* This is a special vec_init<M><N> where N is not an element mode but a
17096 vector mode with half the elements of M. We expect to find two entries
17097 of mode N in VALS and we must put their concatenation into TARGET. */
17098 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17099 {
17100 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17101 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17102 rtx lo = XVECEXP (vals, 0, 0);
17103 rtx hi = XVECEXP (vals, 0, 1);
17104 machine_mode narrow_mode = GET_MODE (lo);
17105 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17106 gcc_assert (narrow_mode == GET_MODE (hi));
17107
17108 /* When we want to concatenate a half-width vector with zeroes we can
17109 use the aarch64_combinez[_be] patterns. Just make sure that the
17110 zeroes are in the right half. */
17111 if (BYTES_BIG_ENDIAN
17112 && aarch64_simd_imm_zero (lo, narrow_mode)
17113 && general_operand (hi, narrow_mode))
17114 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17115 else if (!BYTES_BIG_ENDIAN
17116 && aarch64_simd_imm_zero (hi, narrow_mode)
17117 && general_operand (lo, narrow_mode))
17118 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17119 else
17120 {
17121 /* Else create the two half-width registers and combine them. */
17122 if (!REG_P (lo))
17123 lo = force_reg (GET_MODE (lo), lo);
17124 if (!REG_P (hi))
17125 hi = force_reg (GET_MODE (hi), hi);
17126
17127 if (BYTES_BIG_ENDIAN)
17128 std::swap (lo, hi);
17129 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17130 }
17131 return;
17132 }
17133
17134 /* Count the number of variable elements to initialise. */
17135 for (int i = 0; i < n_elts; ++i)
17136 {
17137 rtx x = XVECEXP (vals, 0, i);
17138 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17139 ++n_var;
17140 else
17141 any_const = x;
17142
17143 all_same &= rtx_equal_p (x, v0);
17144 }
17145
17146 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17147 how best to handle this. */
17148 if (n_var == 0)
17149 {
17150 rtx constant = aarch64_simd_make_constant (vals);
17151 if (constant != NULL_RTX)
17152 {
17153 emit_move_insn (target, constant);
17154 return;
17155 }
17156 }
17157
17158 /* Splat a single non-constant element if we can. */
17159 if (all_same)
17160 {
17161 rtx x = copy_to_mode_reg (inner_mode, v0);
17162 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17163 return;
17164 }
17165
17166 enum insn_code icode = optab_handler (vec_set_optab, mode);
17167 gcc_assert (icode != CODE_FOR_nothing);
17168
17169 /* If there are only variable elements, try to optimize
17170 the insertion using dup for the most common element
17171 followed by insertions. */
17172
17173 /* The algorithm will fill matches[*][0] with the earliest matching element,
17174 and matches[X][1] with the count of duplicate elements (if X is the
17175 earliest element which has duplicates). */
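/* For example, for VALS = {a, b, a, a} this gives matches[2][0] ==
   matches[3][0] == 0 and matches[0][1] == 3, so we dup A into the vector
   register and then insert B into lane 1.  */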
17176
17177 if (n_var == n_elts && n_elts <= 16)
17178 {
17179 int matches[16][2] = {0};
17180 for (int i = 0; i < n_elts; i++)
17181 {
17182 for (int j = 0; j <= i; j++)
17183 {
17184 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17185 {
17186 matches[i][0] = j;
17187 matches[j][1]++;
17188 break;
17189 }
17190 }
17191 }
17192 int maxelement = 0;
17193 int maxv = 0;
17194 for (int i = 0; i < n_elts; i++)
17195 if (matches[i][1] > maxv)
17196 {
17197 maxelement = i;
17198 maxv = matches[i][1];
17199 }
17200
17201 /* Create a duplicate of the most common element, unless all elements
17202 are equally useless to us, in which case just immediately set the
17203 vector register using the first element. */
17204
17205 if (maxv == 1)
17206 {
17207 /* For vectors of two 64-bit elements, we can do even better. */
17208 if (n_elts == 2
17209 && (inner_mode == E_DImode
17210 || inner_mode == E_DFmode))
17211
17212 {
17213 rtx x0 = XVECEXP (vals, 0, 0);
17214 rtx x1 = XVECEXP (vals, 0, 1);
17215 /* Combine can pick up this case, but handling it directly
17216 here leaves clearer RTL.
17217
17218 This is load_pair_lanes<mode>, and also gives us a clean-up
17219 for store_pair_lanes<mode>. */
17220 if (memory_operand (x0, inner_mode)
17221 && memory_operand (x1, inner_mode)
17222 && !STRICT_ALIGNMENT
17223 && rtx_equal_p (XEXP (x1, 0),
17224 plus_constant (Pmode,
17225 XEXP (x0, 0),
17226 GET_MODE_SIZE (inner_mode))))
17227 {
17228 rtx t;
17229 if (inner_mode == DFmode)
17230 t = gen_load_pair_lanesdf (target, x0, x1);
17231 else
17232 t = gen_load_pair_lanesdi (target, x0, x1);
17233 emit_insn (t);
17234 return;
17235 }
17236 }
17237 /* The subreg-move sequence below will move into lane zero of the
17238 vector register. For big-endian we want that position to hold
17239 the last element of VALS. */
17240 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17241 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17242 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17243 }
17244 else
17245 {
17246 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17247 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17248 }
17249
17250 /* Insert the rest. */
17251 for (int i = 0; i < n_elts; i++)
17252 {
17253 rtx x = XVECEXP (vals, 0, i);
17254 if (matches[i][0] == maxelement)
17255 continue;
17256 x = copy_to_mode_reg (inner_mode, x);
17257 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17258 }
17259 return;
17260 }
17261
17262 /* Initialise a vector which is part-variable. We want to first try
17263 to build those lanes which are constant in the most efficient way we
17264 can. */
17265 if (n_var != n_elts)
17266 {
17267 rtx copy = copy_rtx (vals);
17268
17269 /* Load constant part of vector. We really don't care what goes into the
17270 parts we will overwrite, but we're more likely to be able to load the
17271 constant efficiently if it has fewer, larger, repeating parts
17272 (see aarch64_simd_valid_immediate). */
17273 for (int i = 0; i < n_elts; i++)
17274 {
17275 rtx x = XVECEXP (vals, 0, i);
17276 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17277 continue;
17278 rtx subst = any_const;
17279 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17280 {
17281 /* Look in the copied vector, as more elements are const. */
17282 rtx test = XVECEXP (copy, 0, i ^ bit);
17283 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17284 {
17285 subst = test;
17286 break;
17287 }
17288 }
17289 XVECEXP (copy, 0, i) = subst;
17290 }
17291 aarch64_expand_vector_init (target, copy);
17292 }
17293
17294 /* Insert the variable lanes directly. */
17295 for (int i = 0; i < n_elts; i++)
17296 {
17297 rtx x = XVECEXP (vals, 0, i);
17298 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17299 continue;
17300 x = copy_to_mode_reg (inner_mode, x);
17301 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17302 }
17303 }
17304
17305 /* Emit RTL corresponding to:
17306 insr TARGET, ELEM. */
17307
17308 static void
17309 emit_insr (rtx target, rtx elem)
17310 {
17311 machine_mode mode = GET_MODE (target);
17312 scalar_mode elem_mode = GET_MODE_INNER (mode);
17313 elem = force_reg (elem_mode, elem);
17314
17315 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17316 gcc_assert (icode != CODE_FOR_nothing);
17317 emit_insn (GEN_FCN (icode) (target, target, elem));
17318 }
17319
17320 /* Subroutine of aarch64_sve_expand_vector_init for handling
17321 trailing constants.
17322 This function works as follows:
17323 (a) Create a new vector consisting of trailing constants.
17324 (b) Initialize TARGET with the constant vector using emit_move_insn.
17325 (c) Insert remaining elements in TARGET using insr.
17326 NELTS is the total number of elements in the original vector, while
17327 NELTS_REQD is the number of elements that are actually
17328 significant.
17329
17330 ??? The heuristic used is to do the above only if the number of constants
17331 is at least half the total number of elements. May need fine-tuning. */
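/* As an illustration, for BUILDER = {a, b, 1, 2} with NELTS_REQD == 4 the
   two trailing constants meet the threshold, so TARGET is first loaded from
   a constant vector that starts with {1, 2, ...} and the variable elements
   are then inserted with "insr TARGET, b" followed by "insr TARGET, a",
   leaving {a, b, 1, 2, ...}.  */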
17332
17333 static bool
17334 aarch64_sve_expand_vector_init_handle_trailing_constants
17335 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17336 {
17337 machine_mode mode = GET_MODE (target);
17338 scalar_mode elem_mode = GET_MODE_INNER (mode);
17339 int n_trailing_constants = 0;
17340
17341 for (int i = nelts_reqd - 1;
17342 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17343 i--)
17344 n_trailing_constants++;
17345
17346 if (n_trailing_constants >= nelts_reqd / 2)
17347 {
17348 rtx_vector_builder v (mode, 1, nelts);
17349 for (int i = 0; i < nelts; i++)
17350 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17351 rtx const_vec = v.build ();
17352 emit_move_insn (target, const_vec);
17353
17354 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17355 emit_insr (target, builder.elt (i));
17356
17357 return true;
17358 }
17359
17360 return false;
17361 }
17362
17363 /* Subroutine of aarch64_sve_expand_vector_init.
17364 Works as follows:
17365 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17366 (b) Skip trailing elements from BUILDER, which are the same as
17367 element NELTS_REQD - 1.
17368 (c) Insert earlier elements in reverse order in TARGET using insr. */
17369
17370 static void
17371 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17372 const rtx_vector_builder &builder,
17373 int nelts_reqd)
17374 {
17375 machine_mode mode = GET_MODE (target);
17376 scalar_mode elem_mode = GET_MODE_INNER (mode);
17377
17378 struct expand_operand ops[2];
17379 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17380 gcc_assert (icode != CODE_FOR_nothing);
17381
17382 create_output_operand (&ops[0], target, mode);
17383 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17384 expand_insn (icode, 2, ops);
17385
17386 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17387 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17388 emit_insr (target, builder.elt (i));
17389 }
17390
17391 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17392 when all trailing elements of builder are same.
17393 This works as follows:
17394 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17395 (b) Insert remaining elements in TARGET using insr.
17396
17397 ??? The heuristic used is to do the above if the number of identical trailing
17398 elements is at least 3/4 of the total number of elements, loosely based on
17399 the heuristic from mostly_zeros_p. May need fine-tuning. */
17400
17401 static bool
17402 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17403 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17404 {
17405 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17406 if (ndups >= (3 * nelts_reqd) / 4)
17407 {
17408 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17409 nelts_reqd - ndups + 1);
17410 return true;
17411 }
17412
17413 return false;
17414 }
17415
17416 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17417 of elements in BUILDER.
17418
17419 The function tries to initialize TARGET from BUILDER if it fits one
17420 of the special cases outlined below.
17421
17422 Failing that, the function divides BUILDER into two sub-vectors:
17423 v_even = even elements of BUILDER;
17424 v_odd = odd elements of BUILDER;
17425
17426 and recursively calls itself with v_even and v_odd.
17427
17428 if (recursive call succeeded for v_even or v_odd)
17429 TARGET = zip (v_even, v_odd)
17430
17431 The function returns true if it managed to build TARGET from BUILDER
17432 with one of the special cases, false otherwise.
17433
17434 Example: {a, 1, b, 2, c, 3, d, 4}
17435
17436 The vector gets divided into:
17437 v_even = {a, b, c, d}
17438 v_odd = {1, 2, 3, 4}
17439
17440 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17441 initialize tmp2 from constant vector v_odd using emit_move_insn.
17442
17443 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17444 4 elements, so we construct tmp1 from v_even using insr:
17445 tmp1 = dup(d)
17446 insr tmp1, c
17447 insr tmp1, b
17448 insr tmp1, a
17449
17450 And finally:
17451 TARGET = zip (tmp1, tmp2)
17452 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17453
17454 static bool
17455 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17456 int nelts, int nelts_reqd)
17457 {
17458 machine_mode mode = GET_MODE (target);
17459
17460 /* Case 1: Vector contains trailing constants. */
17461
17462 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17463 (target, builder, nelts, nelts_reqd))
17464 return true;
17465
17466 /* Case 2: Vector contains leading constants. */
17467
17468 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17469 for (int i = 0; i < nelts_reqd; i++)
17470 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17471 rev_builder.finalize ();
17472
17473 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17474 (target, rev_builder, nelts, nelts_reqd))
17475 {
17476 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17477 return true;
17478 }
17479
17480 /* Case 3: Vector contains trailing same element. */
17481
17482 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17483 (target, builder, nelts_reqd))
17484 return true;
17485
17486 /* Case 4: Vector contains leading same element. */
17487
17488 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17489 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17490 {
17491 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17492 return true;
17493 }
17494
17495 /* Avoid recursing below 4-elements.
17496 ??? The threshold 4 may need fine-tuning. */
17497
17498 if (nelts_reqd <= 4)
17499 return false;
17500
17501 rtx_vector_builder v_even (mode, 1, nelts);
17502 rtx_vector_builder v_odd (mode, 1, nelts);
17503
17504 for (int i = 0; i < nelts * 2; i += 2)
17505 {
17506 v_even.quick_push (builder.elt (i));
17507 v_odd.quick_push (builder.elt (i + 1));
17508 }
17509
17510 v_even.finalize ();
17511 v_odd.finalize ();
17512
17513 rtx tmp1 = gen_reg_rtx (mode);
17514 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
17515 nelts, nelts_reqd / 2);
17516
17517 rtx tmp2 = gen_reg_rtx (mode);
17518 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
17519 nelts, nelts_reqd / 2);
17520
17521 if (!did_even_p && !did_odd_p)
17522 return false;
17523
17524 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17525 special cases and zip v_even, v_odd. */
17526
17527 if (!did_even_p)
17528 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
17529
17530 if (!did_odd_p)
17531 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
17532
17533 rtvec v = gen_rtvec (2, tmp1, tmp2);
17534 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
17535 return true;
17536 }
17537
17538 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17539
17540 void
17541 aarch64_sve_expand_vector_init (rtx target, rtx vals)
17542 {
17543 machine_mode mode = GET_MODE (target);
17544 int nelts = XVECLEN (vals, 0);
17545
17546 rtx_vector_builder v (mode, 1, nelts);
17547 for (int i = 0; i < nelts; i++)
17548 v.quick_push (XVECEXP (vals, 0, i));
17549 v.finalize ();
17550
17551 /* If neither of the sub-vectors of v could be initialized specially,
17552 then use INSR to insert all elements from v into TARGET.
17553 ??? This might not be optimal for vectors with large
17554 initializers, e.g. 16 elements or more.
17555 For nelts < 4, it probably isn't useful to handle specially. */
17556
17557 if (nelts < 4
17558 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
17559 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
17560 }
17561
17562 /* Check whether VALUE is a vector constant in which every element
17563 is either a power of 2 or a negated power of 2. If so, return
17564 a constant vector of log2s, and flip CODE between PLUS and MINUS
17565 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
17566
17567 static rtx
17568 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
17569 {
17570 if (GET_CODE (value) != CONST_VECTOR)
17571 return NULL_RTX;
17572
17573 rtx_vector_builder builder;
17574 if (!builder.new_unary_operation (GET_MODE (value), value, false))
17575 return NULL_RTX;
17576
17577 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
17578 /* 1 if the result of the multiplication must be negated,
17579 0 if it mustn't, or -1 if we don't yet care. */
17580 int negate = -1;
17581 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
17582 for (unsigned int i = 0; i < encoded_nelts; ++i)
17583 {
17584 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
17585 if (!CONST_SCALAR_INT_P (elt))
17586 return NULL_RTX;
17587 rtx_mode_t val (elt, int_mode);
17588 wide_int pow2 = wi::neg (val);
17589 if (val != pow2)
17590 {
17591 /* It matters whether we negate or not. Make that choice,
17592 and make sure that it's consistent with previous elements. */
17593 if (negate == !wi::neg_p (val))
17594 return NULL_RTX;
17595 negate = wi::neg_p (val);
17596 if (!negate)
17597 pow2 = val;
17598 }
17599 /* POW2 is now the value that we want to be a power of 2. */
17600 int shift = wi::exact_log2 (pow2);
17601 if (shift < 0)
17602 return NULL_RTX;
17603 builder.quick_push (gen_int_mode (shift, int_mode));
17604 }
17605 if (negate == -1)
17606 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17607 code = PLUS;
17608 else if (negate == 1)
17609 code = code == PLUS ? MINUS : PLUS;
17610 return builder.build ();
17611 }
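
/* For example (values chosen purely for illustration): the CONST_VECTOR
   {4, 4, 4, 4} is converted to the shift vector {2, 2, 2, 2} with CODE left
   unchanged, whereas {-8, -8, -8, -8} is converted to {3, 3, 3, 3} and CODE
   is flipped between PLUS and MINUS.  Mixed-sign or non-power-of-2 vectors
   such as {4, -4, 4, 4} or {4, 3, 4, 4} are rejected and NULL_RTX is
   returned.  */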
17612
17613 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17614 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17615 operands array, in the same order as for fma_optab. Return true if
17616 the function emitted all the necessary instructions, false if the caller
17617 should generate the pattern normally with the new OPERANDS array. */
17618
17619 bool
17620 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
17621 {
17622 machine_mode mode = GET_MODE (operands[0]);
17623 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
17624 {
17625 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
17626 NULL_RTX, true, OPTAB_DIRECT);
17627 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
17628 operands[3], product, operands[0], true,
17629 OPTAB_DIRECT);
17630 return true;
17631 }
17632 operands[2] = force_reg (mode, operands[2]);
17633 return false;
17634 }
17635
17636 /* Likewise, but for a conditional pattern. */
17637
17638 bool
17639 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
17640 {
17641 machine_mode mode = GET_MODE (operands[0]);
17642 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
17643 {
17644 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
17645 NULL_RTX, true, OPTAB_DIRECT);
17646 emit_insn (gen_cond (code, mode, operands[0], operands[1],
17647 operands[4], product, operands[5]));
17648 return true;
17649 }
17650 operands[3] = force_reg (mode, operands[3]);
17651 return false;
17652 }
17653
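/* Implement TARGET_SHIFT_TRUNCATION_MASK.  Return the mask of bits that are
   significant in a shift count, or 0 if shift counts should not be assumed
   to be truncated (which is always the case for vector modes).  */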
17654 static unsigned HOST_WIDE_INT
17655 aarch64_shift_truncation_mask (machine_mode mode)
17656 {
17657 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
17658 return 0;
17659 return GET_MODE_UNIT_BITSIZE (mode) - 1;
17660 }
17661
17662 /* Select a format to encode pointers in exception handling data. */
17663 int
17664 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
17665 {
17666 int type;
17667 switch (aarch64_cmodel)
17668 {
17669 case AARCH64_CMODEL_TINY:
17670 case AARCH64_CMODEL_TINY_PIC:
17671 case AARCH64_CMODEL_SMALL:
17672 case AARCH64_CMODEL_SMALL_PIC:
17673 case AARCH64_CMODEL_SMALL_SPIC:
17674 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
17675 for everything. */
17676 type = DW_EH_PE_sdata4;
17677 break;
17678 default:
17679 /* No assumptions here. 8-byte relocs required. */
17680 type = DW_EH_PE_sdata8;
17681 break;
17682 }
17683 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
17684 }
17685
17686 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
17687
17688 static void
17689 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
17690 {
17691 if (TREE_CODE (decl) == FUNCTION_DECL)
17692 {
17693 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
17694 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
17695 {
17696 fprintf (stream, "\t.variant_pcs\t");
17697 assemble_name (stream, name);
17698 fprintf (stream, "\n");
17699 }
17700 }
17701 }
17702
17703 /* The last .arch and .tune assembly strings that we printed. */
17704 static std::string aarch64_last_printed_arch_string;
17705 static std::string aarch64_last_printed_tune_string;
17706
17707 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
17708 by the function fndecl. */
17709
17710 void
17711 aarch64_declare_function_name (FILE *stream, const char* name,
17712 tree fndecl)
17713 {
17714 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
17715
17716 struct cl_target_option *targ_options;
17717 if (target_parts)
17718 targ_options = TREE_TARGET_OPTION (target_parts);
17719 else
17720 targ_options = TREE_TARGET_OPTION (target_option_current_node);
17721 gcc_assert (targ_options);
17722
17723 const struct processor *this_arch
17724 = aarch64_get_arch (targ_options->x_explicit_arch);
17725
17726 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
17727 std::string extension
17728 = aarch64_get_extension_string_for_isa_flags (isa_flags,
17729 this_arch->flags);
17730 /* Only update the assembler .arch string if it is distinct from the last
17731 such string we printed. */
17732 std::string to_print = this_arch->name + extension;
17733 if (to_print != aarch64_last_printed_arch_string)
17734 {
17735 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
17736 aarch64_last_printed_arch_string = to_print;
17737 }
17738
17739 /* Print the cpu name we're tuning for in the comments; it might be
17740 useful to readers of the generated asm. Do it only when it changes
17741 from function to function and verbose assembly is requested. */
17742 const struct processor *this_tune
17743 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
17744
17745 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
17746 {
17747 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
17748 this_tune->name);
17749 aarch64_last_printed_tune_string = this_tune->name;
17750 }
17751
17752 aarch64_asm_output_variant_pcs (stream, fndecl, name);
17753
17754 /* Don't forget the type directive for ELF. */
17755 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
17756 ASM_OUTPUT_LABEL (stream, name);
17757 }
17758
17759 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
17760
17761 void
17762 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
17763 {
17764 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
17765 const char *value = IDENTIFIER_POINTER (target);
17766 aarch64_asm_output_variant_pcs (stream, decl, name);
17767 ASM_OUTPUT_DEF (stream, name, value);
17768 }
17769
17770 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
17771 function symbol references. */
17772
17773 void
17774 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
17775 {
17776 default_elf_asm_output_external (stream, decl, name);
17777 aarch64_asm_output_variant_pcs (stream, decl, name);
17778 }
17779
17780 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
17781 Used to output the .cfi_b_key_frame directive when signing the current
17782 function with the B key. */
17783
17784 void
17785 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
17786 {
17787 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
17788 && aarch64_ra_sign_key == AARCH64_KEY_B)
17789 asm_fprintf (f, "\t.cfi_b_key_frame\n");
17790 }
17791
17792 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
17793
17794 static void
17795 aarch64_start_file (void)
17796 {
17797 struct cl_target_option *default_options
17798 = TREE_TARGET_OPTION (target_option_default_node);
17799
17800 const struct processor *default_arch
17801 = aarch64_get_arch (default_options->x_explicit_arch);
17802 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
17803 std::string extension
17804 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
17805 default_arch->flags);
17806
17807 aarch64_last_printed_arch_string = default_arch->name + extension;
17808 aarch64_last_printed_tune_string = "";
17809 asm_fprintf (asm_out_file, "\t.arch %s\n",
17810 aarch64_last_printed_arch_string.c_str ());
17811
17812 default_file_start ();
17813 }
17814
17815 /* Emit load exclusive. */
17816
17817 static void
17818 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
17819 rtx mem, rtx model_rtx)
17820 {
17821 if (mode == TImode)
17822 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
17823 gen_highpart (DImode, rval),
17824 mem, model_rtx));
17825 else
17826 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
17827 }
17828
17829 /* Emit store exclusive. */
17830
17831 static void
17832 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
17833 rtx mem, rtx rval, rtx model_rtx)
17834 {
17835 if (mode == TImode)
17836 emit_insn (gen_aarch64_store_exclusive_pair
17837 (bval, mem, operand_subword (rval, 0, 0, TImode),
17838 operand_subword (rval, 1, 0, TImode), model_rtx));
17839 else
17840 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
17841 }
17842
17843 /* Emit jump pattern INSN and mark the jump as very unlikely to be taken. */
17844
17845 static void
17846 aarch64_emit_unlikely_jump (rtx insn)
17847 {
17848 rtx_insn *jump = emit_jump_insn (insn);
17849 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
17850 }
17851
17852 /* We store the names of the various atomic helpers in a 5x4 array.
17853 Return the libcall function given MODE, MODEL and NAMES. */
17854
17855 rtx
17856 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
17857 const atomic_ool_names *names)
17858 {
17859 memmodel model = memmodel_base (INTVAL (model_rtx));
17860 int mode_idx, model_idx;
17861
17862 switch (mode)
17863 {
17864 case E_QImode:
17865 mode_idx = 0;
17866 break;
17867 case E_HImode:
17868 mode_idx = 1;
17869 break;
17870 case E_SImode:
17871 mode_idx = 2;
17872 break;
17873 case E_DImode:
17874 mode_idx = 3;
17875 break;
17876 case E_TImode:
17877 mode_idx = 4;
17878 break;
17879 default:
17880 gcc_unreachable ();
17881 }
17882
17883 switch (model)
17884 {
17885 case MEMMODEL_RELAXED:
17886 model_idx = 0;
17887 break;
17888 case MEMMODEL_CONSUME:
17889 case MEMMODEL_ACQUIRE:
17890 model_idx = 1;
17891 break;
17892 case MEMMODEL_RELEASE:
17893 model_idx = 2;
17894 break;
17895 case MEMMODEL_ACQ_REL:
17896 case MEMMODEL_SEQ_CST:
17897 model_idx = 3;
17898 break;
17899 default:
17900 gcc_unreachable ();
17901 }
17902
17903 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
17904 VISIBILITY_HIDDEN);
17905 }
17906
17907 #define DEF0(B, N) \
17908 { "__aarch64_" #B #N "_relax", \
17909 "__aarch64_" #B #N "_acq", \
17910 "__aarch64_" #B #N "_rel", \
17911 "__aarch64_" #B #N "_acq_rel" }
17912
17913 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17914 { NULL, NULL, NULL, NULL }
17915 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17916
17917 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
17918 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
17919 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
17920 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
17921 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
17922 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
17923
17924 #undef DEF0
17925 #undef DEF4
17926 #undef DEF5
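
/* For example, with the tables above aarch64_ool_swp_names.str[2][1] is
   "__aarch64_swp4_acq", which is what aarch64_atomic_ool_func returns for an
   SImode swap with MEMMODEL_ACQUIRE.  The 16-byte entries are NULL in the
   DEF4 tables, so TImode is only usable with aarch64_ool_cas_names.  */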
17927
17928 /* Expand a compare and swap pattern. */
17929
17930 void
17931 aarch64_expand_compare_and_swap (rtx operands[])
17932 {
17933 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
17934 machine_mode mode, r_mode;
17935
17936 bval = operands[0];
17937 rval = operands[1];
17938 mem = operands[2];
17939 oldval = operands[3];
17940 newval = operands[4];
17941 is_weak = operands[5];
17942 mod_s = operands[6];
17943 mod_f = operands[7];
17944 mode = GET_MODE (mem);
17945
17946 /* Normally the succ memory model must be stronger than fail, but in the
17947 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17948 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17949 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
17950 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
17951 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
17952
17953 r_mode = mode;
17954 if (mode == QImode || mode == HImode)
17955 {
17956 r_mode = SImode;
17957 rval = gen_reg_rtx (r_mode);
17958 }
17959
17960 if (TARGET_LSE)
17961 {
17962 /* The CAS insn requires oldval and rval overlap, but we need to
17963 have a copy of oldval saved across the operation to tell if
17964 the operation is successful. */
17965 if (reg_overlap_mentioned_p (rval, oldval))
17966 rval = copy_to_mode_reg (r_mode, oldval);
17967 else
17968 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
17969
17970 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
17971 newval, mod_s));
17972 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17973 }
17974 else if (TARGET_OUTLINE_ATOMICS)
17975 {
17976 /* Oldval must satisfy compare afterward. */
17977 if (!aarch64_plus_operand (oldval, mode))
17978 oldval = force_reg (mode, oldval);
17979 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
17980 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
17981 oldval, mode, newval, mode,
17982 XEXP (mem, 0), Pmode);
17983 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17984 }
17985 else
17986 {
17987 /* The oldval predicate varies by mode. Test it and force to reg. */
17988 insn_code code = code_for_aarch64_compare_and_swap (mode);
17989 if (!insn_data[code].operand[2].predicate (oldval, mode))
17990 oldval = force_reg (mode, oldval);
17991
17992 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17993 is_weak, mod_s, mod_f));
17994 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17995 }
17996
17997 if (r_mode != mode)
17998 rval = gen_lowpart (mode, rval);
17999 emit_move_insn (operands[1], rval);
18000
18001 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18002 emit_insn (gen_rtx_SET (bval, x));
18003 }
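
/* As an illustration (the exact instructions depend on the memory model): a
   32-bit strong __atomic_compare_exchange_n with __ATOMIC_SEQ_CST reaches
   this expander with MODE == SImode.  With TARGET_LSE it becomes a single
   compare-and-swap instruction (typically CASAL for this model), with
   TARGET_OUTLINE_ATOMICS a call to __aarch64_cas4_acq_rel, and otherwise a
   load/store-exclusive loop that aarch64_split_compare_and_swap later
   splits out.  */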
18004
18005 /* Emit a barrier appropriate for memory model MODEL at the end of a
18006 sequence implementing an atomic operation. */
18007
18008 static void
18009 aarch64_emit_post_barrier (enum memmodel model)
18010 {
18011 const enum memmodel base_model = memmodel_base (model);
18012
18013 if (is_mm_sync (model)
18014 && (base_model == MEMMODEL_ACQUIRE
18015 || base_model == MEMMODEL_ACQ_REL
18016 || base_model == MEMMODEL_SEQ_CST))
18017 {
18018 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18019 }
18020 }
18021
18022 /* Split a compare and swap pattern. */
18023
18024 void
18025 aarch64_split_compare_and_swap (rtx operands[])
18026 {
18027 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18028 machine_mode mode;
18029 bool is_weak;
18030 rtx_code_label *label1, *label2;
18031 enum memmodel model;
18032
18033 rval = operands[0];
18034 mem = operands[1];
18035 oldval = operands[2];
18036 newval = operands[3];
18037 is_weak = (operands[4] != const0_rtx);
18038 model_rtx = operands[5];
18039 scratch = operands[7];
18040 mode = GET_MODE (mem);
18041 model = memmodel_from_int (INTVAL (model_rtx));
18042
18043 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18044 loop:
18045 .label1:
18046 LD[A]XR rval, [mem]
18047 CBNZ rval, .label2
18048 ST[L]XR scratch, newval, [mem]
18049 CBNZ scratch, .label1
18050 .label2:
18051 CMP rval, 0. */
18052 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
18053 oldval == const0_rtx && mode != TImode);
18054
18055 label1 = NULL;
18056 if (!is_weak)
18057 {
18058 label1 = gen_label_rtx ();
18059 emit_label (label1);
18060 }
18061 label2 = gen_label_rtx ();
18062
18063 /* The initial load can be relaxed for a __sync operation since a final
18064 barrier will be emitted to stop code hoisting. */
18065 if (is_mm_sync (model))
18066 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18067 else
18068 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18069
18070 if (strong_zero_p)
18071 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18072 else
18073 {
18074 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18075 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18076 }
18077 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18078 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18079 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18080
18081 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18082
18083 if (!is_weak)
18084 {
18085 if (aarch64_track_speculation)
18086 {
18087 /* Emit an explicit compare instruction, so that we can correctly
18088 track the condition codes. */
18089 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18090 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18091 }
18092 else
18093 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18094
18095 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18096 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18097 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18098 }
18099 else
18100 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18101
18102 emit_label (label2);
18103
18104 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
18105 to set the condition flags. If this is not used it will be removed by
18106 later passes. */
18107 if (strong_zero_p)
18108 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18109
18110 /* Emit any final barrier needed for a __sync operation. */
18111 if (is_mm_sync (model))
18112 aarch64_emit_post_barrier (model);
18113 }
18114
18115 /* Split an atomic operation. */
18116
18117 void
18118 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18119 rtx value, rtx model_rtx, rtx cond)
18120 {
18121 machine_mode mode = GET_MODE (mem);
18122 machine_mode wmode = (mode == DImode ? DImode : SImode);
18123 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18124 const bool is_sync = is_mm_sync (model);
18125 rtx_code_label *label;
18126 rtx x;
18127
18128 /* Split the atomic operation into a sequence. */
18129 label = gen_label_rtx ();
18130 emit_label (label);
18131
18132 if (new_out)
18133 new_out = gen_lowpart (wmode, new_out);
18134 if (old_out)
18135 old_out = gen_lowpart (wmode, old_out);
18136 else
18137 old_out = new_out;
18138 value = simplify_gen_subreg (wmode, value, mode, 0);
18139
18140 /* The initial load can be relaxed for a __sync operation since a final
18141 barrier will be emitted to stop code hoisting. */
18142 if (is_sync)
18143 aarch64_emit_load_exclusive (mode, old_out, mem,
18144 GEN_INT (MEMMODEL_RELAXED));
18145 else
18146 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18147
18148 switch (code)
18149 {
18150 case SET:
18151 new_out = value;
18152 break;
18153
18154 case NOT:
18155 x = gen_rtx_AND (wmode, old_out, value);
18156 emit_insn (gen_rtx_SET (new_out, x));
18157 x = gen_rtx_NOT (wmode, new_out);
18158 emit_insn (gen_rtx_SET (new_out, x));
18159 break;
18160
18161 case MINUS:
18162 if (CONST_INT_P (value))
18163 {
18164 value = GEN_INT (-INTVAL (value));
18165 code = PLUS;
18166 }
18167 /* Fall through. */
18168
18169 default:
18170 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18171 emit_insn (gen_rtx_SET (new_out, x));
18172 break;
18173 }
18174
18175 aarch64_emit_store_exclusive (mode, cond, mem,
18176 gen_lowpart (mode, new_out), model_rtx);
18177
18178 if (aarch64_track_speculation)
18179 {
18180 /* Emit an explicit compare instruction, so that we can correctly
18181 track the condition codes. */
18182 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18183 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18184 }
18185 else
18186 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18187
18188 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18189 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18190 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18191
18192 /* Emit any final barrier needed for a __sync operation. */
18193 if (is_sync)
18194 aarch64_emit_post_barrier (model);
18195 }
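
/* For example, an SImode fetch-and-add with a relaxed memory model is split
   into a loop of roughly this shape (register names are purely
   illustrative):

     .retry:
	ldxr	w0, [x2]	   // old_out
	add	w1, w0, w3	   // new_out = old_out + value
	stxr	w4, w1, [x2]	   // cond = store-exclusive status
	cbnz	w4, .retry

   with acquire/release variants of the exclusive accesses, and possibly a
   trailing barrier, chosen according to MODEL_RTX.  */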
18196
18197 static void
18198 aarch64_init_libfuncs (void)
18199 {
18200 /* Half-precision float operations. The compiler handles all operations
18201 with NULL libfuncs by converting to SFmode. */
18202
18203 /* Conversions. */
18204 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18205 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18206
18207 /* Arithmetic. */
18208 set_optab_libfunc (add_optab, HFmode, NULL);
18209 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18210 set_optab_libfunc (smul_optab, HFmode, NULL);
18211 set_optab_libfunc (neg_optab, HFmode, NULL);
18212 set_optab_libfunc (sub_optab, HFmode, NULL);
18213
18214 /* Comparisons. */
18215 set_optab_libfunc (eq_optab, HFmode, NULL);
18216 set_optab_libfunc (ne_optab, HFmode, NULL);
18217 set_optab_libfunc (lt_optab, HFmode, NULL);
18218 set_optab_libfunc (le_optab, HFmode, NULL);
18219 set_optab_libfunc (ge_optab, HFmode, NULL);
18220 set_optab_libfunc (gt_optab, HFmode, NULL);
18221 set_optab_libfunc (unord_optab, HFmode, NULL);
18222 }
18223
18224 /* Target hook for c_mode_for_suffix. */
18225 static machine_mode
18226 aarch64_c_mode_for_suffix (char suffix)
18227 {
18228 if (suffix == 'q')
18229 return TFmode;
18230
18231 return VOIDmode;
18232 }
18233
18234 /* We can only represent floating point constants which will fit in
18235 "quarter-precision" values. These values are characterised by
18236 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
18237 by:
18238
18239 (-1)^s * (n/16) * 2^r
18240
18241 Where:
18242 's' is the sign bit.
18243 'n' is an integer in the range 16 <= n <= 31.
18244 'r' is an integer in the range -3 <= r <= 4. */
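
/* For example, 0.125 (= 16/16 * 2^-3), 0.5, 1.0, 17.0 (= 17/16 * 2^4) and
   31.0 (= 31/16 * 2^4) are representable in this form, while 0.0, 0.1 and
   32.0 are not.  */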
18245
18246 /* Return true iff X can be represented as a quarter-precision
18247 floating point immediate operand. Note, we cannot represent 0.0. */
18248 bool
18249 aarch64_float_const_representable_p (rtx x)
18250 {
18251 /* This represents our current view of how many bits
18252 make up the mantissa. */
18253 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18254 int exponent;
18255 unsigned HOST_WIDE_INT mantissa, mask;
18256 REAL_VALUE_TYPE r, m;
18257 bool fail;
18258
18259 x = unwrap_const_vec_duplicate (x);
18260 if (!CONST_DOUBLE_P (x))
18261 return false;
18262
18263 if (GET_MODE (x) == VOIDmode
18264 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18265 return false;
18266
18267 r = *CONST_DOUBLE_REAL_VALUE (x);
18268
18269 /* We cannot represent infinities, NaNs or +/-zero. We won't
18270 know if we have +zero until we analyse the mantissa, but we
18271 can reject the other invalid values. */
18272 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18273 || REAL_VALUE_MINUS_ZERO (r))
18274 return false;
18275
18276 /* Extract exponent. */
18277 r = real_value_abs (&r);
18278 exponent = REAL_EXP (&r);
18279
18280 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
18281 highest (sign) bit, with a fixed binary point at bit point_pos.
18282 The low half of W holds the low part of the mantissa, the high half the high part.
18283 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18284 bits for the mantissa, this can fail (low bits will be lost). */
18285 real_ldexp (&m, &r, point_pos - exponent);
18286 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18287
18288 /* If the low part of the mantissa has bits set we cannot represent
18289 the value. */
18290 if (w.ulow () != 0)
18291 return false;
18292 /* We have rejected the lower HOST_WIDE_INT, so update our
18293 understanding of how many bits lie in the mantissa and
18294 look only at the high HOST_WIDE_INT. */
18295 mantissa = w.elt (1);
18296 point_pos -= HOST_BITS_PER_WIDE_INT;
18297
18298 /* We can only represent values with a mantissa of the form 1.xxxx. */
18299 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18300 if ((mantissa & mask) != 0)
18301 return false;
18302
18303 /* Having filtered unrepresentable values, we may now remove all
18304 but the highest 5 bits. */
18305 mantissa >>= point_pos - 5;
18306
18307 /* We cannot represent the value 0.0, so reject it. This is handled
18308 elsewhere. */
18309 if (mantissa == 0)
18310 return false;
18311
18312 /* Then, as bit 4 is always set, we can mask it off, leaving
18313 the mantissa in the range [0, 15]. */
18314 mantissa &= ~(1 << 4);
18315 gcc_assert (mantissa <= 15);
18316
18317 /* GCC internally does not use an IEEE754-like encoding (where normalized
18318 significands are in the range [1, 2)); it uses [0.5, 1) (see real.c).
18319 Our mantissa values are shifted 4 places to the left relative to
18320 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18321 by 5 places to correct for GCC's representation. */
18322 exponent = 5 - exponent;
18323
18324 return (exponent >= 0 && exponent <= 7);
18325 }
18326
18327 /* Return the string with the AdvSIMD MOVI, MVNI, ORR or BIC immediate
18328 instruction for moving CONST_VECTOR into a vector of WIDTH bits. WHICH
18329 selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
18330 char*
18331 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18332 enum simd_immediate_check which)
18333 {
18334 bool is_valid;
18335 static char templ[40];
18336 const char *mnemonic;
18337 const char *shift_op;
18338 unsigned int lane_count = 0;
18339 char element_char;
18340
18341 struct simd_immediate_info info;
18342
18343 /* This will return true to show const_vector is legal for use as either
18344 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18345 It will also update INFO to show how the immediate should be generated.
18346 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18347 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18348 gcc_assert (is_valid);
18349
18350 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18351 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18352
18353 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18354 {
18355 gcc_assert (info.insn == simd_immediate_info::MOV
18356 && info.u.mov.shift == 0);
18357 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18358 move immediate path. */
18359 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18360 info.u.mov.value = GEN_INT (0);
18361 else
18362 {
18363 const unsigned int buf_size = 20;
18364 char float_buf[buf_size] = {'\0'};
18365 real_to_decimal_for_mode (float_buf,
18366 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18367 buf_size, buf_size, 1, info.elt_mode);
18368
18369 if (lane_count == 1)
18370 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18371 else
18372 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18373 lane_count, element_char, float_buf);
18374 return templ;
18375 }
18376 }
18377
18378 gcc_assert (CONST_INT_P (info.u.mov.value));
18379
18380 if (which == AARCH64_CHECK_MOV)
18381 {
18382 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18383 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18384 ? "msl" : "lsl");
18385 if (lane_count == 1)
18386 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18387 mnemonic, UINTVAL (info.u.mov.value));
18388 else if (info.u.mov.shift)
18389 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18390 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18391 element_char, UINTVAL (info.u.mov.value), shift_op,
18392 info.u.mov.shift);
18393 else
18394 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18395 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18396 element_char, UINTVAL (info.u.mov.value));
18397 }
18398 else
18399 {
18400 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18401 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18402 if (info.u.mov.shift)
18403 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18404 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18405 element_char, UINTVAL (info.u.mov.value), "lsl",
18406 info.u.mov.shift);
18407 else
18408 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18409 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18410 element_char, UINTVAL (info.u.mov.value));
18411 }
18412 return templ;
18413 }
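
/* A purely illustrative example of the templates built above: a V4SImode
   constant whose elements are all 256 would be emitted via the MOVI path as
   "movi\t%0.4s, 0x1, lsl 8", while an immediate with lane_count == 1 uses
   the scalar "%d0" form instead.  */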
18414
18415 char*
18416 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18417 {
18418
18419 /* If a floating point number was passed and we want to use it in an
18420 integer mode, do the conversion to integer. */
18421 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18422 {
18423 unsigned HOST_WIDE_INT ival;
18424 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18425 gcc_unreachable ();
18426 immediate = gen_int_mode (ival, mode);
18427 }
18428
18429 machine_mode vmode;
18430 /* Use a 64-bit container for everything except 64-bit (DI/DF) modes,
18431 where we use a 128-bit vector mode. */
18432 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18433
18434 vmode = aarch64_simd_container_mode (mode, width);
18435 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18436 return aarch64_output_simd_mov_immediate (v_op, width);
18437 }
18438
18439 /* Return the output string to use for moving immediate CONST_VECTOR
18440 into an SVE register. */
18441
18442 char *
18443 aarch64_output_sve_mov_immediate (rtx const_vector)
18444 {
18445 static char templ[40];
18446 struct simd_immediate_info info;
18447 char element_char;
18448
18449 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18450 gcc_assert (is_valid);
18451
18452 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18453
18454 machine_mode vec_mode = GET_MODE (const_vector);
18455 if (aarch64_sve_pred_mode_p (vec_mode))
18456 {
18457 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18458 if (info.insn == simd_immediate_info::MOV)
18459 {
18460 gcc_assert (info.u.mov.value == const0_rtx);
18461 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18462 }
18463 else
18464 {
18465 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18466 unsigned int total_bytes;
18467 if (info.u.pattern == AARCH64_SV_ALL
18468 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18469 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
18470 total_bytes / GET_MODE_SIZE (info.elt_mode));
18471 else
18472 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
18473 svpattern_token (info.u.pattern));
18474 }
18475 return buf;
18476 }
18477
18478 if (info.insn == simd_immediate_info::INDEX)
18479 {
18480 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
18481 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
18482 element_char, INTVAL (info.u.index.base),
18483 INTVAL (info.u.index.step));
18484 return templ;
18485 }
18486
18487 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18488 {
18489 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18490 info.u.mov.value = GEN_INT (0);
18491 else
18492 {
18493 const int buf_size = 20;
18494 char float_buf[buf_size] = {};
18495 real_to_decimal_for_mode (float_buf,
18496 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18497 buf_size, buf_size, 1, info.elt_mode);
18498
18499 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
18500 element_char, float_buf);
18501 return templ;
18502 }
18503 }
18504
18505 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
18506 element_char, INTVAL (info.u.mov.value));
18507 return templ;
18508 }
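
/* Some examples of the templates built above (operand syntax as written,
   before final substitution): a duplicated integer 5 in VNx4SImode gives
   "mov\t%0.s, #5", the linear series {0, 1, 2, ...} gives
   "index\t%0.s, #0, #1", and an all-true VNx16BImode predicate on a target
   with a known 16-byte vector length gives "ptrue\t%0.b, vl16".  */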
18509
18510 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18511 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18512 pattern. */
18513
18514 char *
18515 aarch64_output_sve_ptrues (rtx const_unspec)
18516 {
18517 static char templ[40];
18518
18519 struct simd_immediate_info info;
18520 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
18521 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
18522
18523 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18524 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
18525 svpattern_token (info.u.pattern));
18526 return templ;
18527 }
18528
18529 /* Split operands into moves from op[1] + op[2] into op[0]. */
18530
18531 void
18532 aarch64_split_combinev16qi (rtx operands[3])
18533 {
18534 unsigned int dest = REGNO (operands[0]);
18535 unsigned int src1 = REGNO (operands[1]);
18536 unsigned int src2 = REGNO (operands[2]);
18537 machine_mode halfmode = GET_MODE (operands[1]);
18538 unsigned int halfregs = REG_NREGS (operands[1]);
18539 rtx destlo, desthi;
18540
18541 gcc_assert (halfmode == V16QImode);
18542
18543 if (src1 == dest && src2 == dest + halfregs)
18544 {
18545 /* No-op move. Can't split to nothing; emit something. */
18546 emit_note (NOTE_INSN_DELETED);
18547 return;
18548 }
18549
18550 /* Preserve register attributes for variable tracking. */
18551 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
18552 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
18553 GET_MODE_SIZE (halfmode));
18554
18555 /* Special case of reversed high/low parts. */
18556 if (reg_overlap_mentioned_p (operands[2], destlo)
18557 && reg_overlap_mentioned_p (operands[1], desthi))
18558 {
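      /* Swap the two inputs in place with the classic three-XOR trick, so
	 that no scratch register is needed.  */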
18559 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18560 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
18561 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18562 }
18563 else if (!reg_overlap_mentioned_p (operands[2], destlo))
18564 {
18565 /* Try to avoid unnecessary moves if part of the result
18566 is in the right place already. */
18567 if (src1 != dest)
18568 emit_move_insn (destlo, operands[1]);
18569 if (src2 != dest + halfregs)
18570 emit_move_insn (desthi, operands[2]);
18571 }
18572 else
18573 {
18574 if (src2 != dest + halfregs)
18575 emit_move_insn (desthi, operands[2]);
18576 if (src1 != dest)
18577 emit_move_insn (destlo, operands[1]);
18578 }
18579 }
18580
18581 /* vec_perm support. */
18582
18583 struct expand_vec_perm_d
18584 {
18585 rtx target, op0, op1;
18586 vec_perm_indices perm;
18587 machine_mode vmode;
18588 unsigned int vec_flags;
18589 bool one_vector_p;
18590 bool testing_p;
18591 };
18592
18593 /* Generate a variable permutation. */
18594
18595 static void
18596 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
18597 {
18598 machine_mode vmode = GET_MODE (target);
18599 bool one_vector_p = rtx_equal_p (op0, op1);
18600
18601 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
18602 gcc_checking_assert (GET_MODE (op0) == vmode);
18603 gcc_checking_assert (GET_MODE (op1) == vmode);
18604 gcc_checking_assert (GET_MODE (sel) == vmode);
18605 gcc_checking_assert (TARGET_SIMD);
18606
18607 if (one_vector_p)
18608 {
18609 if (vmode == V8QImode)
18610 {
18611 /* Expand the argument to a V16QI mode by duplicating it. */
18612 rtx pair = gen_reg_rtx (V16QImode);
18613 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
18614 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18615 }
18616 else
18617 {
18618 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
18619 }
18620 }
18621 else
18622 {
18623 rtx pair;
18624
18625 if (vmode == V8QImode)
18626 {
18627 pair = gen_reg_rtx (V16QImode);
18628 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
18629 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18630 }
18631 else
18632 {
18633 pair = gen_reg_rtx (OImode);
18634 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
18635 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
18636 }
18637 }
18638 }
18639
18640 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
18641 NELT is the number of elements in the vector. */
18642
18643 void
18644 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
18645 unsigned int nelt)
18646 {
18647 machine_mode vmode = GET_MODE (target);
18648 bool one_vector_p = rtx_equal_p (op0, op1);
18649 rtx mask;
18650
18651 /* The TBL instruction does not use a modulo index, so we must take care
18652 of that ourselves. */
18653 mask = aarch64_simd_gen_const_vector_dup (vmode,
18654 one_vector_p ? nelt - 1 : 2 * nelt - 1);
18655 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
18656
18657 /* For big-endian, we also need to reverse the index within the vector
18658 (but not which vector). */
18659 if (BYTES_BIG_ENDIAN)
18660 {
18661 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
18662 if (!one_vector_p)
18663 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
18664 sel = expand_simple_binop (vmode, XOR, sel, mask,
18665 NULL, 0, OPTAB_LIB_WIDEN);
18666 }
18667 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
18668 }
18669
18670 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
18671
18672 static void
18673 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
18674 {
18675 emit_insn (gen_rtx_SET (target,
18676 gen_rtx_UNSPEC (GET_MODE (target),
18677 gen_rtvec (2, op0, op1), code)));
18678 }
18679
18680 /* Expand an SVE vec_perm with the given operands. */
18681
18682 void
18683 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
18684 {
18685 machine_mode data_mode = GET_MODE (target);
18686 machine_mode sel_mode = GET_MODE (sel);
18687 /* Enforced by the pattern condition. */
18688 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
18689
18690 /* Note: vec_perm indices are supposed to wrap when they go beyond the
18691 size of the two value vectors, i.e. the upper bits of the indices
18692 are effectively ignored. SVE TBL instead produces 0 for any
18693 out-of-range indices, so we need to modulo all the vec_perm indices
18694 to ensure they are all in range. */
18695 rtx sel_reg = force_reg (sel_mode, sel);
18696
18697 /* Check if the sel only references the first values vector. */
18698 if (GET_CODE (sel) == CONST_VECTOR
18699 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
18700 {
18701 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
18702 return;
18703 }
18704
18705 /* Check if the two values vectors are the same. */
18706 if (rtx_equal_p (op0, op1))
18707 {
18708 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
18709 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
18710 NULL, 0, OPTAB_DIRECT);
18711 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
18712 return;
18713 }
18714
18715 /* Run TBL on each value vector and combine the results. */
18716
18717 rtx res0 = gen_reg_rtx (data_mode);
18718 rtx res1 = gen_reg_rtx (data_mode);
18719 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
18720 if (GET_CODE (sel) != CONST_VECTOR
18721 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
18722 {
18723 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
18724 2 * nunits - 1);
18725 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
18726 NULL, 0, OPTAB_DIRECT);
18727 }
18728 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
18729 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
18730 NULL, 0, OPTAB_DIRECT);
18731 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
18732 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
18733 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
18734 else
18735 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
18736 }
18737
18738 /* Recognize patterns suitable for the TRN instructions. */
18739 static bool
18740 aarch64_evpc_trn (struct expand_vec_perm_d *d)
18741 {
18742 HOST_WIDE_INT odd;
18743 poly_uint64 nelt = d->perm.length ();
18744 rtx out, in0, in1, x;
18745 machine_mode vmode = d->vmode;
18746
18747 if (GET_MODE_UNIT_SIZE (vmode) > 8)
18748 return false;
18749
18750 /* Note that these are little-endian tests.
18751 We correct for big-endian later. */
18752 if (!d->perm[0].is_constant (&odd)
18753 || (odd != 0 && odd != 1)
18754 || !d->perm.series_p (0, 2, odd, 2)
18755 || !d->perm.series_p (1, 2, nelt + odd, 2))
18756 return false;
18757
18758 /* Success! */
18759 if (d->testing_p)
18760 return true;
18761
18762 in0 = d->op0;
18763 in1 = d->op1;
18764 /* We don't need a big-endian lane correction for SVE; see the comment
18765 at the head of aarch64-sve.md for details. */
18766 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
18767 {
18768 x = in0, in0 = in1, in1 = x;
18769 odd = !odd;
18770 }
18771 out = d->target;
18772
18773 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
18774 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
18775 return true;
18776 }
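
/* For example, with V4SImode inputs the permutation {0, 4, 2, 6} is matched
   here as TRN1 and {1, 5, 3, 7} as TRN2 (before any big-endian correction).  */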
18777
18778 /* Recognize patterns suitable for the UZP instructions. */
18779 static bool
18780 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
18781 {
18782 HOST_WIDE_INT odd;
18783 rtx out, in0, in1, x;
18784 machine_mode vmode = d->vmode;
18785
18786 if (GET_MODE_UNIT_SIZE (vmode) > 8)
18787 return false;
18788
18789 /* Note that these are little-endian tests.
18790 We correct for big-endian later. */
18791 if (!d->perm[0].is_constant (&odd)
18792 || (odd != 0 && odd != 1)
18793 || !d->perm.series_p (0, 1, odd, 2))
18794 return false;
18795
18796 /* Success! */
18797 if (d->testing_p)
18798 return true;
18799
18800 in0 = d->op0;
18801 in1 = d->op1;
18802 /* We don't need a big-endian lane correction for SVE; see the comment
18803 at the head of aarch64-sve.md for details. */
18804 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
18805 {
18806 x = in0, in0 = in1, in1 = x;
18807 odd = !odd;
18808 }
18809 out = d->target;
18810
18811 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
18812 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
18813 return true;
18814 }
18815
18816 /* Recognize patterns suitable for the ZIP instructions. */
18817 static bool
18818 aarch64_evpc_zip (struct expand_vec_perm_d *d)
18819 {
18820 unsigned int high;
18821 poly_uint64 nelt = d->perm.length ();
18822 rtx out, in0, in1, x;
18823 machine_mode vmode = d->vmode;
18824
18825 if (GET_MODE_UNIT_SIZE (vmode) > 8)
18826 return false;
18827
18828 /* Note that these are little-endian tests.
18829 We correct for big-endian later. */
18830 poly_uint64 first = d->perm[0];
18831 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
18832 || !d->perm.series_p (0, 2, first, 1)
18833 || !d->perm.series_p (1, 2, first + nelt, 1))
18834 return false;
18835 high = maybe_ne (first, 0U);
18836
18837 /* Success! */
18838 if (d->testing_p)
18839 return true;
18840
18841 in0 = d->op0;
18842 in1 = d->op1;
18843 /* We don't need a big-endian lane correction for SVE; see the comment
18844 at the head of aarch64-sve.md for details. */
18845 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
18846 {
18847 x = in0, in0 = in1, in1 = x;
18848 high = !high;
18849 }
18850 out = d->target;
18851
18852 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
18853 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
18854 return true;
18855 }
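
/* For example, with V4SImode inputs the permutation {0, 4, 1, 5} is matched
   here as ZIP1 and {2, 6, 3, 7} as ZIP2 (before any big-endian correction).  */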
18856
18857 /* Recognize patterns for the EXT insn. */
18858
18859 static bool
18860 aarch64_evpc_ext (struct expand_vec_perm_d *d)
18861 {
18862 HOST_WIDE_INT location;
18863 rtx offset;
18864
18865 /* The first element always refers to the first vector.
18866 Check if the extracted indices are increasing by one. */
18867 if (d->vec_flags == VEC_SVE_PRED
18868 || !d->perm[0].is_constant (&location)
18869 || !d->perm.series_p (0, 1, location, 1))
18870 return false;
18871
18872 /* Success! */
18873 if (d->testing_p)
18874 return true;
18875
18876 /* The case where (location == 0) is a no-op for both big- and little-endian,
18877 and is removed by the mid-end at optimization levels -O1 and higher.
18878
18879 We don't need a big-endian lane correction for SVE; see the comment
18880 at the head of aarch64-sve.md for details. */
18881 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
18882 {
18883 /* After setup, we want the high elements of the first vector (stored
18884 at the LSB end of the register), and the low elements of the second
18885 vector (stored at the MSB end of the register). So swap. */
18886 std::swap (d->op0, d->op1);
18887 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
18888 to_constant () is safe since this is restricted to Advanced SIMD
18889 vectors. */
18890 location = d->perm.length ().to_constant () - location;
18891 }
18892
18893 offset = GEN_INT (location);
18894 emit_set_insn (d->target,
18895 gen_rtx_UNSPEC (d->vmode,
18896 gen_rtvec (3, d->op0, d->op1, offset),
18897 UNSPEC_EXT));
18898 return true;
18899 }
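
/* For example, with V4SImode inputs the permutation {1, 2, 3, 4} is matched
   here as an EXT of the two vectors with an element offset of 1.  */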
18900
18901 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
18902 within each 64-bit, 32-bit or 16-bit granule. */
18903
18904 static bool
18905 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
18906 {
18907 HOST_WIDE_INT diff;
18908 unsigned int i, size, unspec;
18909 machine_mode pred_mode;
18910
18911 if (d->vec_flags == VEC_SVE_PRED
18912 || !d->one_vector_p
18913 || !d->perm[0].is_constant (&diff))
18914 return false;
18915
18916 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
18917 if (size == 8)
18918 {
18919 unspec = UNSPEC_REV64;
18920 pred_mode = VNx2BImode;
18921 }
18922 else if (size == 4)
18923 {
18924 unspec = UNSPEC_REV32;
18925 pred_mode = VNx4BImode;
18926 }
18927 else if (size == 2)
18928 {
18929 unspec = UNSPEC_REV16;
18930 pred_mode = VNx8BImode;
18931 }
18932 else
18933 return false;
18934
18935 unsigned int step = diff + 1;
18936 for (i = 0; i < step; ++i)
18937 if (!d->perm.series_p (i, step, diff - i, step))
18938 return false;
18939
18940 /* Success! */
18941 if (d->testing_p)
18942 return true;
18943
18944 if (d->vec_flags == VEC_SVE_DATA)
18945 {
18946 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
18947 rtx target = gen_reg_rtx (int_mode);
18948 if (BYTES_BIG_ENDIAN)
18949 /* The act of taking a subreg between INT_MODE and d->vmode
18950 is itself a reversing operation on big-endian targets;
18951 see the comment at the head of aarch64-sve.md for details.
18952 First reinterpret OP0 as INT_MODE without using a subreg
18953 and without changing the contents. */
18954 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
18955 else
18956 {
18957 /* For SVE we use REV[BHW] unspecs derived from the element size
18958 of d->vmode, operating on integer vector modes whose elements have
18959 SIZE bytes. This ensures that the vector modes match the predicate modes. */
18960 int unspec = aarch64_sve_rev_unspec (d->vmode);
18961 rtx pred = aarch64_ptrue_reg (pred_mode);
18962 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
18963 gen_lowpart (int_mode, d->op0)));
18964 }
18965 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18966 return true;
18967 }
18968 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
18969 emit_set_insn (d->target, src);
18970 return true;
18971 }
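
/* For example, the single-vector V4SImode permutation {1, 0, 3, 2} swaps
   the elements within each 64-bit granule and is matched here as REV64.  */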
18972
18973 /* Recognize patterns for the REV insn, which reverses elements within
18974 a full vector. */
18975
18976 static bool
18977 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
18978 {
18979 poly_uint64 nelt = d->perm.length ();
18980
18981 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
18982 return false;
18983
18984 if (!d->perm.series_p (0, 1, nelt - 1, -1))
18985 return false;
18986
18987 /* Success! */
18988 if (d->testing_p)
18989 return true;
18990
18991 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
18992 emit_set_insn (d->target, src);
18993 return true;
18994 }
18995
18996 static bool
18997 aarch64_evpc_dup (struct expand_vec_perm_d *d)
18998 {
18999 rtx out = d->target;
19000 rtx in0;
19001 HOST_WIDE_INT elt;
19002 machine_mode vmode = d->vmode;
19003 rtx lane;
19004
19005 if (d->vec_flags == VEC_SVE_PRED
19006 || d->perm.encoding ().encoded_nelts () != 1
19007 || !d->perm[0].is_constant (&elt))
19008 return false;
19009
19010 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19011 return false;
19012
19013 /* Success! */
19014 if (d->testing_p)
19015 return true;
19016
19017 /* The generic preparation in aarch64_expand_vec_perm_const_1
19018 swaps the operand order and the permute indices if it finds
19019 d->perm[0] to be in the second operand. Thus, we can always
19020 use d->op0 and need not do any extra arithmetic to get the
19021 correct lane number. */
19022 in0 = d->op0;
19023 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19024
19025 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19026 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19027 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19028 return true;
19029 }
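
/* For example, the single-pattern V4SImode permutation {2, 2, 2, 2} is
   matched here as a DUP of lane 2 of the first input.  */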
19030
19031 static bool
19032 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19033 {
19034 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19035 machine_mode vmode = d->vmode;
19036
19037 /* Make sure that the indices are constant. */
19038 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19039 for (unsigned int i = 0; i < encoded_nelts; ++i)
19040 if (!d->perm[i].is_constant ())
19041 return false;
19042
19043 if (d->testing_p)
19044 return true;
19045
19046 /* Generic code will try constant permutation twice: once with the
19047 original mode and again with the elements lowered to QImode.
19048 So wait and don't do the selector expansion ourselves. */
19049 if (vmode != V8QImode && vmode != V16QImode)
19050 return false;
19051
19052 /* to_constant is safe since this routine is specific to Advanced SIMD
19053 vectors. */
19054 unsigned int nelt = d->perm.length ().to_constant ();
19055 for (unsigned int i = 0; i < nelt; ++i)
19056 /* If big-endian and two vectors, we end up with a weird mixed-endian
19057 mode on NEON. Reverse the index within each word but not the word
19058 itself. to_constant is safe because we checked is_constant above. */
19059 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19060 ? d->perm[i].to_constant () ^ (nelt - 1)
19061 : d->perm[i].to_constant ());
19062
19063 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19064 sel = force_reg (vmode, sel);
19065
19066 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19067 return true;
19068 }
19069
19070 /* Try to implement D using an SVE TBL instruction. */
19071
19072 static bool
19073 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19074 {
19075 unsigned HOST_WIDE_INT nelt;
19076
19077 /* Permuting two variable-length vectors could overflow the
19078 index range. */
19079 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19080 return false;
19081
19082 if (d->testing_p)
19083 return true;
19084
19085 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
19086 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19087 if (d->one_vector_p)
19088 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19089 else
19090 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19091 return true;
19092 }
19093
19094 /* Try to implement D using SVE SEL instruction. */
19095
19096 static bool
19097 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19098 {
19099 machine_mode vmode = d->vmode;
19100 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19101
19102 if (d->vec_flags != VEC_SVE_DATA
19103 || unit_size > 8)
19104 return false;
19105
19106 int n_patterns = d->perm.encoding ().npatterns ();
19107 poly_int64 vec_len = d->perm.length ();
19108
19109 for (int i = 0; i < n_patterns; ++i)
19110 if (!known_eq (d->perm[i], i)
19111 && !known_eq (d->perm[i], vec_len + i))
19112 return false;
19113
19114 for (int i = n_patterns; i < n_patterns * 2; i++)
19115 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19116 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19117 return false;
19118
19119 if (d->testing_p)
19120 return true;
19121
19122 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
19123
19124 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19125 for (int i = 0; i < n_patterns * 2; i++)
19126 {
19127 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19128 : CONST0_RTX (BImode);
19129 builder.quick_push (elem);
19130 }
19131
19132 rtx const_vec = builder.build ();
19133 rtx pred = force_reg (pred_mode, const_vec);
19134 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
19135 return true;
19136 }
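
/* For example, for VNx4SImode (writing N for the runtime number of elements)
   the permutation {0, N + 1, 2, N + 3, ...} interleaves elements of the two
   inputs at fixed positions and is matched here as a predicated SEL with an
   alternating predicate.  */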
19137
19138 static bool
19139 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19140 {
19141 /* The pattern matching functions above are written to look for a small
19142 number to begin the sequence (0, 1, N/2). If we begin with an index
19143 from the second operand, we can swap the operands. */
19144 poly_int64 nelt = d->perm.length ();
19145 if (known_ge (d->perm[0], nelt))
19146 {
19147 d->perm.rotate_inputs (1);
19148 std::swap (d->op0, d->op1);
19149 }
19150
19151 if ((d->vec_flags == VEC_ADVSIMD
19152 || d->vec_flags == VEC_SVE_DATA
19153 || d->vec_flags == VEC_SVE_PRED)
19154 && known_gt (nelt, 1))
19155 {
19156 if (aarch64_evpc_rev_local (d))
19157 return true;
19158 else if (aarch64_evpc_rev_global (d))
19159 return true;
19160 else if (aarch64_evpc_ext (d))
19161 return true;
19162 else if (aarch64_evpc_dup (d))
19163 return true;
19164 else if (aarch64_evpc_zip (d))
19165 return true;
19166 else if (aarch64_evpc_uzp (d))
19167 return true;
19168 else if (aarch64_evpc_trn (d))
19169 return true;
19170 else if (aarch64_evpc_sel (d))
19171 return true;
19172 if (d->vec_flags == VEC_SVE_DATA)
19173 return aarch64_evpc_sve_tbl (d);
19174 else if (d->vec_flags == VEC_ADVSIMD)
19175 return aarch64_evpc_tbl (d);
19176 }
19177 return false;
19178 }
19179
19180 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19181
19182 static bool
19183 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19184 rtx op1, const vec_perm_indices &sel)
19185 {
19186 struct expand_vec_perm_d d;
19187
19188 /* Check whether the mask can be applied to a single vector. */
19189 if (sel.ninputs () == 1
19190 || (op0 && rtx_equal_p (op0, op1)))
19191 d.one_vector_p = true;
19192 else if (sel.all_from_input_p (0))
19193 {
19194 d.one_vector_p = true;
19195 op1 = op0;
19196 }
19197 else if (sel.all_from_input_p (1))
19198 {
19199 d.one_vector_p = true;
19200 op0 = op1;
19201 }
19202 else
19203 d.one_vector_p = false;
19204
19205 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19206 sel.nelts_per_input ());
19207 d.vmode = vmode;
19208 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19209 d.target = target;
19210 d.op0 = op0;
19211 d.op1 = op1;
19212 d.testing_p = !target;
19213
19214 if (!d.testing_p)
19215 return aarch64_expand_vec_perm_const_1 (&d);
19216
19217 rtx_insn *last = get_last_insn ();
19218 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19219 gcc_assert (last == get_last_insn ());
19220
19221 return ret;
19222 }
19223
19224 /* Generate a byte permute mask for a register of mode MODE,
19225 which has NUNITS units. */
19226
19227 rtx
19228 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19229 {
19230 /* We have to reverse each vector because we don't have
19231 a permuted load that can reverse-load according to ABI rules. */
19232 rtx mask;
19233 rtvec v = rtvec_alloc (16);
19234 unsigned int i, j;
19235 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19236
19237 gcc_assert (BYTES_BIG_ENDIAN);
19238 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19239
19240 for (i = 0; i < nunits; i++)
19241 for (j = 0; j < usize; j++)
19242 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19243 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19244 return force_reg (V16QImode, mask);
19245 }
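/* Editor's note: a standalone sketch (not GCC code) of the byte indices the
   loop above produces; for four 4-byte units it prints
   3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12, i.e. the bytes of each element are
   reversed in place.  */
#if 0
#include <stdio.h>

static void
print_reverse_mask (unsigned int nunits, unsigned int usize)
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      printf ("%u ", (i + 1) * usize - 1 - j);
  printf ("\n");
}
#endif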
19246
19247 /* Expand an SVE integer comparison using the SVE equivalent of:
19248
19249 (set TARGET (CODE OP0 OP1)). */
19250
19251 void
19252 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19253 {
19254 machine_mode pred_mode = GET_MODE (target);
19255 machine_mode data_mode = GET_MODE (op0);
19256 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19257 op0, op1);
19258 if (!rtx_equal_p (target, res))
19259 emit_move_insn (target, res);
19260 }
19261
19262 /* Return the UNSPEC_COND_* code for comparison CODE. */
19263
19264 static unsigned int
19265 aarch64_unspec_cond_code (rtx_code code)
19266 {
19267 switch (code)
19268 {
19269 case NE:
19270 return UNSPEC_COND_FCMNE;
19271 case EQ:
19272 return UNSPEC_COND_FCMEQ;
19273 case LT:
19274 return UNSPEC_COND_FCMLT;
19275 case GT:
19276 return UNSPEC_COND_FCMGT;
19277 case LE:
19278 return UNSPEC_COND_FCMLE;
19279 case GE:
19280 return UNSPEC_COND_FCMGE;
19281 case UNORDERED:
19282 return UNSPEC_COND_FCMUO;
19283 default:
19284 gcc_unreachable ();
19285 }
19286 }
19287
19288 /* Emit:
19289
19290 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19291
19292 where <X> is the operation associated with comparison CODE.
19293 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19294
19295 static void
19296 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19297 bool known_ptrue_p, rtx op0, rtx op1)
19298 {
19299 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19300 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19301 gen_rtvec (4, pred, flag, op0, op1),
19302 aarch64_unspec_cond_code (code));
19303 emit_set_insn (target, unspec);
19304 }
19305
19306 /* Emit the SVE equivalent of:
19307
19308 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19309 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19310 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19311
19312 where <Xi> is the operation associated with comparison CODEi.
19313 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19314
19315 static void
19316 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19317 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19318 {
19319 machine_mode pred_mode = GET_MODE (pred);
19320 rtx tmp1 = gen_reg_rtx (pred_mode);
19321 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19322 rtx tmp2 = gen_reg_rtx (pred_mode);
19323 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19324 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19325 }
19326
19327 /* Emit the SVE equivalent of:
19328
19329 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19330 (set TARGET (not TMP))
19331
19332 where <X> is the operation associated with comparison CODE.
19333 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19334
19335 static void
19336 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19337 bool known_ptrue_p, rtx op0, rtx op1)
19338 {
19339 machine_mode pred_mode = GET_MODE (pred);
19340 rtx tmp = gen_reg_rtx (pred_mode);
19341 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19342 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19343 }
19344
19345 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19346
19347 (set TARGET (CODE OP0 OP1))
19348
19349 If CAN_INVERT_P is true, the caller can also handle inverted results;
19350 return true if the result is in fact inverted. */
19351
19352 bool
19353 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19354 rtx op0, rtx op1, bool can_invert_p)
19355 {
19356 machine_mode pred_mode = GET_MODE (target);
19357 machine_mode data_mode = GET_MODE (op0);
19358
19359 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19360 switch (code)
19361 {
19362 case UNORDERED:
19363 /* UNORDERED has no immediate form. */
19364 op1 = force_reg (data_mode, op1);
19365 /* fall through */
19366 case LT:
19367 case LE:
19368 case GT:
19369 case GE:
19370 case EQ:
19371 case NE:
19372 {
19373 /* There is native support for the comparison. */
19374 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19375 return false;
19376 }
19377
19378 case LTGT:
19379 /* This is a trapping operation (LT or GT). */
19380 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19381 return false;
19382
19383 case UNEQ:
19384 if (!flag_trapping_math)
19385 {
19386 /* This would trap for signaling NaNs. */
19387 op1 = force_reg (data_mode, op1);
19388 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19389 ptrue, true, op0, op1);
19390 return false;
19391 }
19392 /* fall through */
19393 case UNLT:
19394 case UNLE:
19395 case UNGT:
19396 case UNGE:
19397 if (flag_trapping_math)
19398 {
19399 /* Work out which elements are ordered. */
19400 rtx ordered = gen_reg_rtx (pred_mode);
19401 op1 = force_reg (data_mode, op1);
19402 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19403 ptrue, true, op0, op1);
19404
19405 /* Test the opposite condition for the ordered elements,
19406 then invert the result. */
19407 if (code == UNEQ)
19408 code = NE;
19409 else
19410 code = reverse_condition_maybe_unordered (code);
19411 if (can_invert_p)
19412 {
19413 aarch64_emit_sve_fp_cond (target, code,
19414 ordered, false, op0, op1);
19415 return true;
19416 }
19417 aarch64_emit_sve_invert_fp_cond (target, code,
19418 ordered, false, op0, op1);
19419 return false;
19420 }
19421 break;
19422
19423 case ORDERED:
19424 /* ORDERED has no immediate form. */
19425 op1 = force_reg (data_mode, op1);
19426 break;
19427
19428 default:
19429 gcc_unreachable ();
19430 }
19431
19432 /* There is native support for the inverse comparison. */
19433 code = reverse_condition_maybe_unordered (code);
19434 if (can_invert_p)
19435 {
19436 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19437 return true;
19438 }
19439 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19440 return false;
19441 }
19442
19443 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19444 of the data being selected and CMP_MODE is the mode of the values being
19445 compared. */
19446
19447 void
19448 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19449 rtx *ops)
19450 {
19451 machine_mode pred_mode
19452 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
19453 GET_MODE_SIZE (cmp_mode)).require ();
19454 rtx pred = gen_reg_rtx (pred_mode);
19455 if (FLOAT_MODE_P (cmp_mode))
19456 {
19457 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19458 ops[4], ops[5], true))
19459 std::swap (ops[1], ops[2]);
19460 }
19461 else
19462 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19463
19464 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19465 ops[1] = force_reg (data_mode, ops[1]);
19466 /* The "false" value can only be zero if the "true" value is a constant. */
19467 if (register_operand (ops[1], data_mode)
19468 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19469 ops[2] = force_reg (data_mode, ops[2]);
19470
19471 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
19472 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
19473 }
19474
19475 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19476 true. However, due to issues with register allocation it is preferable
19477 to avoid tying integer scalar and FP scalar modes. Executing integer
19478 operations in general registers is better than treating them as scalar
19479 vector operations. This reduces latency and avoids redundant int<->FP
19480 moves. So tie modes if they are either the same class, or vector modes
19481 with other vector modes, vector structs or any scalar mode. */
19482
19483 static bool
19484 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
19485 {
19486 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
19487 return true;
19488
19489 /* We specifically want to allow elements of "structure" modes to
19490 be tieable to the structure. This more general condition allows
19491 other rarer situations too. The reason we don't extend this to
19492 predicate modes is that there are no predicate structure modes
19493 nor any specific instructions for extracting part of a predicate
19494 register. */
19495 if (aarch64_vector_data_mode_p (mode1)
19496 && aarch64_vector_data_mode_p (mode2))
19497 return true;
19498
19499 /* Also allow any scalar modes with vectors. */
19500 if (aarch64_vector_mode_supported_p (mode1)
19501 || aarch64_vector_mode_supported_p (mode2))
19502 return true;
19503
19504 return false;
19505 }
19506
19507 /* Return a new RTX holding the result of moving POINTER forward by
19508 AMOUNT bytes. */
19509
19510 static rtx
19511 aarch64_move_pointer (rtx pointer, poly_int64 amount)
19512 {
19513 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
19514
19515 return adjust_automodify_address (pointer, GET_MODE (pointer),
19516 next, amount);
19517 }
19518
19519 /* Return a new RTX holding the result of moving POINTER forward by the
19520 size of the mode it points to. */
19521
19522 static rtx
19523 aarch64_progress_pointer (rtx pointer)
19524 {
19525 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
19526 }
19527
19528 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
19529 MODE bytes. */
19530
19531 static void
19532 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
19533 machine_mode mode)
19534 {
19535 rtx reg = gen_reg_rtx (mode);
19536
19537 /* "Cast" the pointers to the correct mode. */
19538 *src = adjust_address (*src, mode, 0);
19539 *dst = adjust_address (*dst, mode, 0);
19540 /* Emit the memcpy. */
19541 emit_move_insn (reg, *src);
19542 emit_move_insn (*dst, reg);
19543 /* Move the pointers forward. */
19544 *src = aarch64_progress_pointer (*src);
19545 *dst = aarch64_progress_pointer (*dst);
19546 }
19547
19548 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
19549 we succeed, otherwise return false. */
19550
19551 bool
19552 aarch64_expand_cpymem (rtx *operands)
19553 {
19554 int n, mode_bits;
19555 rtx dst = operands[0];
19556 rtx src = operands[1];
19557 rtx base;
19558 machine_mode cur_mode = BLKmode, next_mode;
19559 bool speed_p = !optimize_function_for_size_p (cfun);
19560
19561 /* When optimizing for size, give a better estimate of the length of a
19562 memcpy call, but use the default otherwise. Moves larger than 8 bytes
19563 will always require an even number of instructions, and each
19564 operation requires both a load and a store, so divide the max number by 2. */
19565 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
19566
19567 /* We can't do anything smart if the amount to copy is not constant. */
19568 if (!CONST_INT_P (operands[2]))
19569 return false;
19570
19571 n = INTVAL (operands[2]);
19572
19573 /* Try to keep the number of instructions low. For all cases we will do at
19574 most two moves for the residual amount, since we'll always overlap the
19575 remainder. */
19576 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
19577 return false;
19578
19579 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
19580 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
19581
19582 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
19583 src = adjust_automodify_address (src, VOIDmode, base, 0);
19584
19585 /* Convert n to bits to make the rest of the code simpler. */
19586 n = n * BITS_PER_UNIT;
19587
19588 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
19589 larger than TImode, but we should not use them for loads/stores here. */
19590 const int copy_limit = GET_MODE_BITSIZE (TImode);
19591
19592 while (n > 0)
19593 {
19594 /* Find the largest mode in which to do the copy without over-reading
19595 or over-writing. */
19596 opt_scalar_int_mode mode_iter;
19597 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
19598 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
19599 cur_mode = mode_iter.require ();
19600
19601 gcc_assert (cur_mode != BLKmode);
19602
19603 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
19604 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
19605
19606 n -= mode_bits;
19607
19608 /* Do certain trailing copies as overlapping if it's going to be
19609 cheaper, i.e. fewer instructions. For instance, for a 15-byte
19610 copy it is more efficient to do two overlapping 8-byte copies than
19611 8 + 6 + 1. */
19612 if (n > 0 && n <= 8 * BITS_PER_UNIT)
19613 {
19614 next_mode = smallest_mode_for_size (n, MODE_INT);
19615 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
19616 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
19617 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
19618 n = n_bits;
19619 }
19620 }
19621
19622 return true;
19623 }
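/* Editor's note: a standalone model (not GCC code) of the chunk selection
   above, tracking only byte counts; the mode sizes and the TImode copy limit
   are hard-coded assumptions here.  model_cpymem (15) prints an 8-byte copy
   at offset 0 followed by an overlapping 8-byte copy at offset 7, matching
   the comment above about 15-byte copies.  */
#if 0
#include <stdio.h>

static void
model_cpymem (int n)
{
  static const int sizes[] = { 16, 8, 4, 2, 1 };   /* TI, DI, SI, HI, QI.  */
  int offset = 0;
  while (n > 0)
    {
      /* Pick the largest chunk that does not over-read or over-write.  */
      int chunk = 1;
      for (unsigned int i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
	if (sizes[i] <= n)
	  {
	    chunk = sizes[i];
	    break;
	  }
      printf ("copy %d bytes at offset %d\n", chunk, offset);
      offset += chunk;
      n -= chunk;
      /* Overlap a short tail with the previous chunk, as done above for
	 residues of at most 8 bytes.  */
      if (n > 0 && n <= 8)
	{
	  int tail = 1;
	  while (tail < n)
	    tail *= 2;
	  offset -= tail - n;
	  n = tail;
	}
    }
}
#endif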
19624
19625 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
19626 SImode stores. Handle the case when the constant has identical
19627 bottom and top halves. This is beneficial when the two stores can be
19628 merged into an STP and we avoid synthesising potentially expensive
19629 immediates twice. Return true if such a split is possible. */
19630
19631 bool
19632 aarch64_split_dimode_const_store (rtx dst, rtx src)
19633 {
19634 rtx lo = gen_lowpart (SImode, src);
19635 rtx hi = gen_highpart_mode (SImode, DImode, src);
19636
19637 bool size_p = optimize_function_for_size_p (cfun);
19638
19639 if (!rtx_equal_p (lo, hi))
19640 return false;
19641
19642 unsigned int orig_cost
19643 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
19644 unsigned int lo_cost
19645 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
19646
19647 /* We want to transform:
19648 MOV x1, 49370
19649 MOVK x1, 0x140, lsl 16
19650 MOVK x1, 0xc0da, lsl 32
19651 MOVK x1, 0x140, lsl 48
19652 STR x1, [x0]
19653 into:
19654 MOV w1, 49370
19655 MOVK w1, 0x140, lsl 16
19656 STP w1, w1, [x0]
19657 So we want to perform this only when we save two instructions
19658 or more. When optimizing for size, however, accept any code size
19659 savings we can. */
19660 if (size_p && orig_cost <= lo_cost)
19661 return false;
19662
19663 if (!size_p
19664 && (orig_cost <= lo_cost + 1))
19665 return false;
19666
19667 rtx mem_lo = adjust_address (dst, SImode, 0);
19668 if (!aarch64_mem_pair_operand (mem_lo, SImode))
19669 return false;
19670
19671 rtx tmp_reg = gen_reg_rtx (SImode);
19672 aarch64_expand_mov_immediate (tmp_reg, lo);
19673 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
19674 /* Don't emit an explicit store pair as this may not be always profitable.
19675 Let the sched-fusion logic decide whether to merge them. */
19676 emit_move_insn (mem_lo, tmp_reg);
19677 emit_move_insn (mem_hi, tmp_reg);
19678
19679 return true;
19680 }
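/* Editor's note: a standalone sketch (not GCC code) of the precondition used
   above, phrased on a plain 64-bit value: the split is only attempted when
   the two 32-bit halves are identical, as in the 0x0140c0da0140c0da constant
   built by the MOV/MOVK sequence in the comment above.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
halves_identical (uint64_t x)
{
  return (uint32_t) x == (uint32_t) (x >> 32);
}
#endif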
19681
19682 /* Generate RTL for a conditional branch with rtx comparison CODE in
19683 mode CC_MODE. The destination of the unlikely conditional branch
19684 is LABEL_REF. */
19685
19686 void
19687 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
19688 rtx label_ref)
19689 {
19690 rtx x;
19691 x = gen_rtx_fmt_ee (code, VOIDmode,
19692 gen_rtx_REG (cc_mode, CC_REGNUM),
19693 const0_rtx);
19694
19695 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19696 gen_rtx_LABEL_REF (VOIDmode, label_ref),
19697 pc_rtx);
19698 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19699 }
19700
19701 /* Generate DImode scratch registers for 128-bit (TImode) addition.
19702
19703 OP1 represents the TImode destination operand 1
19704 OP2 represents the TImode destination operand 2
19705 LOW_DEST represents the low half (DImode) of TImode operand 0
19706 LOW_IN1 represents the low half (DImode) of TImode operand 1
19707 LOW_IN2 represents the low half (DImode) of TImode operand 2
19708 HIGH_DEST represents the high half (DImode) of TImode operand 0
19709 HIGH_IN1 represents the high half (DImode) of TImode operand 1
19710 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
19711
19712 void
19713 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
19714 rtx *low_in1, rtx *low_in2,
19715 rtx *high_dest, rtx *high_in1,
19716 rtx *high_in2)
19717 {
19718 *low_dest = gen_reg_rtx (DImode);
19719 *low_in1 = gen_lowpart (DImode, op1);
19720 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
19721 subreg_lowpart_offset (DImode, TImode));
19722 *high_dest = gen_reg_rtx (DImode);
19723 *high_in1 = gen_highpart (DImode, op1);
19724 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
19725 subreg_highpart_offset (DImode, TImode));
19726 }
19727
19728 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
19729
19730 This function differs from 'aarch64_addti_scratch_regs' in that
19731 OP1 can be an immediate constant (zero). We must call
19732 subreg_highpart_offset with DImode and TImode arguments, otherwise
19733 VOIDmode will be used for the const_int which generates an internal
19734 error from subreg_size_highpart_offset which does not expect a size of zero.
19735
19736 OP1 represents the TImode destination operand 1
19737 OP2 represents the TImode destination operand 2
19738 LOW_DEST represents the low half (DImode) of TImode operand 0
19739 LOW_IN1 represents the low half (DImode) of TImode operand 1
19740 LOW_IN2 represents the low half (DImode) of TImode operand 2
19741 HIGH_DEST represents the high half (DImode) of TImode operand 0
19742 HIGH_IN1 represents the high half (DImode) of TImode operand 1
19743 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
19744
19745
19746 void
19747 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
19748 rtx *low_in1, rtx *low_in2,
19749 rtx *high_dest, rtx *high_in1,
19750 rtx *high_in2)
19751 {
19752 *low_dest = gen_reg_rtx (DImode);
19753 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
19754 subreg_lowpart_offset (DImode, TImode));
19755
19756 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
19757 subreg_lowpart_offset (DImode, TImode));
19758 *high_dest = gen_reg_rtx (DImode);
19759
19760 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
19761 subreg_highpart_offset (DImode, TImode));
19762 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
19763 subreg_highpart_offset (DImode, TImode));
19764 }
19765
19766 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
19767
19768 OP0 represents the TImode destination operand 0
19769 LOW_DEST represents the low half (DImode) of TImode operand 0
19770 LOW_IN1 represents the low half (DImode) of TImode operand 1
19771 LOW_IN2 represents the low half (DImode) of TImode operand 2
19772 HIGH_DEST represents the high half (DImode) of TImode operand 0
19773 HIGH_IN1 represents the high half (DImode) of TImode operand 1
19774 HIGH_IN2 represents the high half (DImode) of TImode operand 2
19775 UNSIGNED_P is true if the operation is being performed on unsigned
19776 values. */
19777 void
19778 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
19779 rtx low_in2, rtx high_dest, rtx high_in1,
19780 rtx high_in2, bool unsigned_p)
19781 {
19782 if (low_in2 == const0_rtx)
19783 {
19784 low_dest = low_in1;
19785 high_in2 = force_reg (DImode, high_in2);
19786 if (unsigned_p)
19787 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
19788 else
19789 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
19790 }
19791 else
19792 {
19793 if (CONST_INT_P (low_in2))
19794 {
19795 high_in2 = force_reg (DImode, high_in2);
19796 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
19797 GEN_INT (-INTVAL (low_in2))));
19798 }
19799 else
19800 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
19801
19802 if (unsigned_p)
19803 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
19804 else
19805 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
19806 }
19807
19808 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
19809 emit_move_insn (gen_highpart (DImode, op0), high_dest);
19810
19811 }
19812
19813 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
19814
19815 static unsigned HOST_WIDE_INT
19816 aarch64_asan_shadow_offset (void)
19817 {
19818 if (TARGET_ILP32)
19819 return (HOST_WIDE_INT_1 << 29);
19820 else
19821 return (HOST_WIDE_INT_1 << 36);
19822 }
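/* Editor's note: a standalone sketch (not GCC code) of how ASan uses the
   offset returned above with its usual scale of 3 (one shadow byte per eight
   application bytes).  The exact mapping belongs to libsanitizer, so treat
   this as an illustration only.  */
#if 0
#include <stdint.h>

static uintptr_t
asan_shadow_address (uintptr_t addr, uintptr_t shadow_offset)
{
  /* shadow_offset is 1 << 36 for LP64 and 1 << 29 for ILP32 above.  */
  return (addr >> 3) + shadow_offset;
}
#endif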
19823
19824 static rtx
19825 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
19826 int code, tree treeop0, tree treeop1)
19827 {
19828 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19829 rtx op0, op1;
19830 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19831 insn_code icode;
19832 struct expand_operand ops[4];
19833
19834 start_sequence ();
19835 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19836
19837 op_mode = GET_MODE (op0);
19838 if (op_mode == VOIDmode)
19839 op_mode = GET_MODE (op1);
19840
19841 switch (op_mode)
19842 {
19843 case E_QImode:
19844 case E_HImode:
19845 case E_SImode:
19846 cmp_mode = SImode;
19847 icode = CODE_FOR_cmpsi;
19848 break;
19849
19850 case E_DImode:
19851 cmp_mode = DImode;
19852 icode = CODE_FOR_cmpdi;
19853 break;
19854
19855 case E_SFmode:
19856 cmp_mode = SFmode;
19857 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
19858 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
19859 break;
19860
19861 case E_DFmode:
19862 cmp_mode = DFmode;
19863 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
19864 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
19865 break;
19866
19867 default:
19868 end_sequence ();
19869 return NULL_RTX;
19870 }
19871
19872 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
19873 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
19874 if (!op0 || !op1)
19875 {
19876 end_sequence ();
19877 return NULL_RTX;
19878 }
19879 *prep_seq = get_insns ();
19880 end_sequence ();
19881
19882 create_fixed_operand (&ops[0], op0);
19883 create_fixed_operand (&ops[1], op1);
19884
19885 start_sequence ();
19886 if (!maybe_expand_insn (icode, 2, ops))
19887 {
19888 end_sequence ();
19889 return NULL_RTX;
19890 }
19891 *gen_seq = get_insns ();
19892 end_sequence ();
19893
19894 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
19895 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
19896 }
19897
19898 static rtx
19899 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
19900 int cmp_code, tree treeop0, tree treeop1, int bit_code)
19901 {
19902 rtx op0, op1, target;
19903 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19904 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19905 insn_code icode;
19906 struct expand_operand ops[6];
19907 int aarch64_cond;
19908
19909 push_to_sequence (*prep_seq);
19910 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19911
19912 op_mode = GET_MODE (op0);
19913 if (op_mode == VOIDmode)
19914 op_mode = GET_MODE (op1);
19915
19916 switch (op_mode)
19917 {
19918 case E_QImode:
19919 case E_HImode:
19920 case E_SImode:
19921 cmp_mode = SImode;
19922 icode = CODE_FOR_ccmpsi;
19923 break;
19924
19925 case E_DImode:
19926 cmp_mode = DImode;
19927 icode = CODE_FOR_ccmpdi;
19928 break;
19929
19930 case E_SFmode:
19931 cmp_mode = SFmode;
19932 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19933 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
19934 break;
19935
19936 case E_DFmode:
19937 cmp_mode = DFmode;
19938 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19939 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
19940 break;
19941
19942 default:
19943 end_sequence ();
19944 return NULL_RTX;
19945 }
19946
19947 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
19948 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
19949 if (!op0 || !op1)
19950 {
19951 end_sequence ();
19952 return NULL_RTX;
19953 }
19954 *prep_seq = get_insns ();
19955 end_sequence ();
19956
19957 target = gen_rtx_REG (cc_mode, CC_REGNUM);
19958 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
19959
19960 if (bit_code != AND)
19961 {
19962 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
19963 GET_MODE (XEXP (prev, 0))),
19964 VOIDmode, XEXP (prev, 0), const0_rtx);
19965 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
19966 }
19967
19968 create_fixed_operand (&ops[0], XEXP (prev, 0));
19969 create_fixed_operand (&ops[1], target);
19970 create_fixed_operand (&ops[2], op0);
19971 create_fixed_operand (&ops[3], op1);
19972 create_fixed_operand (&ops[4], prev);
19973 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
19974
19975 push_to_sequence (*gen_seq);
19976 if (!maybe_expand_insn (icode, 6, ops))
19977 {
19978 end_sequence ();
19979 return NULL_RTX;
19980 }
19981
19982 *gen_seq = get_insns ();
19983 end_sequence ();
19984
19985 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
19986 }
19987
19988 #undef TARGET_GEN_CCMP_FIRST
19989 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19990
19991 #undef TARGET_GEN_CCMP_NEXT
19992 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19993
19994 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19995 instruction fusion of some sort. */
19996
19997 static bool
19998 aarch64_macro_fusion_p (void)
19999 {
20000 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20001 }
20002
20003
20004 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20005 should be kept together during scheduling. */
20006
20007 static bool
20008 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20009 {
20010 rtx set_dest;
20011 rtx prev_set = single_set (prev);
20012 rtx curr_set = single_set (curr);
20013 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
20014 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20015
20016 if (!aarch64_macro_fusion_p ())
20017 return false;
20018
20019 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20020 {
20021 /* We are trying to match:
20022 prev (mov) == (set (reg r0) (const_int imm16))
20023 curr (movk) == (set (zero_extract (reg r0)
20024 (const_int 16)
20025 (const_int 16))
20026 (const_int imm16_1)) */
20027
20028 set_dest = SET_DEST (curr_set);
20029
20030 if (GET_CODE (set_dest) == ZERO_EXTRACT
20031 && CONST_INT_P (SET_SRC (curr_set))
20032 && CONST_INT_P (SET_SRC (prev_set))
20033 && CONST_INT_P (XEXP (set_dest, 2))
20034 && INTVAL (XEXP (set_dest, 2)) == 16
20035 && REG_P (XEXP (set_dest, 0))
20036 && REG_P (SET_DEST (prev_set))
20037 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20038 {
20039 return true;
20040 }
20041 }
20042
20043 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20044 {
20045
20046 /* We're trying to match:
20047 prev (adrp) == (set (reg r1)
20048 (high (symbol_ref ("SYM"))))
20049 curr (add) == (set (reg r0)
20050 (lo_sum (reg r1)
20051 (symbol_ref ("SYM"))))
20052 Note that r0 need not necessarily be the same as r1, especially
20053 during pre-regalloc scheduling. */
20054
20055 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20056 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20057 {
20058 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20059 && REG_P (XEXP (SET_SRC (curr_set), 0))
20060 && REGNO (XEXP (SET_SRC (curr_set), 0))
20061 == REGNO (SET_DEST (prev_set))
20062 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20063 XEXP (SET_SRC (curr_set), 1)))
20064 return true;
20065 }
20066 }
20067
20068 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20069 {
20070
20071 /* We're trying to match:
20072 prev (movk) == (set (zero_extract (reg r0)
20073 (const_int 16)
20074 (const_int 32))
20075 (const_int imm16_1))
20076 curr (movk) == (set (zero_extract (reg r0)
20077 (const_int 16)
20078 (const_int 48))
20079 (const_int imm16_2)) */
20080
20081 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20082 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20083 && REG_P (XEXP (SET_DEST (prev_set), 0))
20084 && REG_P (XEXP (SET_DEST (curr_set), 0))
20085 && REGNO (XEXP (SET_DEST (prev_set), 0))
20086 == REGNO (XEXP (SET_DEST (curr_set), 0))
20087 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20088 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20089 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20090 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20091 && CONST_INT_P (SET_SRC (prev_set))
20092 && CONST_INT_P (SET_SRC (curr_set)))
20093 return true;
20094
20095 }
20096 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20097 {
20098 /* We're trying to match:
20099 prev (adrp) == (set (reg r0)
20100 (high (symbol_ref ("SYM"))))
20101 curr (ldr) == (set (reg r1)
20102 (mem (lo_sum (reg r0)
20103 (symbol_ref ("SYM")))))
20104 or
20105 curr (ldr) == (set (reg r1)
20106 (zero_extend (mem
20107 (lo_sum (reg r0)
20108 (symbol_ref ("SYM")))))) */
20109 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20110 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20111 {
20112 rtx curr_src = SET_SRC (curr_set);
20113
20114 if (GET_CODE (curr_src) == ZERO_EXTEND)
20115 curr_src = XEXP (curr_src, 0);
20116
20117 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20118 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20119 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20120 == REGNO (SET_DEST (prev_set))
20121 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20122 XEXP (SET_SRC (prev_set), 0)))
20123 return true;
20124 }
20125 }
20126
20127 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20128 && any_condjump_p (curr))
20129 {
20130 unsigned int condreg1, condreg2;
20131 rtx cc_reg_1;
20132 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20133 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20134
20135 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20136 && prev
20137 && modified_in_p (cc_reg_1, prev))
20138 {
20139 enum attr_type prev_type = get_attr_type (prev);
20140
20141 /* FIXME: this misses some instructions that ThunderX considers
20142 simple arithmetic; simple shifts, for example, are missed here. */
20143 if (prev_type == TYPE_ALUS_SREG
20144 || prev_type == TYPE_ALUS_IMM
20145 || prev_type == TYPE_LOGICS_REG
20146 || prev_type == TYPE_LOGICS_IMM)
20147 return true;
20148 }
20149 }
20150
20151 if (prev_set
20152 && curr_set
20153 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20154 && any_condjump_p (curr))
20155 {
20156 /* We're trying to match:
20157 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20158 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20159 (const_int 0))
20160 (label_ref ("SYM"))
20161 (pc)) */
20162 if (SET_DEST (curr_set) == (pc_rtx)
20163 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20164 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20165 && REG_P (SET_DEST (prev_set))
20166 && REGNO (SET_DEST (prev_set))
20167 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20168 {
20169 /* Fuse ALU operations followed by conditional branch instruction. */
20170 switch (get_attr_type (prev))
20171 {
20172 case TYPE_ALU_IMM:
20173 case TYPE_ALU_SREG:
20174 case TYPE_ADC_REG:
20175 case TYPE_ADC_IMM:
20176 case TYPE_ADCS_REG:
20177 case TYPE_ADCS_IMM:
20178 case TYPE_LOGIC_REG:
20179 case TYPE_LOGIC_IMM:
20180 case TYPE_CSEL:
20181 case TYPE_ADR:
20182 case TYPE_MOV_IMM:
20183 case TYPE_SHIFT_REG:
20184 case TYPE_SHIFT_IMM:
20185 case TYPE_BFM:
20186 case TYPE_RBIT:
20187 case TYPE_REV:
20188 case TYPE_EXTEND:
20189 return true;
20190
20191 default:;
20192 }
20193 }
20194 }
20195
20196 return false;
20197 }
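/* Editor's note: illustrative AArch64 assembly (not taken from this file) of
   the kinds of pairs the checks above try to keep adjacent:

     mov   x0, #0xc0da
     movk  x0, #0x140, lsl #16        MOV followed by MOVK

     adrp  x1, sym
     add   x0, x1, :lo12:sym          ADRP followed by ADD

     adrp  x1, sym
     ldr   x0, [x1, :lo12:sym]        ADRP followed by LDR  */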
20198
20199 /* Return true iff the instruction fusion described by OP is enabled. */
20200
20201 bool
20202 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20203 {
20204 return (aarch64_tune_params.fusible_ops & op) != 0;
20205 }
20206
20207 /* If MEM is in the form of [base+offset], extract the two parts
20208 of the address into BASE and OFFSET; otherwise return false
20209 after clearing BASE and OFFSET. */
20210
20211 bool
20212 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20213 {
20214 rtx addr;
20215
20216 gcc_assert (MEM_P (mem));
20217
20218 addr = XEXP (mem, 0);
20219
20220 if (REG_P (addr))
20221 {
20222 *base = addr;
20223 *offset = const0_rtx;
20224 return true;
20225 }
20226
20227 if (GET_CODE (addr) == PLUS
20228 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20229 {
20230 *base = XEXP (addr, 0);
20231 *offset = XEXP (addr, 1);
20232 return true;
20233 }
20234
20235 *base = NULL_RTX;
20236 *offset = NULL_RTX;
20237
20238 return false;
20239 }
20240
20241 /* Types for scheduling fusion. */
20242 enum sched_fusion_type
20243 {
20244 SCHED_FUSION_NONE = 0,
20245 SCHED_FUSION_LD_SIGN_EXTEND,
20246 SCHED_FUSION_LD_ZERO_EXTEND,
20247 SCHED_FUSION_LD,
20248 SCHED_FUSION_ST,
20249 SCHED_FUSION_NUM
20250 };
20251
20252 /* If INSN is a load or store with an address in the form of [base+offset],
20253 extract the two parts into BASE and OFFSET. Return the scheduling
20254 fusion type of this INSN. */
20255
20256 static enum sched_fusion_type
20257 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20258 {
20259 rtx x, dest, src;
20260 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20261
20262 gcc_assert (INSN_P (insn));
20263 x = PATTERN (insn);
20264 if (GET_CODE (x) != SET)
20265 return SCHED_FUSION_NONE;
20266
20267 src = SET_SRC (x);
20268 dest = SET_DEST (x);
20269
20270 machine_mode dest_mode = GET_MODE (dest);
20271
20272 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20273 return SCHED_FUSION_NONE;
20274
20275 if (GET_CODE (src) == SIGN_EXTEND)
20276 {
20277 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20278 src = XEXP (src, 0);
20279 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20280 return SCHED_FUSION_NONE;
20281 }
20282 else if (GET_CODE (src) == ZERO_EXTEND)
20283 {
20284 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20285 src = XEXP (src, 0);
20286 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20287 return SCHED_FUSION_NONE;
20288 }
20289
20290 if (GET_CODE (src) == MEM && REG_P (dest))
20291 extract_base_offset_in_addr (src, base, offset);
20292 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20293 {
20294 fusion = SCHED_FUSION_ST;
20295 extract_base_offset_in_addr (dest, base, offset);
20296 }
20297 else
20298 return SCHED_FUSION_NONE;
20299
20300 if (*base == NULL_RTX || *offset == NULL_RTX)
20301 fusion = SCHED_FUSION_NONE;
20302
20303 return fusion;
20304 }
20305
20306 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20307
20308 Currently we only support fusing ldr and str instructions, so FUSION_PRI
20309 and PRI are only calculated for these instructions. For other instructions,
20310 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20311 types of instruction fusion can be added by returning different priorities.
20312
20313 It's important that irrelevant instructions get the largest FUSION_PRI. */
20314
20315 static void
20316 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20317 int *fusion_pri, int *pri)
20318 {
20319 int tmp, off_val;
20320 rtx base, offset;
20321 enum sched_fusion_type fusion;
20322
20323 gcc_assert (INSN_P (insn));
20324
20325 tmp = max_pri - 1;
20326 fusion = fusion_load_store (insn, &base, &offset);
20327 if (fusion == SCHED_FUSION_NONE)
20328 {
20329 *pri = tmp;
20330 *fusion_pri = tmp;
20331 return;
20332 }
20333
20334 /* Set FUSION_PRI according to fusion type and base register. */
20335 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20336
20337 /* Calculate PRI. */
20338 tmp /= 2;
20339
20340 /* INSN with smaller offset goes first. */
20341 off_val = (int)(INTVAL (offset));
20342 if (off_val >= 0)
20343 tmp -= (off_val & 0xfffff);
20344 else
20345 tmp += ((- off_val) & 0xfffff);
20346
20347 *pri = tmp;
20348 return;
20349 }
20350
20351 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20352 Adjust priority of sha1h instructions so they are scheduled before
20353 other SHA1 instructions. */
20354
20355 static int
20356 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20357 {
20358 rtx x = PATTERN (insn);
20359
20360 if (GET_CODE (x) == SET)
20361 {
20362 x = SET_SRC (x);
20363
20364 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20365 return priority + 10;
20366 }
20367
20368 return priority;
20369 }
20370
20371 /* Given OPERANDS of consecutive load/store, check if we can merge
20372 them into ldp/stp. LOAD is true if they are load instructions.
20373 MODE is the mode of memory operands. */
20374
20375 bool
20376 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20377 machine_mode mode)
20378 {
20379 HOST_WIDE_INT offval_1, offval_2, msize;
20380 enum reg_class rclass_1, rclass_2;
20381 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20382
20383 if (load)
20384 {
20385 mem_1 = operands[1];
20386 mem_2 = operands[3];
20387 reg_1 = operands[0];
20388 reg_2 = operands[2];
20389 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20390 if (REGNO (reg_1) == REGNO (reg_2))
20391 return false;
20392 }
20393 else
20394 {
20395 mem_1 = operands[0];
20396 mem_2 = operands[2];
20397 reg_1 = operands[1];
20398 reg_2 = operands[3];
20399 }
20400
20401 /* The mems cannot be volatile. */
20402 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20403 return false;
20404
20405 /* If we have SImode and slow unaligned ldp,
20406 check that the alignment is at least 8 bytes. */
20407 if (mode == SImode
20408 && (aarch64_tune_params.extra_tuning_flags
20409 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20410 && !optimize_size
20411 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20412 return false;
20413
20414 /* Check if the addresses are in the form of [base+offset]. */
20415 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20416 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20417 return false;
20418 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20419 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20420 return false;
20421
20422 /* Check if the bases are same. */
20423 if (!rtx_equal_p (base_1, base_2))
20424 return false;
20425
20426 /* The operands must be of the same size. */
20427 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20428 GET_MODE_SIZE (GET_MODE (mem_2))));
20429
20430 offval_1 = INTVAL (offset_1);
20431 offval_2 = INTVAL (offset_2);
20432 /* We should only be trying this for fixed-sized modes. There is no
20433 SVE LDP/STP instruction. */
20434 msize = GET_MODE_SIZE (mode).to_constant ();
20435 /* Check if the offsets are consecutive. */
20436 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20437 return false;
20438
20439 /* Check if the addresses are clobbered by load. */
20440 if (load)
20441 {
20442 if (reg_mentioned_p (reg_1, mem_1))
20443 return false;
20444
20445 /* In increasing order, the last load can clobber the address. */
20446 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20447 return false;
20448 }
20449
20450 /* One of the memory accesses must be a mempair operand.
20451 If it is not the first one, they need to be swapped by the
20452 peephole. */
20453 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
20454 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
20455 return false;
20456
20457 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
20458 rclass_1 = FP_REGS;
20459 else
20460 rclass_1 = GENERAL_REGS;
20461
20462 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
20463 rclass_2 = FP_REGS;
20464 else
20465 rclass_2 = GENERAL_REGS;
20466
20467 /* Check if the registers are of same class. */
20468 if (rclass_1 != rclass_2)
20469 return false;
20470
20471 return true;
20472 }
20473
20474 /* Given OPERANDS of consecutive load/store that can be merged,
20475 swap them if they are not in ascending order. */
20476 void
20477 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
20478 {
20479 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
20480 HOST_WIDE_INT offval_1, offval_2;
20481
20482 if (load)
20483 {
20484 mem_1 = operands[1];
20485 mem_2 = operands[3];
20486 }
20487 else
20488 {
20489 mem_1 = operands[0];
20490 mem_2 = operands[2];
20491 }
20492
20493 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20494 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20495
20496 offval_1 = INTVAL (offset_1);
20497 offval_2 = INTVAL (offset_2);
20498
20499 if (offval_1 > offval_2)
20500 {
20501 /* Irrespective of whether this is a load or a store,
20502 we do the same swap. */
20503 std::swap (operands[0], operands[2]);
20504 std::swap (operands[1], operands[3]);
20505 }
20506 }
20507
20508 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20509 comparison between the two. */
20510 int
20511 aarch64_host_wide_int_compare (const void *x, const void *y)
20512 {
20513 return wi::cmps (* ((const HOST_WIDE_INT *) x),
20514 * ((const HOST_WIDE_INT *) y));
20515 }
20516
20517 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20518 other pointing to a REG rtx containing an offset, compare the offsets
20519 of the two pairs.
20520
20521 Return:
20522
20523 1 iff offset (X) > offset (Y)
20524 0 iff offset (X) == offset (Y)
20525 -1 iff offset (X) < offset (Y) */
20526 int
20527 aarch64_ldrstr_offset_compare (const void *x, const void *y)
20528 {
20529 const rtx * operands_1 = (const rtx *) x;
20530 const rtx * operands_2 = (const rtx *) y;
20531 rtx mem_1, mem_2, base, offset_1, offset_2;
20532
20533 if (MEM_P (operands_1[0]))
20534 mem_1 = operands_1[0];
20535 else
20536 mem_1 = operands_1[1];
20537
20538 if (MEM_P (operands_2[0]))
20539 mem_2 = operands_2[0];
20540 else
20541 mem_2 = operands_2[1];
20542
20543 /* Extract the offsets. */
20544 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20545 extract_base_offset_in_addr (mem_2, &base, &offset_2);
20546
20547 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
20548
20549 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
20550 }
20551
20552 /* Given OPERANDS of consecutive load/store, check if we can merge
20553 them into ldp/stp by adjusting the offset. LOAD is true if they
20554 are load instructions. MODE is the mode of memory operands.
20555
20556 Given below consecutive stores:
20557
20558 str w1, [xb, 0x100]
20559 str w1, [xb, 0x104]
20560 str w1, [xb, 0x108]
20561 str w1, [xb, 0x10c]
20562
20563 Though the offsets are out of the range supported by stp, we can
20564 still pair them after adjusting the offset, like:
20565
20566 add scratch, xb, 0x100
20567 stp w1, w1, [scratch]
20568 stp w1, w1, [scratch, 0x8]
20569
20570 The peephole patterns detecting this opportunity should guarantee
20571 the scratch register is available. */
20572
20573 bool
20574 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
20575 scalar_mode mode)
20576 {
20577 const int num_insns = 4;
20578 enum reg_class rclass;
20579 HOST_WIDE_INT offvals[num_insns], msize;
20580 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
20581
20582 if (load)
20583 {
20584 for (int i = 0; i < num_insns; i++)
20585 {
20586 reg[i] = operands[2 * i];
20587 mem[i] = operands[2 * i + 1];
20588
20589 gcc_assert (REG_P (reg[i]));
20590 }
20591
20592 /* Do not attempt to merge the loads if the loads clobber each other. */
20593 for (int i = 0; i < 8; i += 2)
20594 for (int j = i + 2; j < 8; j += 2)
20595 if (reg_overlap_mentioned_p (operands[i], operands[j]))
20596 return false;
20597 }
20598 else
20599 for (int i = 0; i < num_insns; i++)
20600 {
20601 mem[i] = operands[2 * i];
20602 reg[i] = operands[2 * i + 1];
20603 }
20604
20605 /* Skip if memory operand is by itself valid for ldp/stp. */
20606 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
20607 return false;
20608
20609 for (int i = 0; i < num_insns; i++)
20610 {
20611 /* The mems cannot be volatile. */
20612 if (MEM_VOLATILE_P (mem[i]))
20613 return false;
20614
20615 /* Check if the addresses are in the form of [base+offset]. */
20616 extract_base_offset_in_addr (mem[i], base + i, offset + i);
20617 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
20618 return false;
20619 }
20620
20621 /* Check if the registers are of same class. */
20622 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
20623 ? FP_REGS : GENERAL_REGS;
20624
20625 for (int i = 1; i < num_insns; i++)
20626 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
20627 {
20628 if (rclass != FP_REGS)
20629 return false;
20630 }
20631 else
20632 {
20633 if (rclass != GENERAL_REGS)
20634 return false;
20635 }
20636
20637 /* Only the last register in the order in which they occur
20638 may be clobbered by the load. */
20639 if (rclass == GENERAL_REGS && load)
20640 for (int i = 0; i < num_insns - 1; i++)
20641 if (reg_mentioned_p (reg[i], mem[i]))
20642 return false;
20643
20644 /* Check if the bases are same. */
20645 for (int i = 0; i < num_insns - 1; i++)
20646 if (!rtx_equal_p (base[i], base[i + 1]))
20647 return false;
20648
20649 for (int i = 0; i < num_insns; i++)
20650 offvals[i] = INTVAL (offset[i]);
20651
20652 msize = GET_MODE_SIZE (mode);
20653
20654 /* Check if the offsets can be put in the right order to do a ldp/stp. */
20655 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
20656 aarch64_host_wide_int_compare);
20657
20658 if (!(offvals[1] == offvals[0] + msize
20659 && offvals[3] == offvals[2] + msize))
20660 return false;
20661
20662 /* Check that offsets are within range of each other. The ldp/stp
20663 instructions have 7 bit immediate offsets, so use 0x80. */
20664 if (offvals[2] - offvals[0] >= msize * 0x80)
20665 return false;
20666
20667 /* The offsets must be aligned with respect to each other. */
20668 if (offvals[0] % msize != offvals[2] % msize)
20669 return false;
20670
20671 /* If we have SImode and slow unaligned ldp,
20672 check that the alignment is at least 8 bytes. */
20673 if (mode == SImode
20674 && (aarch64_tune_params.extra_tuning_flags
20675 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20676 && !optimize_size
20677 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
20678 return false;
20679
20680 return true;
20681 }
20682
20683 /* Given OPERANDS of consecutive load/store, this function pairs them
20684 into LDP/STP after adjusting the offset. It depends on the fact
20685 that the operands can be sorted so the offsets are correct for STP.
20686 MODE is the mode of memory operands. CODE is the rtl operator
20687 which should be applied to all memory operands: SIGN_EXTEND,
20688 ZERO_EXTEND or UNKNOWN. */
20689
20690 bool
20691 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
20692 scalar_mode mode, RTX_CODE code)
20693 {
20694 rtx base, offset_1, offset_3, t1, t2;
20695 rtx mem_1, mem_2, mem_3, mem_4;
20696 rtx temp_operands[8];
20697 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
20698 stp_off_upper_limit, stp_off_lower_limit, msize;
20699
20700 /* We make changes on a copy as we may still bail out. */
20701 for (int i = 0; i < 8; i ++)
20702 temp_operands[i] = operands[i];
20703
20704 /* Sort the operands. */
20705 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
20706
20707 /* Copy the memory operands so that if we have to bail for some
20708 reason the original addresses are unchanged. */
20709 if (load)
20710 {
20711 mem_1 = copy_rtx (temp_operands[1]);
20712 mem_2 = copy_rtx (temp_operands[3]);
20713 mem_3 = copy_rtx (temp_operands[5]);
20714 mem_4 = copy_rtx (temp_operands[7]);
20715 }
20716 else
20717 {
20718 mem_1 = copy_rtx (temp_operands[0]);
20719 mem_2 = copy_rtx (temp_operands[2]);
20720 mem_3 = copy_rtx (temp_operands[4]);
20721 mem_4 = copy_rtx (temp_operands[6]);
20722 gcc_assert (code == UNKNOWN);
20723 }
20724
20725 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20726 extract_base_offset_in_addr (mem_3, &base, &offset_3);
20727 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
20728 && offset_3 != NULL_RTX);
20729
20730 /* Adjust offset so it can fit in LDP/STP instruction. */
20731 msize = GET_MODE_SIZE (mode);
20732 stp_off_upper_limit = msize * (0x40 - 1);
20733 stp_off_lower_limit = - msize * 0x40;
20734
20735 off_val_1 = INTVAL (offset_1);
20736 off_val_3 = INTVAL (offset_3);
20737
20738 /* The base offset is optimally half way between the two STP/LDP offsets. */
20739 if (msize <= 4)
20740 base_off = (off_val_1 + off_val_3) / 2;
20741 else
20742 /* However, due to issues with negative LDP/STP offset generation for
20743 larger modes (DF, DI and vector modes), we must not use negative
20744 addresses beyond what 9 signed unadjusted bits can store. This
20745 provides the most range in this case. */
20746 base_off = off_val_1;
20747
20748 /* Adjust the base so that it is aligned with the addresses but still
20749 optimal. */
20750 if (base_off % msize != off_val_1 % msize)
20751 /* Fix the offset, bearing in mind we want to make it bigger not
20752 smaller. */
20753 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
20754 else if (msize <= 4)
20755 /* The negative range of LDP/STP is one larger than the positive range. */
20756 base_off += msize;
20757
20758 /* Check if base offset is too big or too small. We can attempt to resolve
20759 this issue by setting it to the maximum value and seeing if the offsets
20760 still fit. */
20761 if (base_off >= 0x1000)
20762 {
20763 base_off = 0x1000 - 1;
20764 /* We must still make sure that the base offset is aligned with respect
20765 to the address. But it may not be made any bigger. */
20766 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
20767 }
20768
20769 /* Likewise for the case where the base is too small. */
20770 if (base_off <= -0x1000)
20771 {
20772 base_off = -0x1000 + 1;
20773 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
20774 }
20775
20776 /* Offset of the first STP/LDP. */
20777 new_off_1 = off_val_1 - base_off;
20778
20779 /* Offset of the second STP/LDP. */
20780 new_off_3 = off_val_3 - base_off;
20781
20782 /* The offsets must be within the range of the LDP/STP instructions. */
20783 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
20784 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
20785 return false;
20786
20787 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
20788 new_off_1), true);
20789 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
20790 new_off_1 + msize), true);
20791 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
20792 new_off_3), true);
20793 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
20794 new_off_3 + msize), true);
20795
20796 if (!aarch64_mem_pair_operand (mem_1, mode)
20797 || !aarch64_mem_pair_operand (mem_3, mode))
20798 return false;
20799
20800 if (code == ZERO_EXTEND)
20801 {
20802 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
20803 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
20804 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
20805 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
20806 }
20807 else if (code == SIGN_EXTEND)
20808 {
20809 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
20810 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
20811 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
20812 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
20813 }
20814
20815 if (load)
20816 {
20817 operands[0] = temp_operands[0];
20818 operands[1] = mem_1;
20819 operands[2] = temp_operands[2];
20820 operands[3] = mem_2;
20821 operands[4] = temp_operands[4];
20822 operands[5] = mem_3;
20823 operands[6] = temp_operands[6];
20824 operands[7] = mem_4;
20825 }
20826 else
20827 {
20828 operands[0] = mem_1;
20829 operands[1] = temp_operands[1];
20830 operands[2] = mem_2;
20831 operands[3] = temp_operands[3];
20832 operands[4] = mem_3;
20833 operands[5] = temp_operands[5];
20834 operands[6] = mem_4;
20835 operands[7] = temp_operands[7];
20836 }
20837
20838 /* Emit adjusting instruction. */
20839 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
20840 /* Emit ldp/stp instructions. */
20841 t1 = gen_rtx_SET (operands[0], operands[1]);
20842 t2 = gen_rtx_SET (operands[2], operands[3]);
20843 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
20844 t1 = gen_rtx_SET (operands[4], operands[5]);
20845 t2 = gen_rtx_SET (operands[6], operands[7]);
20846 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
20847 return true;
20848 }
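/* Editor's note: a standalone sketch (not GCC code) of the base-offset
   arithmetic above for small modes (msize <= 4), ignoring the out-of-range
   clamping.  For the 0x100..0x10c stores in the earlier comment it picks a
   base offset of 0x108, giving pair offsets -8 and 0, both within LDP/STP
   range.  */
#if 0
#include <stdio.h>

static void
rebase_offsets (long off1, long off3, long msize)
{
  /* Aim halfway between the two pair offsets, then align it.  */
  long base_off = (off1 + off3) / 2;
  if (base_off % msize != off1 % msize)
    base_off += (((base_off % msize) - (off1 % msize)) + msize) % msize;
  else
    base_off += msize;
  printf ("add scratch, base, #%ld\n", base_off);
  printf ("first pair offset %ld, second pair offset %ld\n",
	  off1 - base_off, off3 - base_off);
}
#endif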
20849
20850 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
20851 it isn't worth branching around empty masked ops (including masked
20852 stores). */
20853
20854 static bool
20855 aarch64_empty_mask_is_expensive (unsigned)
20856 {
20857 return false;
20858 }
20859
20860 /* Return true if a pseudo register should be created and used to hold
20861 the GOT address for PIC code. */
20862
20863 bool
20864 aarch64_use_pseudo_pic_reg (void)
20865 {
20866 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
20867 }
20868
20869 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
20870
20871 static int
20872 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
20873 {
20874 switch (XINT (x, 1))
20875 {
20876 case UNSPEC_GOTSMALLPIC:
20877 case UNSPEC_GOTSMALLPIC28K:
20878 case UNSPEC_GOTTINYPIC:
20879 return 0;
20880 default:
20881 break;
20882 }
20883
20884 return default_unspec_may_trap_p (x, flags);
20885 }
20886
20887
20888 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
20889 return the log2 of that value. Otherwise return -1. */
20890
20891 int
20892 aarch64_fpconst_pow_of_2 (rtx x)
20893 {
20894 const REAL_VALUE_TYPE *r;
20895
20896 if (!CONST_DOUBLE_P (x))
20897 return -1;
20898
20899 r = CONST_DOUBLE_REAL_VALUE (x);
20900
20901 if (REAL_VALUE_NEGATIVE (*r)
20902 || REAL_VALUE_ISNAN (*r)
20903 || REAL_VALUE_ISINF (*r)
20904 || !real_isinteger (r, DFmode))
20905 return -1;
20906
20907 return exact_log2 (real_to_integer (r));
20908 }
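/* Editor's note: a standalone sketch (not GCC code) of the same
   classification on a plain double: return log2 (X) when X is a positive
   integral power of two, otherwise -1.  The real function above works on
   CONST_DOUBLE rtxes and REAL_VALUE_TYPEs instead, so behaviour at the
   extremes may differ.  */
#if 0
#include <math.h>

static int
double_pow2_log2 (double x)
{
  int exp;
  /* Powers of two are exactly the finite values whose frexp mantissa
     is 0.5; also require X to be a positive integer.  */
  if (!isfinite (x) || !(x > 0) || x != floor (x) || frexp (x, &exp) != 0.5)
    return -1;
  return exp - 1;
}
#endif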
20909
20910 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
20911 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
20912 X == 1/2^n return N. Otherwise return -1. */
20913
20914 int
20915 aarch64_fpconst_pow2_recip (rtx x)
20916 {
20917 REAL_VALUE_TYPE r0;
20918
20919 if (!CONST_DOUBLE_P (x))
20920 return -1;
20921
20922 r0 = *CONST_DOUBLE_REAL_VALUE (x);
20923 if (exact_real_inverse (DFmode, &r0)
20924 && !REAL_VALUE_NEGATIVE (r0))
20925 {
20926 int ret = exact_log2 (real_to_integer (&r0));
20927 if (ret >= 1 && ret <= 32)
20928 return ret;
20929 }
20930 return -1;
20931 }
20932
20933 /* If X is a vector of equal CONST_DOUBLE values and that value is
20934 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20935
20936 int
20937 aarch64_vec_fpconst_pow_of_2 (rtx x)
20938 {
20939 int nelts;
20940 if (GET_CODE (x) != CONST_VECTOR
20941 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
20942 return -1;
20943
20944 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
20945 return -1;
20946
20947 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
20948 if (firstval <= 0)
20949 return -1;
20950
20951 for (int i = 1; i < nelts; i++)
20952 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
20953 return -1;
20954
20955 return firstval;
20956 }
20957
20958 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20959 to float.
20960
20961 __fp16 always promotes through this hook.
20962 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20963 through the generic excess precision logic rather than here. */
20964
20965 static tree
20966 aarch64_promoted_type (const_tree t)
20967 {
20968 if (SCALAR_FLOAT_TYPE_P (t)
20969 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
20970 return float_type_node;
20971
20972 return NULL_TREE;
20973 }
20974
20975 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20976
20977 static bool
20978 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
20979 optimization_type opt_type)
20980 {
20981 switch (op)
20982 {
20983 case rsqrt_optab:
20984 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
20985
20986 default:
20987 return true;
20988 }
20989 }
20990
20991 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20992
20993 static unsigned int
20994 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
20995 int *offset)
20996 {
20997 /* Polynomial invariant 1 == (VG / 2) - 1. */
20998 gcc_assert (i == 1);
20999 *factor = 2;
21000 *offset = 1;
21001 return AARCH64_DWARF_VG;
21002 }
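
/* Illustrative standalone sketch (not part of this file) of how a
   consumer of the DWARF info evaluates a poly_int-valued quantity:
   indeterminate 1 is computed as (VG / factor) - offset, i.e.
   (VG / 2) - 1, where VG is the number of 64-bit granules in an SVE
   vector (2 for 128-bit vectors, 4 for 256-bit, ...).  The helper name
   eval_sve_poly and the example coefficients are hypothetical.  */
static long
eval_sve_poly (long c0, long c1, long vg)
{
  long indeterminate = vg / 2 - 1;
  return c0 + c1 * indeterminate;
}

/* Example: a frame offset of (16 + 16x) bytes evaluates to
   eval_sve_poly (16, 16, 2) == 16 bytes with 128-bit vectors and to
   eval_sve_poly (16, 16, 4) == 32 bytes with 256-bit vectors.  */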
21003
21004 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21005 if MODE is HFmode, and punt to the generic implementation otherwise. */
21006
21007 static bool
21008 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21009 {
21010 return (mode == HFmode
21011 ? true
21012 : default_libgcc_floating_mode_supported_p (mode));
21013 }
21014
21015 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21016 if MODE is HFmode, and punt to the generic implementation otherwise. */
21017
21018 static bool
21019 aarch64_scalar_mode_supported_p (scalar_mode mode)
21020 {
21021 return (mode == HFmode
21022 ? true
21023 : default_scalar_mode_supported_p (mode));
21024 }
21025
21026 /* Set the value of FLT_EVAL_METHOD.
21027 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21028
21029 0: evaluate all operations and constants, whose semantic type has at
21030 most the range and precision of type float, to the range and
21031 precision of float; evaluate all other operations and constants to
21032 the range and precision of the semantic type;
21033
21034 N, where _FloatN is a supported interchange floating type:
21035 evaluate all operations and constants, whose semantic type has at
21036 most the range and precision of the _FloatN type, to the range and
21037 precision of the _FloatN type; evaluate all other operations and
21038 constants to the range and precision of the semantic type;
21039
21040 If we have the ARMv8.2-A extensions then we support _Float16 in native
21041 precision, so we should set this to 16. Otherwise, we support the type,
21042 but want to evaluate expressions in float precision, so set this to
21043 0. */
21044
21045 static enum flt_eval_method
21046 aarch64_excess_precision (enum excess_precision_type type)
21047 {
21048 switch (type)
21049 {
21050 case EXCESS_PRECISION_TYPE_FAST:
21051 case EXCESS_PRECISION_TYPE_STANDARD:
21052 /* We can calculate either in 16-bit range and precision or
21053 32-bit range and precision. Make that decision based on whether
21054 we have native support for the ARMv8.2-A 16-bit floating-point
21055 instructions or not. */
21056 return (TARGET_FP_F16INST
21057 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21058 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21059 case EXCESS_PRECISION_TYPE_IMPLICIT:
21060 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21061 default:
21062 gcc_unreachable ();
21063 }
21064 return FLT_EVAL_METHOD_UNPREDICTABLE;
21065 }
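
/* Illustrative standalone example (not part of this file) of the two
   evaluation methods chosen above; assumes a compiler that provides
   _Float16 (ISO/IEC TS 18661-3).  */
static _Float16
f16_dot2 (_Float16 a0, _Float16 b0, _Float16 a1, _Float16 b1)
{
  /* With FLT_EVAL_METHOD == 16 (TARGET_FP_F16INST) the products and the
     sum are evaluated in half precision using the native FP16
     instructions.  With FLT_EVAL_METHOD == 0 the operands are widened
     to float, the arithmetic happens in single precision, and the
     result is rounded back to _Float16 on return.  */
  return a0 * b0 + a1 * b1;
}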
21066
21067 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21068 scheduled for speculative execution. Reject the long-running division
21069 and square-root instructions. */
21070
21071 static bool
21072 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21073 {
21074 switch (get_attr_type (insn))
21075 {
21076 case TYPE_SDIV:
21077 case TYPE_UDIV:
21078 case TYPE_FDIVS:
21079 case TYPE_FDIVD:
21080 case TYPE_FSQRTS:
21081 case TYPE_FSQRTD:
21082 case TYPE_NEON_FP_SQRT_S:
21083 case TYPE_NEON_FP_SQRT_D:
21084 case TYPE_NEON_FP_SQRT_S_Q:
21085 case TYPE_NEON_FP_SQRT_D_Q:
21086 case TYPE_NEON_FP_DIV_S:
21087 case TYPE_NEON_FP_DIV_D:
21088 case TYPE_NEON_FP_DIV_S_Q:
21089 case TYPE_NEON_FP_DIV_D_Q:
21090 return false;
21091 default:
21092 return true;
21093 }
21094 }
21095
21096 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21097
21098 static int
21099 aarch64_compute_pressure_classes (reg_class *classes)
21100 {
21101 int i = 0;
21102 classes[i++] = GENERAL_REGS;
21103 classes[i++] = FP_REGS;
21104 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21105 registers need to go in PR_LO_REGS at some point during their
21106 lifetime. Splitting it into two halves has the effect of making
21107 all predicates count against PR_LO_REGS, so that we try whenever
21108 possible to restrict the number of live predicates to 8. This
21109 greatly reduces the amount of spilling in certain loops. */
21110 classes[i++] = PR_LO_REGS;
21111 classes[i++] = PR_HI_REGS;
21112 return i;
21113 }
21114
21115 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21116
21117 static bool
21118 aarch64_can_change_mode_class (machine_mode from,
21119 machine_mode to, reg_class_t)
21120 {
21121 if (BYTES_BIG_ENDIAN)
21122 {
21123 bool from_sve_p = aarch64_sve_data_mode_p (from);
21124 bool to_sve_p = aarch64_sve_data_mode_p (to);
21125
21126 /* Don't allow changes between SVE data modes and non-SVE modes.
21127 See the comment at the head of aarch64-sve.md for details. */
21128 if (from_sve_p != to_sve_p)
21129 return false;
21130
21131 /* Don't allow changes in element size: lane 0 of the new vector
21132 would not then be lane 0 of the old vector. See the comment
21133 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21134 description.
21135
21136 In the worst case, this forces a register to be spilled in
21137 one mode and reloaded in the other, which handles the
21138 endianness correctly. */
21139 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21140 return false;
21141 }
21142 return true;
21143 }
21144
21145 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
21146
21147 static void
21148 aarch64_select_early_remat_modes (sbitmap modes)
21149 {
21150 /* SVE values are not normally live across a call, so it should be
21151 worth doing early rematerialization even in VL-specific mode. */
21152 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21153 if (aarch64_sve_mode_p ((machine_mode) i))
21154 bitmap_set_bit (modes, i);
21155 }
21156
21157 /* Override the default target speculation_safe_value. */
21158 static rtx
21159 aarch64_speculation_safe_value (machine_mode mode,
21160 rtx result, rtx val, rtx failval)
21161 {
21162 /* Maybe we should warn if falling back to hard barriers. They are
21163 likely to be noticeably more expensive than the alternative below. */
21164 if (!aarch64_track_speculation)
21165 return default_speculation_safe_value (mode, result, val, failval);
21166
21167 if (!REG_P (val))
21168 val = copy_to_mode_reg (mode, val);
21169
21170 if (!aarch64_reg_or_zero (failval, mode))
21171 failval = copy_to_mode_reg (mode, failval);
21172
21173 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21174 return result;
21175 }
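
/* Illustrative standalone example (not part of this file) of the
   user-level builtin that this hook expands.  With -mtrack-speculation
   the despeculate_copy pattern above is used; otherwise GCC falls back
   to the default hard speculation barrier.  The function name and the
   bound-check shape are hypothetical.  */
int
load_element (const int *array, unsigned long idx, unsigned long bound)
{
  if (idx < bound)
    {
      /* On a mis-speculated path the index is forced to the failval (0)
         before the dereference, so it cannot act as a Spectre gadget.  */
      idx = __builtin_speculation_safe_value (idx, 0);
      return array[idx];
    }
  return 0;
}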
21176
21177 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21178 Look into the tuning structure for an estimate.
21179 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21180 Advanced SIMD 128 bits. */
21181
21182 static HOST_WIDE_INT
21183 aarch64_estimated_poly_value (poly_int64 val)
21184 {
21185 enum aarch64_sve_vector_bits_enum width_source
21186 = aarch64_tune_params.sve_width;
21187
21188 /* If the tuning structure provides no estimate, use the default. */
21189 if (width_source == SVE_SCALABLE)
21190 return default_estimated_poly_value (val);
21191
21192 HOST_WIDE_INT over_128 = width_source - 128;
21193 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21194 }
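
/* Illustrative standalone sketch (not part of this file) of the
   estimate above; the helper name estimated_bytes and the example
   values are hypothetical.  With sve_width == 256, a poly_int64 of
   (16 + 16x) bytes -- coeffs[0] == 16, coeffs[1] == 16 -- is estimated
   as 16 + 16 * (256 - 128) / 128 == 32 bytes, i.e. one 256-bit vector.
   With sve_width == SVE_SCALABLE the generic default is used instead.  */
static long
estimated_bytes (long c0, long c1, long sve_width_bits)
{
  long over_128 = sve_width_bits - 128;
  return c0 + c1 * over_128 / 128;
}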
21195
21196
21197 /* Return true for types that could be supported as SIMD return or
21198 argument types. */
21199
21200 static bool
21201 supported_simd_type (tree t)
21202 {
21203 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21204 {
21205 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21206 return s == 1 || s == 2 || s == 4 || s == 8;
21207 }
21208 return false;
21209 }
21210
21211 /* Return true for types that are currently supported as SIMD return
21212 or argument types. */
21213
21214 static bool
21215 currently_supported_simd_type (tree t, tree b)
21216 {
21217 if (COMPLEX_FLOAT_TYPE_P (t))
21218 return false;
21219
21220 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21221 return false;
21222
21223 return supported_simd_type (t);
21224 }
21225
21226 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21227
21228 static int
21229 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21230 struct cgraph_simd_clone *clonei,
21231 tree base_type, int num)
21232 {
21233 tree t, ret_type, arg_type;
21234 unsigned int elt_bits, vec_bits, count;
21235
21236 if (!TARGET_SIMD)
21237 return 0;
21238
21239 if (clonei->simdlen
21240 && (clonei->simdlen < 2
21241 || clonei->simdlen > 1024
21242 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21243 {
21244 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21245 "unsupported simdlen %d", clonei->simdlen);
21246 return 0;
21247 }
21248
21249 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21250 if (TREE_CODE (ret_type) != VOID_TYPE
21251 && !currently_supported_simd_type (ret_type, base_type))
21252 {
21253 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21254 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21255 "GCC does not currently support mixed size types "
21256 "for %<simd%> functions");
21257 else if (supported_simd_type (ret_type))
21258 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21259 "GCC does not currently support return type %qT "
21260 "for %<simd%> functions", ret_type);
21261 else
21262 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21263 "unsupported return type %qT for %<simd%> functions",
21264 ret_type);
21265 return 0;
21266 }
21267
21268 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21269 {
21270 arg_type = TREE_TYPE (t);
21271
21272 if (!currently_supported_simd_type (arg_type, base_type))
21273 {
21274 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21275 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21276 "GCC does not currently support mixed size types "
21277 "for %<simd%> functions");
21278 else
21279 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21280 "GCC does not currently support argument type %qT "
21281 "for %<simd%> functions", arg_type);
21282 return 0;
21283 }
21284 }
21285
21286 clonei->vecsize_mangle = 'n';
21287 clonei->mask_mode = VOIDmode;
21288 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21289 if (clonei->simdlen == 0)
21290 {
21291 count = 2;
21292 vec_bits = (num == 0 ? 64 : 128);
21293 clonei->simdlen = vec_bits / elt_bits;
21294 }
21295 else
21296 {
21297 count = 1;
21298 vec_bits = clonei->simdlen * elt_bits;
21299 if (vec_bits != 64 && vec_bits != 128)
21300 {
21301 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21302 "GCC does not currently support simdlen %d for type %qT",
21303 clonei->simdlen, base_type);
21304 return 0;
21305 }
21306 }
21307 clonei->vecsize_int = vec_bits;
21308 clonei->vecsize_float = vec_bits;
21309 return count;
21310 }
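
/* Illustrative standalone sketch (not part of this file) of the
   simdlen choice above for a function declared with
   "#pragma omp declare simd" (or __attribute__ ((simd))) and no
   explicit simdlen clause: two Advanced SIMD clones are created, one
   for 64-bit and one for 128-bit vectors.  The helper name simdlen_for
   is hypothetical.  */
static unsigned int
simdlen_for (unsigned int vec_bits, unsigned int elt_bits)
{
  return vec_bits / elt_bits;
}

/* For a 32-bit element type such as float or int:
   simdlen_for (64, 32) == 2 and simdlen_for (128, 32) == 4.
   For a 64-bit element type: simdlen_for (128, 64) == 2.  */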
21311
21312 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21313
21314 static void
21315 aarch64_simd_clone_adjust (struct cgraph_node *node)
21316 {
21317 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21318 use the correct ABI. */
21319
21320 tree t = TREE_TYPE (node->decl);
21321 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21322 TYPE_ATTRIBUTES (t));
21323 }
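
/* Illustrative standalone example (not part of this file) of the
   attribute added above, as it can also be written by hand: a callee
   using the vector PCS variant preserves q8-q23 in full rather than
   only the low halves of v8-v15, which lets vectorized callers keep
   more live vector values across the call.  */
__attribute__ ((aarch64_vector_pcs))
void vector_callee (void);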
21324
21325 /* Implement TARGET_SIMD_CLONE_USABLE. */
21326
21327 static int
21328 aarch64_simd_clone_usable (struct cgraph_node *node)
21329 {
21330 switch (node->simdclone->vecsize_mangle)
21331 {
21332 case 'n':
21333 if (!TARGET_SIMD)
21334 return -1;
21335 return 0;
21336 default:
21337 gcc_unreachable ();
21338 }
21339 }
21340
21341 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
21342
21343 static int
21344 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21345 {
21346 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21347 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21348 return 0;
21349 return 1;
21350 }
21351
21352 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
21353
21354 static const char *
21355 aarch64_get_multilib_abi_name (void)
21356 {
21357 if (TARGET_BIG_END)
21358 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21359 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21360 }
21361
21362 /* Implement TARGET_STACK_PROTECT_GUARD. If the guard is a
21363 global variable, use the default implementation; otherwise
21364 return NULL_TREE. */
21365 static tree
21366 aarch64_stack_protect_guard (void)
21367 {
21368 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21369 return default_stack_protect_guard ();
21370
21371 return NULL_TREE;
21372 }
21373
21374 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21375 section at the end if needed. */
21376 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21377 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21378 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21379 void
21380 aarch64_file_end_indicate_exec_stack ()
21381 {
21382 file_end_indicate_exec_stack ();
21383
21384 unsigned feature_1_and = 0;
21385 if (aarch64_bti_enabled ())
21386 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
21387
21388 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
21389 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
21390
21391 if (feature_1_and)
21392 {
21393 /* Generate .note.gnu.property section. */
21394 switch_to_section (get_section (".note.gnu.property",
21395 SECTION_NOTYPE, NULL));
21396
21397 /* PT_NOTE header: namesz, descsz, type.
21398 namesz = 4 ("GNU\0")
21399 descsz = 16 (Size of the program property array)
21400 [(12 + padding) * Number of array elements]
21401 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21402 assemble_align (POINTER_SIZE);
21403 assemble_integer (GEN_INT (4), 4, 32, 1);
21404 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
21405 assemble_integer (GEN_INT (5), 4, 32, 1);
21406
21407 /* PT_NOTE name. */
21408 assemble_string ("GNU", 4);
21409
21410 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21411 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21412 datasz = 4
21413 data = feature_1_and. */
21414 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
21415 assemble_integer (GEN_INT (4), 4, 32, 1);
21416 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
21417
21418 /* Pad the size of the note to the required alignment. */
21419 assemble_align (POINTER_SIZE);
21420 }
21421 }
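
/* Illustrative standalone sketch (not part of this file) of the byte
   layout emitted above on an LP64 target, with hypothetical struct and
   field names; descsz is 12 rounded up to POINTER_BYTES == 8, and with
   both BTI and PAC enabled the property data is 3.  */
#include <stdint.h>

struct aarch64_gnu_property_note
{
  uint32_t namesz;      /* 4: strlen ("GNU") + 1.  */
  uint32_t descsz;      /* 16: 12-byte property padded to an 8-byte multiple.  */
  uint32_t type;        /* 5: NT_GNU_PROPERTY_TYPE_0.  */
  char name[4];         /* "GNU\0".  */
  uint32_t pr_type;     /* GNU_PROPERTY_AARCH64_FEATURE_1_AND (0xc0000000).  */
  uint32_t pr_datasz;   /* 4.  */
  uint32_t pr_data;     /* feature_1_and: BTI (bit 0) | PAC (bit 1).  */
  uint32_t pad;         /* Padding up to the 8-byte note alignment.  */
};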
21422 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21423 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21424 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
21425
21426 /* Target-specific selftests. */
21427
21428 #if CHECKING_P
21429
21430 namespace selftest {
21431
21432 /* Selftest for the RTL loader.
21433 Verify that the RTL loader copes with a dump from
21434 print_rtx_function. This is essentially just a test that class
21435 function_reader can handle a real dump, but it also verifies
21436 that lookup_reg_by_dump_name correctly handles hard regs.
21437 The presence of hard reg names in the dump means that the test is
21438 target-specific, hence it is in this file. */
21439
21440 static void
21441 aarch64_test_loading_full_dump ()
21442 {
21443 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
21444
21445 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
21446
21447 rtx_insn *insn_1 = get_insn_by_uid (1);
21448 ASSERT_EQ (NOTE, GET_CODE (insn_1));
21449
21450 rtx_insn *insn_15 = get_insn_by_uid (15);
21451 ASSERT_EQ (INSN, GET_CODE (insn_15));
21452 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
21453
21454 /* Verify crtl->return_rtx. */
21455 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
21456 ASSERT_EQ (0, REGNO (crtl->return_rtx));
21457 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
21458 }
21459
21460 /* Run all target-specific selftests. */
21461
21462 static void
21463 aarch64_run_selftests (void)
21464 {
21465 aarch64_test_loading_full_dump ();
21466 }
21467
21468 } // namespace selftest
21469
21470 #endif /* #if CHECKING_P */
21471
21472 #undef TARGET_STACK_PROTECT_GUARD
21473 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21474
21475 #undef TARGET_ADDRESS_COST
21476 #define TARGET_ADDRESS_COST aarch64_address_cost
21477
21478 /* This hook determines whether unnamed bitfields affect the alignment
21479 of the containing structure. The hook returns true if the structure
21480 should inherit the alignment requirements of an unnamed bitfield's
21481 type. */
21482 #undef TARGET_ALIGN_ANON_BITFIELD
21483 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21484
21485 #undef TARGET_ASM_ALIGNED_DI_OP
21486 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21487
21488 #undef TARGET_ASM_ALIGNED_HI_OP
21489 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21490
21491 #undef TARGET_ASM_ALIGNED_SI_OP
21492 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21493
21494 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21495 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21496 hook_bool_const_tree_hwi_hwi_const_tree_true
21497
21498 #undef TARGET_ASM_FILE_START
21499 #define TARGET_ASM_FILE_START aarch64_start_file
21500
21501 #undef TARGET_ASM_OUTPUT_MI_THUNK
21502 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21503
21504 #undef TARGET_ASM_SELECT_RTX_SECTION
21505 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21506
21507 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21508 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21509
21510 #undef TARGET_BUILD_BUILTIN_VA_LIST
21511 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21512
21513 #undef TARGET_CALLEE_COPIES
21514 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21515
21516 #undef TARGET_CAN_ELIMINATE
21517 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21518
21519 #undef TARGET_CAN_INLINE_P
21520 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21521
21522 #undef TARGET_CANNOT_FORCE_CONST_MEM
21523 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21524
21525 #undef TARGET_CASE_VALUES_THRESHOLD
21526 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21527
21528 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21529 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21530
21531 /* Only the least significant bit is used for initialization guard
21532 variables. */
21533 #undef TARGET_CXX_GUARD_MASK_BIT
21534 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21535
21536 #undef TARGET_C_MODE_FOR_SUFFIX
21537 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
21538
21539 #ifdef TARGET_BIG_ENDIAN_DEFAULT
21540 #undef TARGET_DEFAULT_TARGET_FLAGS
21541 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
21542 #endif
21543
21544 #undef TARGET_CLASS_MAX_NREGS
21545 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
21546
21547 #undef TARGET_BUILTIN_DECL
21548 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
21549
21550 #undef TARGET_BUILTIN_RECIPROCAL
21551 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
21552
21553 #undef TARGET_C_EXCESS_PRECISION
21554 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
21555
21556 #undef TARGET_EXPAND_BUILTIN
21557 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
21558
21559 #undef TARGET_EXPAND_BUILTIN_VA_START
21560 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
21561
21562 #undef TARGET_FOLD_BUILTIN
21563 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
21564
21565 #undef TARGET_FUNCTION_ARG
21566 #define TARGET_FUNCTION_ARG aarch64_function_arg
21567
21568 #undef TARGET_FUNCTION_ARG_ADVANCE
21569 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
21570
21571 #undef TARGET_FUNCTION_ARG_BOUNDARY
21572 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
21573
21574 #undef TARGET_FUNCTION_ARG_PADDING
21575 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
21576
21577 #undef TARGET_GET_RAW_RESULT_MODE
21578 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
21579 #undef TARGET_GET_RAW_ARG_MODE
21580 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
21581
21582 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
21583 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
21584
21585 #undef TARGET_FUNCTION_VALUE
21586 #define TARGET_FUNCTION_VALUE aarch64_function_value
21587
21588 #undef TARGET_FUNCTION_VALUE_REGNO_P
21589 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
21590
21591 #undef TARGET_GIMPLE_FOLD_BUILTIN
21592 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
21593
21594 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
21595 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
21596
21597 #undef TARGET_INIT_BUILTINS
21598 #define TARGET_INIT_BUILTINS aarch64_init_builtins
21599
21600 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
21601 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
21602 aarch64_ira_change_pseudo_allocno_class
21603
21604 #undef TARGET_LEGITIMATE_ADDRESS_P
21605 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
21606
21607 #undef TARGET_LEGITIMATE_CONSTANT_P
21608 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
21609
21610 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
21611 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
21612 aarch64_legitimize_address_displacement
21613
21614 #undef TARGET_LIBGCC_CMP_RETURN_MODE
21615 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
21616
21617 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
21618 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
21619 aarch64_libgcc_floating_mode_supported_p
21620
21621 #undef TARGET_MANGLE_TYPE
21622 #define TARGET_MANGLE_TYPE aarch64_mangle_type
21623
21624 #undef TARGET_MEMORY_MOVE_COST
21625 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
21626
21627 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
21628 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
21629
21630 #undef TARGET_MUST_PASS_IN_STACK
21631 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
21632
21633 /* This target hook should return true if accesses to volatile bitfields
21634 should use the narrowest mode possible. It should return false if these
21635 accesses should use the bitfield container type. */
21636 #undef TARGET_NARROW_VOLATILE_BITFIELD
21637 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
21638
21639 #undef TARGET_OPTION_OVERRIDE
21640 #define TARGET_OPTION_OVERRIDE aarch64_override_options
21641
21642 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
21643 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
21644 aarch64_override_options_after_change
21645
21646 #undef TARGET_OPTION_SAVE
21647 #define TARGET_OPTION_SAVE aarch64_option_save
21648
21649 #undef TARGET_OPTION_RESTORE
21650 #define TARGET_OPTION_RESTORE aarch64_option_restore
21651
21652 #undef TARGET_OPTION_PRINT
21653 #define TARGET_OPTION_PRINT aarch64_option_print
21654
21655 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
21656 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
21657
21658 #undef TARGET_SET_CURRENT_FUNCTION
21659 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
21660
21661 #undef TARGET_PASS_BY_REFERENCE
21662 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
21663
21664 #undef TARGET_PREFERRED_RELOAD_CLASS
21665 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
21666
21667 #undef TARGET_SCHED_REASSOCIATION_WIDTH
21668 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
21669
21670 #undef TARGET_PROMOTED_TYPE
21671 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
21672
21673 #undef TARGET_SECONDARY_RELOAD
21674 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
21675
21676 #undef TARGET_SHIFT_TRUNCATION_MASK
21677 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
21678
21679 #undef TARGET_SETUP_INCOMING_VARARGS
21680 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
21681
21682 #undef TARGET_STRUCT_VALUE_RTX
21683 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
21684
21685 #undef TARGET_REGISTER_MOVE_COST
21686 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
21687
21688 #undef TARGET_RETURN_IN_MEMORY
21689 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
21690
21691 #undef TARGET_RETURN_IN_MSB
21692 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
21693
21694 #undef TARGET_RTX_COSTS
21695 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
21696
21697 #undef TARGET_SCALAR_MODE_SUPPORTED_P
21698 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
21699
21700 #undef TARGET_SCHED_ISSUE_RATE
21701 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
21702
21703 #undef TARGET_SCHED_VARIABLE_ISSUE
21704 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
21705
21706 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
21707 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
21708 aarch64_sched_first_cycle_multipass_dfa_lookahead
21709
21710 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
21711 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
21712 aarch64_first_cycle_multipass_dfa_lookahead_guard
21713
21714 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
21715 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
21716 aarch64_get_separate_components
21717
21718 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
21719 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
21720 aarch64_components_for_bb
21721
21722 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
21723 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
21724 aarch64_disqualify_components
21725
21726 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
21727 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
21728 aarch64_emit_prologue_components
21729
21730 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
21731 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
21732 aarch64_emit_epilogue_components
21733
21734 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
21735 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
21736 aarch64_set_handled_components
21737
21738 #undef TARGET_TRAMPOLINE_INIT
21739 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
21740
21741 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
21742 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
21743
21744 #undef TARGET_VECTOR_MODE_SUPPORTED_P
21745 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
21746
21747 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
21748 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
21749 aarch64_builtin_support_vector_misalignment
21750
21751 #undef TARGET_ARRAY_MODE
21752 #define TARGET_ARRAY_MODE aarch64_array_mode
21753
21754 #undef TARGET_ARRAY_MODE_SUPPORTED_P
21755 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
21756
21757 #undef TARGET_VECTORIZE_ADD_STMT_COST
21758 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
21759
21760 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
21761 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
21762 aarch64_builtin_vectorization_cost
21763
21764 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
21765 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
21766
21767 #undef TARGET_VECTORIZE_BUILTINS
21768 #define TARGET_VECTORIZE_BUILTINS
21769
21770 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
21771 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
21772 aarch64_builtin_vectorized_function
21773
21774 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
21775 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
21776 aarch64_autovectorize_vector_sizes
21777
21778 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
21779 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
21780 aarch64_atomic_assign_expand_fenv
21781
21782 /* Section anchor support. */
21783
21784 #undef TARGET_MIN_ANCHOR_OFFSET
21785 #define TARGET_MIN_ANCHOR_OFFSET -256
21786
21787 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
21788 byte offset; we can do much more for larger data types, but have no way
21789 to determine the size of the access. We assume accesses are aligned. */
21790 #undef TARGET_MAX_ANCHOR_OFFSET
21791 #define TARGET_MAX_ANCHOR_OFFSET 4095
21792
21793 #undef TARGET_VECTOR_ALIGNMENT
21794 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
21795
21796 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
21797 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
21798 aarch64_vectorize_preferred_vector_alignment
21799 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
21800 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
21801 aarch64_simd_vector_alignment_reachable
21802
21803 /* vec_perm support. */
21804
21805 #undef TARGET_VECTORIZE_VEC_PERM_CONST
21806 #define TARGET_VECTORIZE_VEC_PERM_CONST \
21807 aarch64_vectorize_vec_perm_const
21808
21809 #undef TARGET_VECTORIZE_GET_MASK_MODE
21810 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
21811 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
21812 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
21813 aarch64_empty_mask_is_expensive
21814 #undef TARGET_PREFERRED_ELSE_VALUE
21815 #define TARGET_PREFERRED_ELSE_VALUE \
21816 aarch64_preferred_else_value
21817
21818 #undef TARGET_INIT_LIBFUNCS
21819 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
21820
21821 #undef TARGET_FIXED_CONDITION_CODE_REGS
21822 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
21823
21824 #undef TARGET_FLAGS_REGNUM
21825 #define TARGET_FLAGS_REGNUM CC_REGNUM
21826
21827 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
21828 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
21829
21830 #undef TARGET_ASAN_SHADOW_OFFSET
21831 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
21832
21833 #undef TARGET_LEGITIMIZE_ADDRESS
21834 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
21835
21836 #undef TARGET_SCHED_CAN_SPECULATE_INSN
21837 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
21838
21839 #undef TARGET_CAN_USE_DOLOOP_P
21840 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
21841
21842 #undef TARGET_SCHED_ADJUST_PRIORITY
21843 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
21844
21845 #undef TARGET_SCHED_MACRO_FUSION_P
21846 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
21847
21848 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
21849 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
21850
21851 #undef TARGET_SCHED_FUSION_PRIORITY
21852 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
21853
21854 #undef TARGET_UNSPEC_MAY_TRAP_P
21855 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
21856
21857 #undef TARGET_USE_PSEUDO_PIC_REG
21858 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
21859
21860 #undef TARGET_PRINT_OPERAND
21861 #define TARGET_PRINT_OPERAND aarch64_print_operand
21862
21863 #undef TARGET_PRINT_OPERAND_ADDRESS
21864 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
21865
21866 #undef TARGET_OPTAB_SUPPORTED_P
21867 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
21868
21869 #undef TARGET_OMIT_STRUCT_RETURN_REG
21870 #define TARGET_OMIT_STRUCT_RETURN_REG true
21871
21872 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
21873 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
21874 aarch64_dwarf_poly_indeterminate_value
21875
21876 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
21877 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
21878 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
21879
21880 #undef TARGET_HARD_REGNO_NREGS
21881 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
21882 #undef TARGET_HARD_REGNO_MODE_OK
21883 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
21884
21885 #undef TARGET_MODES_TIEABLE_P
21886 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
21887
21888 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
21889 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
21890 aarch64_hard_regno_call_part_clobbered
21891
21892 #undef TARGET_INSN_CALLEE_ABI
21893 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
21894
21895 #undef TARGET_CONSTANT_ALIGNMENT
21896 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
21897
21898 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
21899 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
21900 aarch64_stack_clash_protection_alloca_probe_range
21901
21902 #undef TARGET_COMPUTE_PRESSURE_CLASSES
21903 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
21904
21905 #undef TARGET_CAN_CHANGE_MODE_CLASS
21906 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
21907
21908 #undef TARGET_SELECT_EARLY_REMAT_MODES
21909 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
21910
21911 #undef TARGET_SPECULATION_SAFE_VALUE
21912 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
21913
21914 #undef TARGET_ESTIMATED_POLY_VALUE
21915 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
21916
21917 #undef TARGET_ATTRIBUTE_TABLE
21918 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
21919
21920 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
21921 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
21922 aarch64_simd_clone_compute_vecsize_and_simdlen
21923
21924 #undef TARGET_SIMD_CLONE_ADJUST
21925 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
21926
21927 #undef TARGET_SIMD_CLONE_USABLE
21928 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
21929
21930 #undef TARGET_COMP_TYPE_ATTRIBUTES
21931 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
21932
21933 #undef TARGET_GET_MULTILIB_ABI_NAME
21934 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
21935
21936 #undef TARGET_FNTYPE_ABI
21937 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
21938
21939 #if CHECKING_P
21940 #undef TARGET_RUN_TARGET_SELFTESTS
21941 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
21942 #endif /* #if CHECKING_P */
21943
21944 #undef TARGET_ASM_POST_CFI_STARTPROC
21945 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21946
21947 #undef TARGET_STRICT_ARGUMENT_NAMING
21948 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
21949
21950 struct gcc_target targetm = TARGET_INITIALIZER;
21951
21952 #include "gt-aarch64.h"