1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
175
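/* An illustrative sketch (not taken from a specific call site) of how the
   constructors above might be used by the immediate-classification code
   later in this file, such as aarch64_simd_valid_immediate:

     simd_immediate_info info;
     info = simd_immediate_info (QImode, 0x55);        // MOV, (LSL, 0)
     info = simd_immediate_info (SImode, 0xab,
                                 simd_immediate_info::MOV,
                                 simd_immediate_info::LSL, 8);
     info = simd_immediate_info (HImode, const0_rtx, const1_rtx);  // INDEX #0, #1

   The real callers only fill in an object like this after checking that
   the corresponding instruction can actually encode the constant.  */
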
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
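/* For example, an entry in aarch64-fusion-pairs.def along the lines of

     AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD)

   (the exact spelling lives in the .def file) expands via the macro above
   into

     { "adrp+add", AARCH64_FUSE_ADRP_ADD },

   so the names accepted by the "fuse=" tuning override further down line
   up with the AARCH64_FUSE_* flags used in the tune_params tables.  */
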
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (the actual costs are 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Cortex-A57 costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* X-Gene 1 costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1241
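/* A worked example of the XOR trick above: the enumerators are laid out so
   that each condition and its logical inverse differ only in bit 0, e.g.

     AARCH64_EQ (0)  ^ 1 -> AARCH64_NE (1)
     AARCH64_CS (2)  ^ 1 -> AARCH64_CC (3)
     AARCH64_GE (10) ^ 1 -> AARCH64_LT (11)

   so AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) is AARCH64_LE.  (AL and
   NV have no meaningful inverse, so the macro is not intended for them.)  */
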
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1253 caller should print an error.
1254 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1255 prints its own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
1336
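/* As a rough illustration of how the tables above are used by
   aarch64_parse_branch_protection (declared earlier in this file): for
   -mbranch-protection=pac-ret+leaf+bti the tokens are looked up first in
   aarch64_branch_protect_types and then in aarch64_pac_ret_subtypes, so
   the handlers run in sequence:

     aarch64_handle_pac_ret_protection  ->  sign non-leaf returns, key A
     aarch64_handle_pac_ret_leaf        ->  widen signing to all functions
     aarch64_handle_bti_protection      ->  aarch64_enable_bti = 1

   The exact token-splitting logic lives in the parsing code rather than
   in these tables.  */
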
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value PATTERN. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
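/* For instance, assuming the usual ALL and VL4 entries in the
   AARCH64_FOR_SVPATTERN list, svpattern_token (AARCH64_SV_ALL) returns
   "all" and svpattern_token (AARCH64_SV_VL4) returns "vl4", matching the
   assembler spelling of the PTRUE pattern operand.  */
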
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
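/* To illustrate the sequence aarch64_gen_far_branch emits: callers pass
   BRANCH_FORMAT as the already-inverted branch mnemonic, so an
   out-of-range "tbnz x0, 3, .Ldest" ends up being output roughly as

     tbz   x0, 3, .Ltmp      // inverted test skips the far branch
     b     .Ldest            // unconditional B has a +/-128 MiB range
   .Ltmp:

   where .Ltmp is the internal label generated above (the actual label
   names come from ASM_GENERATE_INTERNAL_LABEL).  */
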
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1486 static bool
1487 aarch64_advsimd_struct_mode_p (machine_mode mode)
1488 {
1489 return (TARGET_SIMD
1490 && (mode == OImode || mode == CImode || mode == XImode));
1491 }
1492
1493 /* Return true if MODE is an SVE predicate mode. */
1494 static bool
1495 aarch64_sve_pred_mode_p (machine_mode mode)
1496 {
1497 return (TARGET_SVE
1498 && (mode == VNx16BImode
1499 || mode == VNx8BImode
1500 || mode == VNx4BImode
1501 || mode == VNx2BImode));
1502 }
1503
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD = 1;
1506 const unsigned int VEC_SVE_DATA = 2;
1507 const unsigned int VEC_SVE_PRED = 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT = 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1513 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1514
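/* For example, when the relevant target features are enabled,
   aarch64_classify_vector_mode below maps:

     V4SImode    -> VEC_ADVSIMD
     XImode      -> VEC_ADVSIMD | VEC_STRUCT
     VNx4SImode  -> VEC_SVE_DATA
     VNx8DFmode  -> VEC_SVE_DATA | VEC_STRUCT   (an x4 tuple)
     VNx4BImode  -> VEC_SVE_PRED

   so a typical caller simply tests bits, e.g. "flags & VEC_ANY_SVE" to
   catch both SVE data and SVE predicate modes.  */
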
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1517 static unsigned int
1518 aarch64_classify_vector_mode (machine_mode mode)
1519 {
1520 if (aarch64_advsimd_struct_mode_p (mode))
1521 return VEC_ADVSIMD | VEC_STRUCT;
1522
1523 if (aarch64_sve_pred_mode_p (mode))
1524 return VEC_SVE_PRED;
1525
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1529 switch (mode)
1530 {
1531 /* Single SVE vectors. */
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8HFmode:
1537 case E_VNx4SFmode:
1538 case E_VNx2DFmode:
1539 return TARGET_SVE ? VEC_SVE_DATA : 0;
1540
1541 /* x2 SVE vectors. */
1542 case E_VNx32QImode:
1543 case E_VNx16HImode:
1544 case E_VNx8SImode:
1545 case E_VNx4DImode:
1546 case E_VNx16HFmode:
1547 case E_VNx8SFmode:
1548 case E_VNx4DFmode:
1549 /* x3 SVE vectors. */
1550 case E_VNx48QImode:
1551 case E_VNx24HImode:
1552 case E_VNx12SImode:
1553 case E_VNx6DImode:
1554 case E_VNx24HFmode:
1555 case E_VNx12SFmode:
1556 case E_VNx6DFmode:
1557 /* x4 SVE vectors. */
1558 case E_VNx64QImode:
1559 case E_VNx32HImode:
1560 case E_VNx16SImode:
1561 case E_VNx8DImode:
1562 case E_VNx32HFmode:
1563 case E_VNx16SFmode:
1564 case E_VNx8DFmode:
1565 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1566
1567 /* 64-bit Advanced SIMD vectors. */
1568 case E_V8QImode:
1569 case E_V4HImode:
1570 case E_V2SImode:
1571 /* ...E_V1DImode doesn't exist. */
1572 case E_V4HFmode:
1573 case E_V2SFmode:
1574 case E_V1DFmode:
1575 /* 128-bit Advanced SIMD vectors. */
1576 case E_V16QImode:
1577 case E_V8HImode:
1578 case E_V4SImode:
1579 case E_V2DImode:
1580 case E_V8HFmode:
1581 case E_V4SFmode:
1582 case E_V2DFmode:
1583 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1584
1585 default:
1586 return 0;
1587 }
1588 }
1589
1590 /* Return true if MODE is any of the data vector modes, including
1591 structure modes. */
1592 static bool
1593 aarch64_vector_data_mode_p (machine_mode mode)
1594 {
1595 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1596 }
1597
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1600 static bool
1601 aarch64_sve_data_mode_p (machine_mode mode)
1602 {
1603 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1604 }
1605
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1609 {
1610 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1611 && IN_RANGE (nelems, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode),
1613 GET_MODE_NUNITS (mode) * nelems);
1614
1615 return opt_machine_mode ();
1616 }
1617
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1619 static bool
1620 aarch64_array_mode_supported_p (machine_mode mode,
1621 unsigned HOST_WIDE_INT nelems)
1622 {
1623 if (TARGET_SIMD
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1626 && (nelems >= 2 && nelems <= 4))
1627 return true;
1628
1629 return false;
1630 }
1631
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1634
1635 opt_machine_mode
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1637 {
1638 if (TARGET_SVE)
1639 {
1640 if (elem_nbytes == 1)
1641 return VNx16BImode;
1642 if (elem_nbytes == 2)
1643 return VNx8BImode;
1644 if (elem_nbytes == 4)
1645 return VNx4BImode;
1646 if (elem_nbytes == 8)
1647 return VNx2BImode;
1648 }
1649 return opt_machine_mode ();
1650 }
1651
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1653
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1656 {
1657 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1658 {
1659 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1660 machine_mode pred_mode;
1661 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1662 return pred_mode;
1663 }
1664
1665 return default_get_mask_mode (nunits, nbytes);
1666 }
1667
1668 /* Return the integer element mode associated with SVE mode MODE. */
1669
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode)
1672 {
1673 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1674 GET_MODE_NUNITS (mode));
1675 return int_mode_for_size (elt_bits, 0).require ();
1676 }
1677
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1684 natural. */
1685
1686 static tree
1687 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1688 {
1689 return nops == 3 ? ops[2] : ops[0];
1690 }
1691
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1693
1694 static unsigned int
1695 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1696 {
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1702 switch (aarch64_regno_regclass (regno))
1703 {
1704 case FP_REGS:
1705 case FP_LO_REGS:
1706 case FP_LO8_REGS:
1707 if (aarch64_sve_data_mode_p (mode))
1708 return exact_div (GET_MODE_SIZE (mode),
1709 BYTES_PER_SVE_VECTOR).to_constant ();
1710 return CEIL (lowest_size, UNITS_PER_VREG);
1711 case PR_REGS:
1712 case PR_LO_REGS:
1713 case PR_HI_REGS:
1714 return 1;
1715 default:
1716 return CEIL (lowest_size, UNITS_PER_WORD);
1717 }
1718 gcc_unreachable ();
1719 }
1720
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1722
1723 static bool
1724 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1725 {
1726 if (GET_MODE_CLASS (mode) == MODE_CC)
1727 return regno == CC_REGNUM;
1728
1729 if (regno == VG_REGNUM)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode == DImode;
1732
1733 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1734 if (vec_flags & VEC_SVE_PRED)
1735 return PR_REGNUM_P (regno);
1736
1737 if (PR_REGNUM_P (regno))
1738 return 0;
1739
1740 if (regno == SP_REGNUM)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode == Pmode || mode == ptr_mode;
1745
1746 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1747 return mode == Pmode;
1748
1749 if (GP_REGNUM_P (regno))
1750 {
1751 if (known_le (GET_MODE_SIZE (mode), 8))
1752 return true;
1753 else if (known_le (GET_MODE_SIZE (mode), 16))
1754 return (regno & 1) == 0;
1755 }
1756 else if (FP_REGNUM_P (regno))
1757 {
1758 if (vec_flags & VEC_STRUCT)
1759 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1760 else
1761 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1762 }
1763
1764 return false;
1765 }
1766
1767 /* Return true if this is a definition of a vectorized simd function. */
1768
1769 static bool
1770 aarch64_simd_decl_p (tree fndecl)
1771 {
1772 tree fntype;
1773
1774 if (fndecl == NULL)
1775 return false;
1776 fntype = TREE_TYPE (fndecl);
1777 if (fntype == NULL)
1778 return false;
1779
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1782 return true;
1783
1784 return false;
1785 }
1786
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1790 SIMD functions. */
1791
1792 static machine_mode
1793 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1794 {
1795 return GP_REGNUM_P (regno)
1796 ? E_DImode
1797 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1798 }
1799
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1802 the function. */
1803
1804 static bool
1805 aarch64_simd_call_p (rtx_insn *insn)
1806 {
1807 rtx symbol;
1808 rtx call;
1809 tree fndecl;
1810
1811 gcc_assert (CALL_P (insn));
1812 call = get_call_rtx_from (insn);
1813 symbol = XEXP (XEXP (call, 0), 0);
1814 if (GET_CODE (symbol) != SYMBOL_REF)
1815 return false;
1816 fndecl = SYMBOL_REF_DECL (symbol);
1817 if (!fndecl)
1818 return false;
1819
1820 return aarch64_simd_decl_p (fndecl);
1821 }
1822
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1826
1827 void
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1829 HARD_REG_SET *return_set)
1830 {
1831 if (aarch64_simd_call_p (insn))
1832 {
1833 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno))
1835 CLEAR_HARD_REG_BIT (*return_set, regno);
1836 }
1837 }
1838
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1842
1843 static bool
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1845 machine_mode mode)
1846 {
1847 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1848 return FP_REGNUM_P (regno)
1849 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1850 }
1851
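/* For example, a 16-byte V4SImode value in v8 is partially clobbered by
   an ordinary call (only the low 64 bits survive) but not by a call to
   an aarch64_vector_pcs function, which preserves the full 128 bits of
   v8-v23.  A variable-length SVE vector is partially clobbered in both
   cases, since it may be wider than 128 bits.  */
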
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1853
1854 rtx_insn *
1855 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1856 {
1857 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1858
1859 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1860 return call_1;
1861 else
1862 return call_2;
1863 }
1864
1865 /* Implement REGMODE_NATURAL_SIZE. */
1866 poly_uint64
1867 aarch64_regmode_natural_size (machine_mode mode)
1868 {
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg.is_constant ())
1877 {
1878 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1879 if (vec_flags & VEC_SVE_PRED)
1880 return BYTES_PER_SVE_PRED;
1881 if (vec_flags & VEC_SVE_DATA)
1882 return BYTES_PER_SVE_VECTOR;
1883 }
1884 return UNITS_PER_WORD;
1885 }
1886
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1888 machine_mode
1889 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1890 machine_mode mode)
1891 {
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno))
1897 return mode;
1898 if (known_ge (GET_MODE_SIZE (mode), 4))
1899 return mode;
1900 else
1901 return SImode;
1902 }
1903
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1905 bool
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1907 {
1908 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1909 }
1910
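/* For example, 0xffffffffffff0000 negates to 0x10000, an exact power of
   two, so the function returns true, whereas 0xff00ff0000000000 negates
   to a value with several bits set, exact_log2 returns HOST_WIDE_INT_M1
   and the result is false.  */
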
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1913
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1916 {
1917 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1918 return MAX (align, BITS_PER_WORD);
1919 return align;
1920 }
1921
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
1924 static bool
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1926 {
1927 return false;
1928 }
1929
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1932 bool
1933 aarch64_is_long_call_p (rtx sym)
1934 {
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1936 }
1937
1938 /* Return true if calls to symbol-ref SYM should not go through
1939 plt stubs. */
1940
1941 bool
1942 aarch64_is_noplt_call_p (rtx sym)
1943 {
1944 const_tree decl = SYMBOL_REF_DECL (sym);
1945
1946 if (flag_pic
1947 && decl
1948 && (!flag_plt
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1950 && !targetm.binds_local_p (decl))
1951 return true;
1952
1953 return false;
1954 }
1955
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the parameters from
1959
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1961 bool
1962 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1963 rtx extract_imm)
1964 {
1965 HOST_WIDE_INT mult_val, extract_val;
1966
1967 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1968 return false;
1969
1970 mult_val = INTVAL (mult_imm);
1971 extract_val = INTVAL (extract_imm);
1972
1973 if (extract_val > 8
1974 && extract_val < GET_MODE_BITSIZE (mode)
1975 && exact_log2 (extract_val & ~7) > 0
1976 && (extract_val & 7) <= 4
1977 && mult_val == (1 << (extract_val & 7)))
1978 return true;
1979
1980 return false;
1981 }
1982
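/* For example, MULT_IMM == 4 and EXTRACT_IMM == 34 in DImode satisfy the
   checks above: 34 & ~7 is 32 (a power of two), 34 & 7 is 2, and
   4 == 1 << 2.  The extract then describes a zero-extended 32-bit value
   shifted left by 2, roughly the operand form used by
   "add x0, x1, w2, uxtw #2"-style address arithmetic.  MULT_IMM == 8
   with the same EXTRACT_IMM is rejected because 8 != 1 << 2.  */
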
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn *
1986 emit_set_insn (rtx x, rtx y)
1987 {
1988 return emit_insn (gen_rtx_SET (x, y));
1989 }
1990
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1993 rtx
1994 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1995 {
1996 machine_mode mode = SELECT_CC_MODE (code, x, y);
1997 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1998
1999 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2000 return cc_reg;
2001 }
2002
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2004
2005 static rtx
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2007 machine_mode y_mode)
2008 {
2009 if (y_mode == E_QImode || y_mode == E_HImode)
2010 {
2011 if (CONST_INT_P (y))
2012 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2013 else
2014 {
2015 rtx t, cc_reg;
2016 machine_mode cc_mode;
2017
2018 t = gen_rtx_ZERO_EXTEND (SImode, y);
2019 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2020 cc_mode = CC_SWPmode;
2021 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2022 emit_set_insn (cc_reg, t);
2023 return cc_reg;
2024 }
2025 }
2026
2027 return aarch64_gen_compare_reg (code, x, y);
2028 }
2029
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2031
2032 static GTY(()) rtx tls_get_addr_libfunc;
2033
2034 rtx
2035 aarch64_tls_get_addr (void)
2036 {
2037 if (!tls_get_addr_libfunc)
2038 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc;
2040 }
2041
2042 /* Return the TLS model to use for ADDR. */
2043
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr)
2046 {
2047 enum tls_model tls_kind = TLS_MODEL_NONE;
2048 if (GET_CODE (addr) == CONST)
2049 {
2050 poly_int64 addend;
2051 rtx sym = strip_offset (addr, &addend);
2052 if (GET_CODE (sym) == SYMBOL_REF)
2053 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2054 }
2055 else if (GET_CODE (addr) == SYMBOL_REF)
2056 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2057
2058 return tls_kind;
2059 }
2060
2061 /* We allow LO_SUMs in our legitimate addresses so that combine
2062 can take care of combining addresses where necessary, but for
2063 generation purposes we generate the address as follows:
2064
2065 RTL Absolute
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2068 nop
2069
2070 PIC TLS
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2073 bl __tls_get_addr
2074 nop
2075
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2077
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2081 bl __tls_get_addr
2082
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2087 blr tmp
2088 mrs tp, tpidr_el0
2089 add dest, dest, tp
2090
2091 Initial Exec:
2092 mrs tp, tpidr_el0
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2095 add dest, dest, tp
2096
2097 Local Exec:
2098 mrs tp, tpidr_el0
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2101 */
2102
2103 static void
2104 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2105 enum aarch64_symbol_type type)
2106 {
2107 switch (type)
2108 {
2109 case SYMBOL_SMALL_ABSOLUTE:
2110 {
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2112 rtx tmp_reg = dest;
2113 machine_mode mode = GET_MODE (dest);
2114
2115 gcc_assert (mode == Pmode || mode == ptr_mode);
2116
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2119
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2122 return;
2123 }
2124
2125 case SYMBOL_TINY_ABSOLUTE:
2126 emit_insn (gen_rtx_SET (dest, imm));
2127 return;
2128
2129 case SYMBOL_SMALL_GOT_28K:
2130 {
2131 machine_mode mode = GET_MODE (dest);
2132 rtx gp_rtx = pic_offset_table_rtx;
2133 rtx insn;
2134 rtx mem;
2135
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2138 decide rtx costs, in which case pic_offset_table_rtx is not
2139 initialized. In that case there is no need to generate the first adrp
2140 instruction, as the final cost of a global variable access is
2141 one instruction. */
2142 if (gp_rtx != NULL)
2143 {
2144 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2145 use the page base as the GOT base, the first page may be wasted;
2146 in the worst case there is only 28K of space for the GOT).
2147
2148 The generated instruction sequence for accessing a global variable
2149 is:
2150
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2152
2153 Only one instruction is needed. But we must initialize
2154 pic_offset_table_rtx properly. We generate an initialization insn for
2155 every global access, and allow CSE to remove all redundant ones.
2156
2157 The final instruction sequence will look like the following
2158 for multiple global variable accesses.
2159
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2161
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2165 ... */
2166
2167 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2168 crtl->uses_pic_offset_table = 1;
2169 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2170
2171 if (mode != GET_MODE (gp_rtx))
2172 gp_rtx = gen_lowpart (mode, gp_rtx);
2173
2174 }
2175
2176 if (mode == ptr_mode)
2177 {
2178 if (mode == DImode)
2179 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2180 else
2181 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2182
2183 mem = XVECEXP (SET_SRC (insn), 0, 0);
2184 }
2185 else
2186 {
2187 gcc_assert (mode == Pmode);
2188
2189 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2190 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2191 }
2192
2193 /* The operand is expected to be a MEM. Whenever the related insn
2194 pattern changes, the code above that calculates MEM should be
2195 updated. */
2196 gcc_assert (GET_CODE (mem) == MEM);
2197 MEM_READONLY_P (mem) = 1;
2198 MEM_NOTRAP_P (mem) = 1;
2199 emit_insn (insn);
2200 return;
2201 }
2202
2203 case SYMBOL_SMALL_GOT_4G:
2204 {
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2212
2213 rtx insn;
2214 rtx mem;
2215 rtx tmp_reg = dest;
2216 machine_mode mode = GET_MODE (dest);
2217
2218 if (can_create_pseudo_p ())
2219 tmp_reg = gen_reg_rtx (mode);
2220
2221 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2222 if (mode == ptr_mode)
2223 {
2224 if (mode == DImode)
2225 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2226 else
2227 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2228
2229 mem = XVECEXP (SET_SRC (insn), 0, 0);
2230 }
2231 else
2232 {
2233 gcc_assert (mode == Pmode);
2234
2235 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2236 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2237 }
2238
2239 gcc_assert (GET_CODE (mem) == MEM);
2240 MEM_READONLY_P (mem) = 1;
2241 MEM_NOTRAP_P (mem) = 1;
2242 emit_insn (insn);
2243 return;
2244 }
2245
2246 case SYMBOL_SMALL_TLSGD:
2247 {
2248 rtx_insn *insns;
2249 machine_mode mode = GET_MODE (dest);
2250 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2251
2252 start_sequence ();
2253 if (TARGET_ILP32)
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2255 else
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2257 insns = get_insns ();
2258 end_sequence ();
2259
2260 RTL_CONST_CALL_P (insns) = 1;
2261 emit_libcall_block (insns, dest, result, imm);
2262 return;
2263 }
2264
2265 case SYMBOL_SMALL_TLSDESC:
2266 {
2267 machine_mode mode = GET_MODE (dest);
2268 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2269 rtx tp;
2270
2271 gcc_assert (mode == Pmode || mode == ptr_mode);
2272
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2275 if (TARGET_ILP32)
2276 emit_insn (gen_tlsdesc_small_si (imm));
2277 else
2278 emit_insn (gen_tlsdesc_small_di (imm));
2279 tp = aarch64_load_tp (NULL);
2280
2281 if (mode != Pmode)
2282 tp = gen_lowpart (mode, tp);
2283
2284 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2285 if (REG_P (dest))
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2287 return;
2288 }
2289
2290 case SYMBOL_SMALL_TLSIE:
2291 {
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode = GET_MODE (dest);
2300 rtx tmp_reg = gen_reg_rtx (mode);
2301 rtx tp = aarch64_load_tp (NULL);
2302
2303 if (mode == ptr_mode)
2304 {
2305 if (mode == DImode)
2306 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2307 else
2308 {
2309 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2310 tp = gen_lowpart (mode, tp);
2311 }
2312 }
2313 else
2314 {
2315 gcc_assert (mode == Pmode);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2317 }
2318
2319 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2320 if (REG_P (dest))
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2322 return;
2323 }
2324
2325 case SYMBOL_TLSLE12:
2326 case SYMBOL_TLSLE24:
2327 case SYMBOL_TLSLE32:
2328 case SYMBOL_TLSLE48:
2329 {
2330 machine_mode mode = GET_MODE (dest);
2331 rtx tp = aarch64_load_tp (NULL);
2332
2333 if (mode != Pmode)
2334 tp = gen_lowpart (mode, tp);
2335
2336 switch (type)
2337 {
2338 case SYMBOL_TLSLE12:
2339 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2340 (dest, tp, imm));
2341 break;
2342 case SYMBOL_TLSLE24:
2343 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2344 (dest, tp, imm));
2345 break;
2346 case SYMBOL_TLSLE32:
2347 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2348 (dest, imm));
2349 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2350 (dest, dest, tp));
2351 break;
2352 case SYMBOL_TLSLE48:
2353 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2354 (dest, imm));
2355 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2356 (dest, dest, tp));
2357 break;
2358 default:
2359 gcc_unreachable ();
2360 }
2361
2362 if (REG_P (dest))
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2364 return;
2365 }
2366
2367 case SYMBOL_TINY_GOT:
2368 emit_insn (gen_ldr_got_tiny (dest, imm));
2369 return;
2370
2371 case SYMBOL_TINY_TLSIE:
2372 {
2373 machine_mode mode = GET_MODE (dest);
2374 rtx tp = aarch64_load_tp (NULL);
2375
2376 if (mode == ptr_mode)
2377 {
2378 if (mode == DImode)
2379 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2380 else
2381 {
2382 tp = gen_lowpart (mode, tp);
2383 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2384 }
2385 }
2386 else
2387 {
2388 gcc_assert (mode == Pmode);
2389 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2390 }
2391
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2395 }
2396
2397 default:
2398 gcc_unreachable ();
2399 }
2400 }
2401
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2407 static rtx
2408 aarch64_emit_move (rtx dest, rtx src)
2409 {
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest, src)
2412 : emit_move_insn_1 (dest, src));
2413 }
2414
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2416
2417 static void
2418 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2419 {
2420 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2421 if (dest != tmp)
2422 emit_move_insn (dest, tmp);
2423 }
2424
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2429 {
2430 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2431 OPTAB_DIRECT);
2432 if (dest != tmp)
2433 emit_move_insn (dest, tmp);
2434 }
2435
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2441 effects. */
2442 void
2443 aarch64_split_128bit_move (rtx dst, rtx src)
2444 {
2445 rtx dst_lo, dst_hi;
2446 rtx src_lo, src_hi;
2447
2448 machine_mode mode = GET_MODE (dst);
2449
2450 gcc_assert (mode == TImode || mode == TFmode);
2451 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2452 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2453
2454 if (REG_P (dst) && REG_P (src))
2455 {
2456 int src_regno = REGNO (src);
2457 int dst_regno = REGNO (dst);
2458
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2461 {
2462 src_lo = gen_lowpart (word_mode, src);
2463 src_hi = gen_highpart (word_mode, src);
2464
2465 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2466 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2467 return;
2468 }
2469 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2470 {
2471 dst_lo = gen_lowpart (word_mode, dst);
2472 dst_hi = gen_highpart (word_mode, dst);
2473
2474 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2475 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2476 return;
2477 }
2478 }
2479
2480 dst_lo = gen_lowpart (word_mode, dst);
2481 dst_hi = gen_highpart (word_mode, dst);
2482 src_lo = gen_lowpart (word_mode, src);
2483 src_hi = gen_highpart_mode (word_mode, mode, src);
2484
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2487 {
2488 aarch64_emit_move (dst_hi, src_hi);
2489 aarch64_emit_move (dst_lo, src_lo);
2490 }
2491 else
2492 {
2493 aarch64_emit_move (dst_lo, src_lo);
2494 aarch64_emit_move (dst_hi, src_hi);
2495 }
2496 }
2497
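/* For example, when a TImode value is copied from the x0/x1 pair to the
   x1/x2 pair, the low half of the destination (x1) overlaps the high
   half of the source, so the function moves the high word first
   (x2 = x1) and the low word second (x1 = x0); the opposite order would
   overwrite x1 before it had been read.  */
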
2498 bool
2499 aarch64_split_128bit_move_p (rtx dst, rtx src)
2500 {
2501 return (! REG_P (src)
2502 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2503 }
2504
2505 /* Split a complex SIMD combine. */
2506
2507 void
2508 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2509 {
2510 machine_mode src_mode = GET_MODE (src1);
2511 machine_mode dst_mode = GET_MODE (dst);
2512
2513 gcc_assert (VECTOR_MODE_P (dst_mode));
2514 gcc_assert (register_operand (dst, dst_mode)
2515 && register_operand (src1, src_mode)
2516 && register_operand (src2, src_mode));
2517
2518 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2519 return;
2520 }
2521
2522 /* Split a complex SIMD move. */
2523
2524 void
2525 aarch64_split_simd_move (rtx dst, rtx src)
2526 {
2527 machine_mode src_mode = GET_MODE (src);
2528 machine_mode dst_mode = GET_MODE (dst);
2529
2530 gcc_assert (VECTOR_MODE_P (dst_mode));
2531
2532 if (REG_P (dst) && REG_P (src))
2533 {
2534 gcc_assert (VECTOR_MODE_P (src_mode));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2536 }
2537 }
2538
2539 bool
2540 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2541 machine_mode ymode, rtx y)
2542 {
2543 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2544 gcc_assert (r != NULL);
2545 return rtx_equal_p (x, r);
2546 }
2547
2548
2549 static rtx
2550 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2551 {
2552 if (can_create_pseudo_p ())
2553 return force_reg (mode, value);
2554 else
2555 {
2556 gcc_assert (x);
2557 aarch64_emit_move (x, value);
2558 return x;
2559 }
2560 }
2561
2562 /* Return true if predicate value X is a constant in which every element
2563 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2564 value, i.e. as a predicate in which all bits are significant. */
2565
2566 static bool
2567 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2568 {
2569 if (GET_CODE (x) != CONST_VECTOR)
2570 return false;
2571
2572 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2573 GET_MODE_NUNITS (GET_MODE (x)));
2574 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2575 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2576 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2577
2578 unsigned int nelts = const_vector_encoded_nelts (x);
2579 for (unsigned int i = 0; i < nelts; ++i)
2580 {
2581 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2582 if (!CONST_INT_P (elt))
2583 return false;
2584
2585 builder.quick_push (elt);
2586 for (unsigned int j = 1; j < factor; ++j)
2587 builder.quick_push (const0_rtx);
2588 }
2589 builder.finalize ();
2590 return true;
2591 }
2592
2593 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2594 widest predicate element size it can have (that is, the largest size
2595 for which each element would still be 0 or 1). */
2596
2597 unsigned int
2598 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2599 {
2600 /* Start with the most optimistic assumption: that we only need
2601 one bit per pattern. This is what we will use if only the first
2602 bit in each pattern is ever set. */
2603 unsigned int mask = GET_MODE_SIZE (DImode);
2604 mask |= builder.npatterns ();
2605
2606 /* Look for set bits. */
2607 unsigned int nelts = builder.encoded_nelts ();
2608 for (unsigned int i = 1; i < nelts; ++i)
2609 if (INTVAL (builder.elt (i)) != 0)
2610 {
2611 if (i & 1)
2612 return 1;
2613 mask |= i;
2614 }
2615 return mask & -mask;
2616 }
2617
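/* For example, the VNx16BI image of an all-true predicate for .s
   elements has set bits only at indices 0, 4, 8, ..., so the function
   returns 4 and the constant can be interpreted with element sizes of
   up to four bytes.  If any odd-numbered bit is set, only a byte
   interpretation is possible and the result is 1.  */
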
2618 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2619 that the constant would have with predicate element size ELT_SIZE
2620 (ignoring the upper bits in each element) and return:
2621
2622 * -1 if all bits are set
2623 * N if the predicate has N leading set bits followed by all clear bits
2624 * 0 if the predicate does not have any of these forms. */
2625
2626 int
2627 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2628 unsigned int elt_size)
2629 {
2630 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2631 followed by set bits. */
2632 if (builder.nelts_per_pattern () == 3)
2633 return 0;
2634
2635 /* Skip over leading set bits. */
2636 unsigned int nelts = builder.encoded_nelts ();
2637 unsigned int i = 0;
2638 for (; i < nelts; i += elt_size)
2639 if (INTVAL (builder.elt (i)) == 0)
2640 break;
2641 unsigned int vl = i / elt_size;
2642
2643 /* Check for the all-true case. */
2644 if (i == nelts)
2645 return -1;
2646
2647 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2648 repeating pattern of set bits followed by clear bits. */
2649 if (builder.nelts_per_pattern () != 2)
2650 return 0;
2651
2652 /* We have a "foreground" value and a duplicated "background" value.
2653 If the background might repeat and the last set bit belongs to it,
2654 we might have set bits followed by clear bits followed by set bits. */
2655 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2656 return 0;
2657
2658 /* Make sure that the rest are all clear. */
2659 for (; i < nelts; i += elt_size)
2660 if (INTVAL (builder.elt (i)) != 0)
2661 return 0;
2662
2663 return vl;
2664 }
2665
2666 /* See if there is an svpattern that encodes an SVE predicate of mode
2667 PRED_MODE in which the first VL bits are set and the rest are clear.
2668 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2669 A VL of -1 indicates an all-true vector. */
2670
2671 aarch64_svpattern
2672 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2673 {
2674 if (vl < 0)
2675 return AARCH64_SV_ALL;
2676
2677 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2678 return AARCH64_NUM_SVPATTERNS;
2679
2680 if (vl >= 1 && vl <= 8)
2681 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2682
2683 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2684 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2685
2686 int max_vl;
2687 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2688 {
2689 if (vl == (max_vl / 3) * 3)
2690 return AARCH64_SV_MUL3;
2691 /* These would only trigger for non-power-of-2 lengths. */
2692 if (vl == (max_vl & -4))
2693 return AARCH64_SV_MUL4;
2694 if (vl == (1 << floor_log2 (max_vl)))
2695 return AARCH64_SV_POW2;
2696 if (vl == max_vl)
2697 return AARCH64_SV_ALL;
2698 }
2699 return AARCH64_NUM_SVPATTERNS;
2700 }
2701
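/* For example, for VNx16BImode a VL of 7 maps to AARCH64_SV_VL7 and a
   VL of 16 to AARCH64_SV_VL16.  A VL of 15 has no dedicated VLnn
   pattern and is only representable when the element count of PRED_MODE
   is a compile-time constant, e.g. as AARCH64_SV_MUL3 when the mode is
   known to hold exactly 16 elements.  */
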
2702 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2703 bits has the lowest bit set and the upper bits clear. This is the
2704 VNx16BImode equivalent of a PTRUE for controlling elements of
2705 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2706 all bits are significant, even the upper zeros. */
2707
2708 rtx
2709 aarch64_ptrue_all (unsigned int elt_size)
2710 {
2711 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2712 builder.quick_push (const1_rtx);
2713 for (unsigned int i = 1; i < elt_size; ++i)
2714 builder.quick_push (const0_rtx);
2715 return builder.build ();
2716 }
2717
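/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BI
   pattern { 1, 0, 0, 0, ... }: every .s element of a vector governed by
   this predicate is active, and every bit of the constant, including
   the upper zeros in each element, is significant.  */
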
2718 /* Return an all-true predicate register of mode MODE. */
2719
2720 rtx
2721 aarch64_ptrue_reg (machine_mode mode)
2722 {
2723 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2724 return force_reg (mode, CONSTM1_RTX (mode));
2725 }
2726
2727 /* Return an all-false predicate register of mode MODE. */
2728
2729 rtx
2730 aarch64_pfalse_reg (machine_mode mode)
2731 {
2732 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2733 return force_reg (mode, CONST0_RTX (mode));
2734 }
2735
2736 /* Return true if we can move VALUE into a register using a single
2737 CNT[BHWD] instruction. */
2738
2739 static bool
2740 aarch64_sve_cnt_immediate_p (poly_int64 value)
2741 {
2742 HOST_WIDE_INT factor = value.coeffs[0];
2743 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2744 return (value.coeffs[1] == factor
2745 && IN_RANGE (factor, 2, 16 * 16)
2746 && (factor & 1) == 0
2747 && factor <= 16 * (factor & -factor));
2748 }
2749
2750 /* Likewise for rtx X. */
2751
2752 bool
2753 aarch64_sve_cnt_immediate_p (rtx x)
2754 {
2755 poly_int64 value;
2756 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2757 }
2758
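/* For example, poly_int64 (8, 8) -- eight for each 128 bits of vector,
   i.e. the value produced by CNTH -- satisfies the checks above,
   whereas a plain constant 8 (coefficients 8 and 0) or a value with
   mismatched coefficients does not.  */
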
2759 /* Return the asm string for an instruction with a CNT-like vector size
2760 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2761 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2762 first part of the operands template (the part that comes before the
2763 vector size itself). FACTOR is the number of quadwords.
2764 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2765 If it is zero, we can use any element size. */
2766
2767 static char *
2768 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2769 unsigned int factor,
2770 unsigned int nelts_per_vq)
2771 {
2772 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2773
2774 if (nelts_per_vq == 0)
2775 /* There is some overlap in the ranges of the four CNT instructions.
2776 Here we always use the smallest possible element size, so that the
2777 multiplier is 1 wherever possible. */
2778 nelts_per_vq = factor & -factor;
2779 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2780 gcc_assert (IN_RANGE (shift, 1, 4));
2781 char suffix = "dwhb"[shift - 1];
2782
2783 factor >>= shift;
2784 unsigned int written;
2785 if (factor == 1)
2786 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2787 prefix, suffix, operands);
2788 else
2789 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2790 prefix, suffix, operands, factor);
2791 gcc_assert (written < sizeof (buffer));
2792 return buffer;
2793 }
2794
2795 /* Return the asm string for an instruction with a CNT-like vector size
2796 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2797 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2798 first part of the operands template (the part that comes before the
2799 vector size itself). X is the value of the vector size operand,
2800 as a polynomial integer rtx. */
2801
2802 char *
2803 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2804 rtx x)
2805 {
2806 poly_int64 value = rtx_to_poly_int64 (x);
2807 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2808 return aarch64_output_sve_cnt_immediate (prefix, operands,
2809 value.coeffs[1], 0);
2810 }
2811
2812 /* Return true if we can add VALUE to a register using a single ADDVL
2813 or ADDPL instruction. */
2814
2815 static bool
2816 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2817 {
2818 HOST_WIDE_INT factor = value.coeffs[0];
2819 if (factor == 0 || value.coeffs[1] != factor)
2820 return false;
2821 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2822 and a value of 16 is one vector width. */
2823 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2824 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2825 }
2826
2827 /* Likewise for rtx X. */
2828
2829 bool
2830 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2831 {
2832 poly_int64 value;
2833 return (poly_int_rtx_p (x, &value)
2834 && aarch64_sve_addvl_addpl_immediate_p (value));
2835 }
2836
2837 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2838 and storing the result in operand 0. */
2839
2840 char *
2841 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2842 {
2843 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2844 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2845 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2846
2847 /* Use INC or DEC if possible. */
2848 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2849 {
2850 if (aarch64_sve_cnt_immediate_p (offset_value))
2851 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2852 offset_value.coeffs[1], 0);
2853 if (aarch64_sve_cnt_immediate_p (-offset_value))
2854 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2855 -offset_value.coeffs[1], 0);
2856 }
2857
2858 int factor = offset_value.coeffs[1];
2859 if ((factor & 15) == 0)
2860 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2861 else
2862 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2863 return buffer;
2864 }
2865
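/* For example, assuming operands x0 and x1, an OFFSET of one full
   vector (poly_int64 (16, 16)) prints as "addvl x0, x1, #1", or as
   "incb x0" when DEST and BASE are both x0, while minus one predicate
   width (poly_int64 (-2, -2)) prints as "addpl x0, x1, #-1".  */
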
2866 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2867 instruction. If it is, store the number of elements in each vector
2868 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2869 factor in *FACTOR_OUT (if nonnull). */
2870
2871 bool
2872 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2873 unsigned int *nelts_per_vq_out)
2874 {
2875 rtx elt;
2876 poly_int64 value;
2877
2878 if (!const_vec_duplicate_p (x, &elt)
2879 || !poly_int_rtx_p (elt, &value))
2880 return false;
2881
2882 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2883 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2884 /* There's no vector INCB. */
2885 return false;
2886
2887 HOST_WIDE_INT factor = value.coeffs[0];
2888 if (value.coeffs[1] != factor)
2889 return false;
2890
2891 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2892 if ((factor % nelts_per_vq) != 0
2893 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2894 return false;
2895
2896 if (factor_out)
2897 *factor_out = factor;
2898 if (nelts_per_vq_out)
2899 *nelts_per_vq_out = nelts_per_vq;
2900 return true;
2901 }
2902
2903 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2904 instruction. */
2905
2906 bool
2907 aarch64_sve_inc_dec_immediate_p (rtx x)
2908 {
2909 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2910 }
2911
2912 /* Return the asm template for an SVE vector INC or DEC instruction.
2913 OPERANDS gives the operands before the vector count and X is the
2914 value of the vector count operand itself. */
2915
2916 char *
2917 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2918 {
2919 int factor;
2920 unsigned int nelts_per_vq;
2921 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2922 gcc_unreachable ();
2923 if (factor < 0)
2924 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2925 nelts_per_vq);
2926 else
2927 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2928 nelts_per_vq);
2929 }
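
/* For example, with OPERANDS == "%0", a VNx4SImode duplicate of
   poly_int64 (8, 8) -- twice the number of 32-bit elements -- produces
   the template "incw\t%0, all, mul #2", and a duplicate of
   poly_int64 (-4, -4) produces "decw\t%0".  */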
2930
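/* Set DEST to the integer constant IMM of mode MODE and return the
   number of instructions required to do so.  Only emit the instructions
   if GENERATE is true; otherwise just compute the cost.  */
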
2931 static int
2932 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2933 scalar_int_mode mode)
2934 {
2935 int i;
2936 unsigned HOST_WIDE_INT val, val2, mask;
2937 int one_match, zero_match;
2938 int num_insns;
2939
2940 val = INTVAL (imm);
2941
2942 if (aarch64_move_imm (val, mode))
2943 {
2944 if (generate)
2945 emit_insn (gen_rtx_SET (dest, imm));
2946 return 1;
2947 }
2948
2949 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2950 (with XXXX non-zero). In that case check to see if the move can be done in
2951 a smaller mode. */
2952 val2 = val & 0xffffffff;
2953 if (mode == DImode
2954 && aarch64_move_imm (val2, SImode)
2955 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2956 {
2957 if (generate)
2958 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2959
2960 /* Check whether we have to emit a second instruction by seeing
2961 whether any of the upper 32 bits of the original DImode value are set. */
2962 if (val == val2)
2963 return 1;
2964
2965 i = (val >> 48) ? 48 : 32;
2966
2967 if (generate)
2968 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2969 GEN_INT ((val >> i) & 0xffff)));
2970
2971 return 2;
2972 }
2973
2974 if ((val >> 32) == 0 || mode == SImode)
2975 {
2976 if (generate)
2977 {
2978 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2979 if (mode == SImode)
2980 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2981 GEN_INT ((val >> 16) & 0xffff)));
2982 else
2983 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2984 GEN_INT ((val >> 16) & 0xffff)));
2985 }
2986 return 2;
2987 }
2988
2989 /* Remaining cases are all for DImode. */
2990
2991 mask = 0xffff;
2992 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2993 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2994 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2995 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2996
2997 if (zero_match != 2 && one_match != 2)
2998 {
2999 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3000 For a 64-bit bitmask try whether changing 16 bits to all ones or
3001 zeroes creates a valid bitmask. To check any repeated bitmask,
3002 try using 16 bits from the other 32-bit half of val. */
3003
3004 for (i = 0; i < 64; i += 16, mask <<= 16)
3005 {
3006 val2 = val & ~mask;
3007 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3008 break;
3009 val2 = val | mask;
3010 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3011 break;
3012 val2 = val2 & ~mask;
3013 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3014 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3015 break;
3016 }
3017 if (i != 64)
3018 {
3019 if (generate)
3020 {
3021 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3022 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3023 GEN_INT ((val >> i) & 0xffff)));
3024 }
3025 return 2;
3026 }
3027 }
3028
3029 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3030 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3031 otherwise skip zero bits. */
3032
3033 num_insns = 1;
3034 mask = 0xffff;
3035 val2 = one_match > zero_match ? ~val : val;
3036 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3037
3038 if (generate)
3039 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3040 ? (val | ~(mask << i))
3041 : (val & (mask << i)))));
3042 for (i += 16; i < 64; i += 16)
3043 {
3044 if ((val2 & (mask << i)) == 0)
3045 continue;
3046 if (generate)
3047 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3048 GEN_INT ((val >> i) & 0xffff)));
3049 num_insns ++;
3050 }
3051
3052 return num_insns;
3053 }
3054
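/* For example, moving 0x1234000000005678 into a DImode register (say x0)
   takes the 32-bit path above and costs two instructions, roughly:

     mov  x0, 0x5678
     movk x0, 0x1234, lsl 48

   because the upper half has only one non-zero 16-bit chunk.  A value
   whose four 16-bit chunks are all different and unrelated generally
   falls through to the final loop and needs a MOV plus three MOVKs.  */
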
3055 /* Return whether imm is a 128-bit immediate which is simple enough to
3056 expand inline. */
3057 bool
3058 aarch64_mov128_immediate (rtx imm)
3059 {
3060 if (GET_CODE (imm) == CONST_INT)
3061 return true;
3062
3063 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3064
3065 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3066 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3067
3068 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3069 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3070 }
3071
3072
3073 /* Return the number of temporary registers that aarch64_add_offset_1
3074 would need to add OFFSET to a register. */
3075
3076 static unsigned int
3077 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3078 {
3079 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3080 }
3081
3082 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3083 a non-polynomial OFFSET. MODE is the mode of the addition.
3084 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3085 be set and CFA adjustments added to the generated instructions.
3086
3087 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3088 temporary if register allocation is already complete. This temporary
3089 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3090 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3091 the immediate again.
3092
3093 Since this function may be used to adjust the stack pointer, we must
3094 ensure that it cannot cause transient stack deallocation (for example
3095 by first incrementing SP and then decrementing when adjusting by a
3096 large immediate). */
3097
3098 static void
3099 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3100 rtx src, HOST_WIDE_INT offset, rtx temp1,
3101 bool frame_related_p, bool emit_move_imm)
3102 {
3103 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3104 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3105
3106 HOST_WIDE_INT moffset = abs_hwi (offset);
3107 rtx_insn *insn;
3108
3109 if (!moffset)
3110 {
3111 if (!rtx_equal_p (dest, src))
3112 {
3113 insn = emit_insn (gen_rtx_SET (dest, src));
3114 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3115 }
3116 return;
3117 }
3118
3119 /* Single instruction adjustment. */
3120 if (aarch64_uimm12_shift (moffset))
3121 {
3122 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3123 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3124 return;
3125 }
3126
3127 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3128 and either:
3129
3130 a) the offset cannot be loaded by a 16-bit move or
3131 b) there is no spare register into which we can move it. */
3132 if (moffset < 0x1000000
3133 && ((!temp1 && !can_create_pseudo_p ())
3134 || !aarch64_move_imm (moffset, mode)))
3135 {
3136 HOST_WIDE_INT low_off = moffset & 0xfff;
3137
3138 low_off = offset < 0 ? -low_off : low_off;
3139 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3140 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3141 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3142 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3143 return;
3144 }
3145
3146 /* Emit a move immediate if required and an addition/subtraction. */
3147 if (emit_move_imm)
3148 {
3149 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3150 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3151 }
3152 insn = emit_insn (offset < 0
3153 ? gen_sub3_insn (dest, src, temp1)
3154 : gen_add3_insn (dest, src, temp1));
3155 if (frame_related_p)
3156 {
3157 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3158 rtx adj = plus_constant (mode, src, offset);
3159 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3160 }
3161 }
3162
3163 /* Return the number of temporary registers that aarch64_add_offset
3164 would need to move OFFSET into a register or add OFFSET to a register;
3165 ADD_P is true if we want the latter rather than the former. */
3166
3167 static unsigned int
3168 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3169 {
3170 /* This follows the same structure as aarch64_add_offset. */
3171 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3172 return 0;
3173
3174 unsigned int count = 0;
3175 HOST_WIDE_INT factor = offset.coeffs[1];
3176 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3177 poly_int64 poly_offset (factor, factor);
3178 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3179 /* Need one register for the ADDVL/ADDPL result. */
3180 count += 1;
3181 else if (factor != 0)
3182 {
3183 factor = abs (factor);
3184 if (factor > 16 * (factor & -factor))
3185 /* Need one register for the CNT result and one for the multiplication
3186 factor. If necessary, the second temporary can be reused for the
3187 constant part of the offset. */
3188 return 2;
3189 /* Need one register for the CNT result (which might then
3190 be shifted). */
3191 count += 1;
3192 }
3193 return count + aarch64_add_offset_1_temporaries (constant);
3194 }
3195
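/* For example, adding one full vector, poly_int64 (16, 16), needs no
   temporaries because a single ADDVL can do it, whereas one vector plus
   100 bytes, poly_int64 (116, 16), needs one temporary to hold the
   ADDVL result before the constant 100 is added.  */
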
3196 /* If X can be represented as a poly_int64, return the number
3197 of temporaries that are required to add it to a register.
3198 Return -1 otherwise. */
3199
3200 int
3201 aarch64_add_offset_temporaries (rtx x)
3202 {
3203 poly_int64 offset;
3204 if (!poly_int_rtx_p (x, &offset))
3205 return -1;
3206 return aarch64_offset_temporaries (true, offset);
3207 }
3208
3209 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3210 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3211 be set and CFA adjustments added to the generated instructions.
3212
3213 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3214 temporary if register allocation is already complete. This temporary
3215 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3216 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3217 false to avoid emitting the immediate again.
3218
3219 TEMP2, if nonnull, is a second temporary register that doesn't
3220 overlap either DEST or REG.
3221
3222 Since this function may be used to adjust the stack pointer, we must
3223 ensure that it cannot cause transient stack deallocation (for example
3224 by first incrementing SP and then decrementing when adjusting by a
3225 large immediate). */
3226
3227 static void
3228 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3229 poly_int64 offset, rtx temp1, rtx temp2,
3230 bool frame_related_p, bool emit_move_imm = true)
3231 {
3232 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3233 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3234 gcc_assert (temp1 == NULL_RTX
3235 || !frame_related_p
3236 || !reg_overlap_mentioned_p (temp1, dest));
3237 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3238
3239 /* Try using ADDVL or ADDPL to add the whole value. */
3240 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3241 {
3242 rtx offset_rtx = gen_int_mode (offset, mode);
3243 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3244 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3245 return;
3246 }
3247
3248 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3249 SVE vector register, over and above the minimum size of 128 bits.
3250 This is equivalent to half the value returned by CNTD with a
3251 vector shape of ALL. */
3252 HOST_WIDE_INT factor = offset.coeffs[1];
3253 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3254
3255 /* Try using ADDVL or ADDPL to add the VG-based part. */
3256 poly_int64 poly_offset (factor, factor);
3257 if (src != const0_rtx
3258 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3259 {
3260 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3261 if (frame_related_p)
3262 {
3263 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3264 RTX_FRAME_RELATED_P (insn) = true;
3265 src = dest;
3266 }
3267 else
3268 {
3269 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3270 src = aarch64_force_temporary (mode, temp1, addr);
3271 temp1 = temp2;
3272 temp2 = NULL_RTX;
3273 }
3274 }
3275 /* Otherwise use a CNT-based sequence. */
3276 else if (factor != 0)
3277 {
3278 /* Use a subtraction if we have a negative factor. */
3279 rtx_code code = PLUS;
3280 if (factor < 0)
3281 {
3282 factor = -factor;
3283 code = MINUS;
3284 }
3285
3286 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3287 into the multiplication. */
3288 rtx val;
3289 int shift = 0;
3290 if (factor & 1)
3291 /* Use a right shift by 1. */
3292 shift = -1;
3293 else
3294 factor /= 2;
3295 HOST_WIDE_INT low_bit = factor & -factor;
3296 if (factor <= 16 * low_bit)
3297 {
3298 if (factor > 16 * 8)
3299 {
3300 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3301 the value with the minimum multiplier and shift it into
3302 position. */
3303 int extra_shift = exact_log2 (low_bit);
3304 shift += extra_shift;
3305 factor >>= extra_shift;
3306 }
3307 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3308 }
3309 else
3310 {
3311 /* Use CNTD, then multiply it by FACTOR. */
3312 val = gen_int_mode (poly_int64 (2, 2), mode);
3313 val = aarch64_force_temporary (mode, temp1, val);
3314
3315 /* Go back to using a negative multiplication factor if we have
3316 no register from which to subtract. */
3317 if (code == MINUS && src == const0_rtx)
3318 {
3319 factor = -factor;
3320 code = PLUS;
3321 }
3322 rtx coeff1 = gen_int_mode (factor, mode);
3323 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3324 val = gen_rtx_MULT (mode, val, coeff1);
3325 }
3326
3327 if (shift > 0)
3328 {
3329 /* Multiply by 1 << SHIFT. */
3330 val = aarch64_force_temporary (mode, temp1, val);
3331 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3332 }
3333 else if (shift == -1)
3334 {
3335 /* Divide by 2. */
3336 val = aarch64_force_temporary (mode, temp1, val);
3337 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3338 }
3339
3340 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3341 if (src != const0_rtx)
3342 {
3343 val = aarch64_force_temporary (mode, temp1, val);
3344 val = gen_rtx_fmt_ee (code, mode, src, val);
3345 }
3346 else if (code == MINUS)
3347 {
3348 val = aarch64_force_temporary (mode, temp1, val);
3349 val = gen_rtx_NEG (mode, val);
3350 }
3351
3352 if (constant == 0 || frame_related_p)
3353 {
3354 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3355 if (frame_related_p)
3356 {
3357 RTX_FRAME_RELATED_P (insn) = true;
3358 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3359 gen_rtx_SET (dest, plus_constant (Pmode, src,
3360 poly_offset)));
3361 }
3362 src = dest;
3363 if (constant == 0)
3364 return;
3365 }
3366 else
3367 {
3368 src = aarch64_force_temporary (mode, temp1, val);
3369 temp1 = temp2;
3370 temp2 = NULL_RTX;
3371 }
3372
3373 emit_move_imm = true;
3374 }
3375
3376 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3377 frame_related_p, emit_move_imm);
3378 }
3379
3380 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3381 than a poly_int64. */
3382
3383 void
3384 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3385 rtx offset_rtx, rtx temp1, rtx temp2)
3386 {
3387 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3388 temp1, temp2, false);
3389 }
3390
3391 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3392 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3393 if TEMP1 already contains abs (DELTA). */
3394
3395 static inline void
3396 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3397 {
3398 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3399 temp1, temp2, true, emit_move_imm);
3400 }
3401
3402 /* Subtract DELTA from the stack pointer, marking the instructions
3403 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3404 if nonnull. */
3405
3406 static inline void
3407 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3408 bool emit_move_imm = true)
3409 {
3410 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3411 temp1, temp2, frame_related_p, emit_move_imm);
3412 }
3413
3414 /* Set DEST to (vec_series BASE STEP). */
3415
3416 static void
3417 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3418 {
3419 machine_mode mode = GET_MODE (dest);
3420 scalar_mode inner = GET_MODE_INNER (mode);
3421
3422 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3423 if (!aarch64_sve_index_immediate_p (base))
3424 base = force_reg (inner, base);
3425 if (!aarch64_sve_index_immediate_p (step))
3426 step = force_reg (inner, step);
3427
3428 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3429 }
3430
3431 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3432 register of mode MODE. Use TARGET for the result if it's nonnull
3433 and convenient.
3434
3435 The two vector modes must have the same element mode. The behavior
3436 is to duplicate architectural lane N of SRC into architectural lanes
3437 N + I * STEP of the result. On big-endian targets, architectural
3438 lane 0 of an Advanced SIMD vector is the last element of the vector
3439 in memory layout, so for big-endian targets this operation has the
3440 effect of reversing SRC before duplicating it. Callers need to
3441 account for this. */
3442
3443 rtx
3444 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3445 {
3446 machine_mode src_mode = GET_MODE (src);
3447 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3448 insn_code icode = (BYTES_BIG_ENDIAN
3449 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3450 : code_for_aarch64_vec_duplicate_vq_le (mode));
3451
3452 unsigned int i = 0;
3453 expand_operand ops[3];
3454 create_output_operand (&ops[i++], target, mode);
3455 create_output_operand (&ops[i++], src, src_mode);
3456 if (BYTES_BIG_ENDIAN)
3457 {
3458 /* Create a PARALLEL describing the reversal of SRC. */
3459 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3460 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3461 nelts_per_vq - 1, -1);
3462 create_fixed_operand (&ops[i++], sel);
3463 }
3464 expand_insn (icode, i, ops);
3465 return ops[0].value;
3466 }
3467
3468 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3469 the memory image into DEST. Return true on success. */
3470
3471 static bool
3472 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3473 {
3474 src = force_const_mem (GET_MODE (src), src);
3475 if (!src)
3476 return false;
3477
3478 /* Make sure that the address is legitimate. */
3479 if (!aarch64_sve_ld1rq_operand_p (src))
3480 {
3481 rtx addr = force_reg (Pmode, XEXP (src, 0));
3482 src = replace_equiv_address (src, addr);
3483 }
3484
3485 machine_mode mode = GET_MODE (dest);
3486 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3487 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3488 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3489 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3490 return true;
3491 }
3492
3493 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3494 SVE data mode and isn't a legitimate constant. Use TARGET for the
3495 result if convenient.
3496
3497 The returned register can have whatever mode seems most natural
3498 given the contents of SRC. */
3499
3500 static rtx
3501 aarch64_expand_sve_const_vector (rtx target, rtx src)
3502 {
3503 machine_mode mode = GET_MODE (src);
3504 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3505 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3506 scalar_mode elt_mode = GET_MODE_INNER (mode);
3507 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3508 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3509
3510 if (nelts_per_pattern == 1 && encoded_bits == 128)
3511 {
3512 /* The constant is a duplicated quadword but can't be narrowed
3513 beyond a quadword. Get the memory image of the first quadword
3514 as a 128-bit vector and try using LD1RQ to load it from memory.
3515
3516 The effect for both endiannesses is to load memory lane N into
3517 architectural lanes N + I * STEP of the result. On big-endian
3518 targets, the layout of the 128-bit vector in an Advanced SIMD
3519 register would be different from its layout in an SVE register,
3520 but this 128-bit vector is a memory value only. */
3521 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3522 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3523 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3524 return target;
3525 }
3526
3527 if (nelts_per_pattern == 1 && encoded_bits < 128)
3528 {
3529 /* The vector is a repeating sequence of 64 bits or fewer.
3530 See if we can load them using an Advanced SIMD move and then
3531 duplicate it to fill a vector. This is better than using a GPR
3532 move because it keeps everything in the same register file. */
3533 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3534 rtx_vector_builder builder (vq_mode, npatterns, 1);
3535 for (unsigned int i = 0; i < npatterns; ++i)
3536 {
3537 /* We want memory lane N to go into architectural lane N,
3538 so reverse for big-endian targets. The DUP .Q pattern
3539 has a compensating reverse built-in. */
3540 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3541 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3542 }
3543 rtx vq_src = builder.build ();
3544 if (aarch64_simd_valid_immediate (vq_src, NULL))
3545 {
3546 vq_src = force_reg (vq_mode, vq_src);
3547 return aarch64_expand_sve_dupq (target, mode, vq_src);
3548 }
3549
3550 /* Get an integer representation of the repeating part of Advanced
3551 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3552 which for big-endian targets is lane-swapped wrt a normal
3553 Advanced SIMD vector. This means that for both endiannesses,
3554 memory lane N of SVE vector SRC corresponds to architectural
3555 lane N of a register holding VQ_SRC. This in turn means that
3556 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3557 as a single 128-bit value) and thus that memory lane 0 of SRC is
3558 in the lsb of the integer. Duplicating the integer therefore
3559 ensures that memory lane N of SRC goes into architectural lane
3560 N + I * INDEX of the SVE register. */
3561 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3562 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3563 if (elt_value)
3564 {
3565 /* Pretend that we had a vector of INT_MODE to start with. */
3566 elt_mode = int_mode;
3567 mode = aarch64_full_sve_mode (int_mode).require ();
3568
3569 /* If the integer can be moved into a general register by a
3570 single instruction, do that and duplicate the result. */
3571 if (CONST_INT_P (elt_value)
3572 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3573 {
3574 elt_value = force_reg (elt_mode, elt_value);
3575 return expand_vector_broadcast (mode, elt_value);
3576 }
3577 }
3578 else if (npatterns == 1)
3579 /* We're duplicating a single value, but can't do better than
3580 force it to memory and load from there. This handles things
3581 like symbolic constants. */
3582 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3583
3584 if (elt_value)
3585 {
3586 /* Load the element from memory if we can, otherwise move it into
3587 a register and use a DUP. */
3588 rtx op = force_const_mem (elt_mode, elt_value);
3589 if (!op)
3590 op = force_reg (elt_mode, elt_value);
3591 return expand_vector_broadcast (mode, op);
3592 }
3593 }
3594
3595 /* Try using INDEX. */
3596 rtx base, step;
3597 if (const_vec_series_p (src, &base, &step))
3598 {
3599 aarch64_expand_vec_series (target, base, step);
3600 return target;
3601 }
3602
3603 /* From here on, it's better to force the whole constant to memory
3604 if we can. */
3605 if (GET_MODE_NUNITS (mode).is_constant ())
3606 return NULL_RTX;
3607
3608 /* Expand each pattern individually. */
3609 gcc_assert (npatterns > 1);
3610 rtx_vector_builder builder;
3611 auto_vec<rtx, 16> vectors (npatterns);
3612 for (unsigned int i = 0; i < npatterns; ++i)
3613 {
3614 builder.new_vector (mode, 1, nelts_per_pattern);
3615 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3616 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3617 vectors.quick_push (force_reg (mode, builder.build ()));
3618 }
3619
3620 /* Use permutes to interleave the separate vectors. */
3621 while (npatterns > 1)
3622 {
3623 npatterns /= 2;
3624 for (unsigned int i = 0; i < npatterns; ++i)
3625 {
3626 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3627 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3628 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3629 vectors[i] = tmp;
3630 }
3631 }
3632 gcc_assert (vectors[0] == target);
3633 return target;
3634 }
3635
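/* Illustrative sketch (editorial, not part of GCC): a scalar model of the
   permute step in aarch64_expand_sve_const_vector above.  ZIP1 of two
   N-lane vectors interleaves their low halves; repeating that while
   halving NPATTERNS, exactly as the loop above does, turns NPATTERNS
   per-pattern vectors back into the interleaved constant, so that output
   lane L comes from pattern L % NPATTERNS.  Assumes N <= 16 and that each
   input vector repeats its pattern to fill all N lanes (the
   nelts_per_pattern <= 2 case).  */

static void
sketch_zip1 (const int *a, const int *b, int *out, unsigned int n)
{
  /* Model of SVE ZIP1: lanes a[0], b[0], a[1], b[1], ... taken from the
     low halves of A and B.  */
  for (unsigned int i = 0; i < n / 2; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
sketch_interleave (int vecs[][16], unsigned int npatterns, unsigned int n)
{
  int tmp[16];
  while (npatterns > 1)
    {
      npatterns /= 2;
      for (unsigned int i = 0; i < npatterns; ++i)
	{
	  /* Zip vector I with vector I + NPATTERNS, as the loop above
	     zips VECTORS[i] with VECTORS[i + npatterns].  */
	  sketch_zip1 (vecs[i], vecs[i + npatterns], tmp, n);
	  for (unsigned int l = 0; l < n; ++l)
	    vecs[i][l] = tmp[l];
	}
    }
}
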
3636 /* Use WHILE to set predicate register DEST so that the first VL bits
3637 are set and the rest are clear. */
3638
3639 static void
3640 aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
3641 {
3642 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3643 emit_insn (gen_while_ult (DImode, GET_MODE (dest),
3644 dest, const0_rtx, limit));
3645 }
3646
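/* Illustrative sketch (editorial, not part of GCC): the effect of the
   WHILELO emitted above on an NLANES-wide predicate.  Lanes 0 .. VL-1
   become true and the rest false, as if each lane index were compared
   against VL.  */

static void
sketch_pred_via_while (unsigned char *pred, unsigned int nlanes,
		       unsigned int vl)
{
  for (unsigned int i = 0; i < nlanes; ++i)
    pred[i] = (i < vl);
}
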
3647 /* Set DEST to immediate IMM. */
3648
3649 void
3650 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3651 {
3652 machine_mode mode = GET_MODE (dest);
3653
3654 /* Check on what type of symbol it is. */
3655 scalar_int_mode int_mode;
3656 if ((GET_CODE (imm) == SYMBOL_REF
3657 || GET_CODE (imm) == LABEL_REF
3658 || GET_CODE (imm) == CONST
3659 || GET_CODE (imm) == CONST_POLY_INT)
3660 && is_a <scalar_int_mode> (mode, &int_mode))
3661 {
3662 rtx mem;
3663 poly_int64 offset;
3664 HOST_WIDE_INT const_offset;
3665 enum aarch64_symbol_type sty;
3666
3667 /* If we have (const (plus symbol offset)), separate out the offset
3668 before we start classifying the symbol. */
3669 rtx base = strip_offset (imm, &offset);
3670
3671 /* We must always add an offset involving VL separately, rather than
3672 folding it into the relocation. */
3673 if (!offset.is_constant (&const_offset))
3674 {
3675 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3676 emit_insn (gen_rtx_SET (dest, imm));
3677 else
3678 {
3679 /* Do arithmetic on 32-bit values if the result is smaller
3680 than that. */
3681 if (partial_subreg_p (int_mode, SImode))
3682 {
3683 /* It is invalid to do symbol calculations in modes
3684 narrower than SImode. */
3685 gcc_assert (base == const0_rtx);
3686 dest = gen_lowpart (SImode, dest);
3687 int_mode = SImode;
3688 }
3689 if (base != const0_rtx)
3690 {
3691 base = aarch64_force_temporary (int_mode, dest, base);
3692 aarch64_add_offset (int_mode, dest, base, offset,
3693 NULL_RTX, NULL_RTX, false);
3694 }
3695 else
3696 aarch64_add_offset (int_mode, dest, base, offset,
3697 dest, NULL_RTX, false);
3698 }
3699 return;
3700 }
3701
3702 sty = aarch64_classify_symbol (base, const_offset);
3703 switch (sty)
3704 {
3705 case SYMBOL_FORCE_TO_MEM:
3706 if (const_offset != 0
3707 && targetm.cannot_force_const_mem (int_mode, imm))
3708 {
3709 gcc_assert (can_create_pseudo_p ());
3710 base = aarch64_force_temporary (int_mode, dest, base);
3711 aarch64_add_offset (int_mode, dest, base, const_offset,
3712 NULL_RTX, NULL_RTX, false);
3713 return;
3714 }
3715
3716 mem = force_const_mem (ptr_mode, imm);
3717 gcc_assert (mem);
3718
3719 /* If we aren't generating PC relative literals, then
3720 we need to expand the literal pool access carefully.
3721 This is something that needs to be done in a number
3722 of places, so could well live as a separate function. */
3723 if (!aarch64_pcrelative_literal_loads)
3724 {
3725 gcc_assert (can_create_pseudo_p ());
3726 base = gen_reg_rtx (ptr_mode);
3727 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3728 if (ptr_mode != Pmode)
3729 base = convert_memory_address (Pmode, base);
3730 mem = gen_rtx_MEM (ptr_mode, base);
3731 }
3732
3733 if (int_mode != ptr_mode)
3734 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3735
3736 emit_insn (gen_rtx_SET (dest, mem));
3737
3738 return;
3739
3740 case SYMBOL_SMALL_TLSGD:
3741 case SYMBOL_SMALL_TLSDESC:
3742 case SYMBOL_SMALL_TLSIE:
3743 case SYMBOL_SMALL_GOT_28K:
3744 case SYMBOL_SMALL_GOT_4G:
3745 case SYMBOL_TINY_GOT:
3746 case SYMBOL_TINY_TLSIE:
3747 if (const_offset != 0)
3748 {
3749 gcc_assert (can_create_pseudo_p ());
3750 base = aarch64_force_temporary (int_mode, dest, base);
3751 aarch64_add_offset (int_mode, dest, base, const_offset,
3752 NULL_RTX, NULL_RTX, false);
3753 return;
3754 }
3755 /* FALLTHRU */
3756
3757 case SYMBOL_SMALL_ABSOLUTE:
3758 case SYMBOL_TINY_ABSOLUTE:
3759 case SYMBOL_TLSLE12:
3760 case SYMBOL_TLSLE24:
3761 case SYMBOL_TLSLE32:
3762 case SYMBOL_TLSLE48:
3763 aarch64_load_symref_appropriately (dest, imm, sty);
3764 return;
3765
3766 default:
3767 gcc_unreachable ();
3768 }
3769 }
3770
3771 if (!CONST_INT_P (imm))
3772 {
3773 if (GET_CODE (imm) == HIGH
3774 || aarch64_simd_valid_immediate (imm, NULL))
3775 {
3776 emit_insn (gen_rtx_SET (dest, imm));
3777 return;
3778 }
3779
3780 rtx_vector_builder builder;
3781 if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
3782 && aarch64_get_sve_pred_bits (builder, imm))
3783 {
3784 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3785 int vl = aarch64_partial_ptrue_length (builder, elt_size);
3786 if (vl > 0)
3787 {
3788 aarch64_sve_move_pred_via_while (dest, vl);
3789 return;
3790 }
3791 }
3792
3793 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
3794 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
3795 {
3796 if (dest != res)
3797 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
3798 return;
3799 }
3800
3801 rtx mem = force_const_mem (mode, imm);
3802 gcc_assert (mem);
3803 emit_move_insn (dest, mem);
3804 return;
3805 }
3806
3807 aarch64_internal_mov_immediate (dest, imm, true,
3808 as_a <scalar_int_mode> (mode));
3809 }
3810
3811 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3812 that is known to contain PTRUE. */
3813
3814 void
3815 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3816 {
3817 expand_operand ops[3];
3818 machine_mode mode = GET_MODE (dest);
3819 create_output_operand (&ops[0], dest, mode);
3820 create_input_operand (&ops[1], pred, GET_MODE (pred));
3821 create_input_operand (&ops[2], src, mode);
3822 temporary_volatile_ok v (true);
3823 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3824 }
3825
3826 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3827 operand is in memory. In this case we need to use the predicated LD1
3828 and ST1 instead of LDR and STR, both for correctness on big-endian
3829 targets and because LD1 and ST1 support a wider range of addressing modes.
3830 PRED_MODE is the mode of the predicate.
3831
3832 See the comment at the head of aarch64-sve.md for details about the
3833 big-endian handling. */
3834
3835 void
3836 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3837 {
3838 machine_mode mode = GET_MODE (dest);
3839 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3840 if (!register_operand (src, mode)
3841 && !register_operand (dest, mode))
3842 {
3843 rtx tmp = gen_reg_rtx (mode);
3844 if (MEM_P (src))
3845 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3846 else
3847 emit_move_insn (tmp, src);
3848 src = tmp;
3849 }
3850 aarch64_emit_sve_pred_move (dest, ptrue, src);
3851 }
3852
3853 /* Called only on big-endian targets. See whether an SVE vector move
3854 from SRC to DEST is effectively a REV[BHW] instruction, because at
3855 least one operand is a subreg of an SVE vector that has wider or
3856 narrower elements. Return true and emit the instruction if so.
3857
3858 For example:
3859
3860 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3861
3862 represents a VIEW_CONVERT between the following vectors, viewed
3863 in memory order:
3864
3865 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3866 R1: { [0], [1], [2], [3], ... }
3867
3868 The high part of lane X in R2 should therefore correspond to lane X*2
3869 of R1, but the register representations are:
3870
3871 msb lsb
3872 R2: ...... [1].high [1].low [0].high [0].low
3873 R1: ...... [3] [2] [1] [0]
3874
3875 where the low part of lane X in R2 instead corresponds to lane X*2 in R1.
3876 We therefore need a reverse operation to swap the high and low values
3877 around.
3878
3879 This is purely an optimization. Without it we would spill the
3880 subreg operand to the stack in one mode and reload it in the
3881 other mode, which has the same effect as the REV. */
3882
3883 bool
3884 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3885 {
3886 gcc_assert (BYTES_BIG_ENDIAN);
3887 if (GET_CODE (dest) == SUBREG)
3888 dest = SUBREG_REG (dest);
3889 if (GET_CODE (src) == SUBREG)
3890 src = SUBREG_REG (src);
3891
3892 /* The optimization handles two single SVE REGs with different element
3893 sizes. */
3894 if (!REG_P (dest)
3895 || !REG_P (src)
3896 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3897 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3898 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3899 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3900 return false;
3901
3902 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3903 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
3904 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3905 UNSPEC_REV_SUBREG);
3906 emit_insn (gen_rtx_SET (dest, unspec));
3907 return true;
3908 }
3909
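/* Illustrative sketch (editorial, not part of GCC): a byte-level model of
   the REV[BHW] that the pattern emitted above performs.  On big-endian
   targets, reinterpreting an SVE register between element sizes amounts
   to reversing the NARROW-byte units within each WIDE-byte element.  BUF
   holds NBYTES bytes in register order; WIDE and NARROW are the two
   element sizes in bytes, with WIDE a multiple of NARROW and NBYTES a
   multiple of WIDE.  */

static void
sketch_rev_subreg (unsigned char *buf, unsigned int nbytes,
		   unsigned int wide, unsigned int narrow)
{
  for (unsigned int elt = 0; elt < nbytes; elt += wide)
    /* Reverse the NARROW-byte units of this WIDE-byte element.  */
    for (unsigned int lo = 0, hi = wide - narrow; lo < hi;
	 lo += narrow, hi -= narrow)
      for (unsigned int b = 0; b < narrow; ++b)
	{
	  unsigned char tmp = buf[elt + lo + b];
	  buf[elt + lo + b] = buf[elt + hi + b];
	  buf[elt + hi + b] = tmp;
	}
}
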
3910 /* Return a copy of X with mode MODE, without changing its other
3911 attributes. Unlike gen_lowpart, this doesn't care whether the
3912 mode change is valid. */
3913
3914 static rtx
3915 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3916 {
3917 if (GET_MODE (x) == mode)
3918 return x;
3919
3920 x = shallow_copy_rtx (x);
3921 set_mode_and_regno (x, mode, REGNO (x));
3922 return x;
3923 }
3924
3925 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3926 operands. */
3927
3928 void
3929 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3930 {
3931 /* Decide which REV operation we need. The mode with narrower elements
3932 determines the mode of the operands and the mode with the wider
3933 elements determines the reverse width. */
3934 machine_mode mode_with_wider_elts = GET_MODE (dest);
3935 machine_mode mode_with_narrower_elts = GET_MODE (src);
3936 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3937 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3938 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3939
3940 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3941 unsigned int unspec;
3942 if (wider_bytes == 8)
3943 unspec = UNSPEC_REV64;
3944 else if (wider_bytes == 4)
3945 unspec = UNSPEC_REV32;
3946 else if (wider_bytes == 2)
3947 unspec = UNSPEC_REV16;
3948 else
3949 gcc_unreachable ();
3950 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3951
3952 /* Emit:
3953
3954 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3955 UNSPEC_MERGE_PTRUE))
3956
3957 with the appropriate modes. */
3958 ptrue = gen_lowpart (pred_mode, ptrue);
3959 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3960 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3961 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3962 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3963 UNSPEC_MERGE_PTRUE);
3964 emit_insn (gen_rtx_SET (dest, src));
3965 }
3966
3967 static bool
3968 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3969 tree exp ATTRIBUTE_UNUSED)
3970 {
3971 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3972 return false;
3973
3974 return true;
3975 }
3976
3977 /* Implement TARGET_PASS_BY_REFERENCE. */
3978
3979 static bool
3980 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3981 machine_mode mode,
3982 const_tree type,
3983 bool named ATTRIBUTE_UNUSED)
3984 {
3985 HOST_WIDE_INT size;
3986 machine_mode dummymode;
3987 int nregs;
3988
3989 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3990 if (mode == BLKmode && type)
3991 size = int_size_in_bytes (type);
3992 else
3993 /* No frontends can create types with variable-sized modes, so we
3994 shouldn't be asked to pass or return them. */
3995 size = GET_MODE_SIZE (mode).to_constant ();
3996
3997 /* Aggregates are passed by reference based on their size. */
3998 if (type && AGGREGATE_TYPE_P (type))
3999 {
4000 size = int_size_in_bytes (type);
4001 }
4002
4003 /* Variable-sized arguments are always passed by reference. */
4004 if (size < 0)
4005 return true;
4006
4007 /* Can this be a candidate to be passed in fp/simd register(s)? */
4008 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4009 &dummymode, &nregs,
4010 NULL))
4011 return false;
4012
4013 /* Arguments which are variable-sized or larger than 2 registers are
4014 passed by reference unless they are a homogeneous floating-point
4015 aggregate. */
4016 return size > 2 * UNITS_PER_WORD;
4017 }
4018
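/* Illustrative sketch (editorial, not part of GCC): the AAPCS64 decision
   made by aarch64_pass_by_reference above, reduced to plain integers.
   SIZE is the argument size in bytes (negative for variable-sized types)
   and IS_HFA_OR_HVA says whether the type is a homogeneous
   floating-point or short-vector aggregate; 16 stands for
   2 * UNITS_PER_WORD on AArch64.  */

static int
sketch_pass_by_reference (long size, int is_hfa_or_hva)
{
  if (size < 0)
    return 1;		/* Variable-sized: always by reference.  */
  if (is_hfa_or_hva)
    return 0;		/* HFA/HVA: candidate for FP/SIMD registers.  */
  return size > 16;	/* Larger than two registers: by reference.  */
}
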
4019 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4020 static bool
4021 aarch64_return_in_msb (const_tree valtype)
4022 {
4023 machine_mode dummy_mode;
4024 int dummy_int;
4025
4026 /* Never happens in little-endian mode. */
4027 if (!BYTES_BIG_ENDIAN)
4028 return false;
4029
4030 /* Only composite types smaller than or equal to 16 bytes can
4031 be potentially returned in registers. */
4032 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4033 || int_size_in_bytes (valtype) <= 0
4034 || int_size_in_bytes (valtype) > 16)
4035 return false;
4036
4037 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4038 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4039 is always passed/returned in the least significant bits of fp/simd
4040 register(s). */
4041 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4042 &dummy_mode, &dummy_int, NULL))
4043 return false;
4044
4045 return true;
4046 }
4047
4048 /* Implement TARGET_FUNCTION_VALUE.
4049 Define how to find the value returned by a function. */
4050
4051 static rtx
4052 aarch64_function_value (const_tree type, const_tree func,
4053 bool outgoing ATTRIBUTE_UNUSED)
4054 {
4055 machine_mode mode;
4056 int unsignedp;
4057 int count;
4058 machine_mode ag_mode;
4059
4060 mode = TYPE_MODE (type);
4061 if (INTEGRAL_TYPE_P (type))
4062 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4063
4064 if (aarch64_return_in_msb (type))
4065 {
4066 HOST_WIDE_INT size = int_size_in_bytes (type);
4067
4068 if (size % UNITS_PER_WORD != 0)
4069 {
4070 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4071 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4072 }
4073 }
4074
4075 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4076 &ag_mode, &count, NULL))
4077 {
4078 if (!aarch64_composite_type_p (type, mode))
4079 {
4080 gcc_assert (count == 1 && mode == ag_mode);
4081 return gen_rtx_REG (mode, V0_REGNUM);
4082 }
4083 else
4084 {
4085 int i;
4086 rtx par;
4087
4088 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4089 for (i = 0; i < count; i++)
4090 {
4091 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4092 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4093 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4094 XVECEXP (par, 0, i) = tmp;
4095 }
4096 return par;
4097 }
4098 }
4099 else
4100 return gen_rtx_REG (mode, R0_REGNUM);
4101 }
4102
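/* Illustrative sketch (editorial, not part of GCC): the big-endian
   "return in MSB" size adjustment performed above.  A small composite
   returned in the most significant bits is widened to a whole number of
   8-byte words so that the padding sits below the value; for example a
   12-byte struct is treated as a 16-byte value.  */

static unsigned int
sketch_msb_return_size (unsigned int size_in_bytes)
{
  const unsigned int word = 8;		/* UNITS_PER_WORD on AArch64.  */
  if (size_in_bytes % word != 0)
    size_in_bytes += word - size_in_bytes % word;
  return size_in_bytes;
}
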
4103 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4104 Return true if REGNO is the number of a hard register in which the values
4105 of called function may come back. */
4106
4107 static bool
4108 aarch64_function_value_regno_p (const unsigned int regno)
4109 {
4110 /* Maximum of 16 bytes can be returned in the general registers. Examples
4111 of 16-byte return values are: 128-bit integers and 16-byte small
4112 structures (excluding homogeneous floating-point aggregates). */
4113 if (regno == R0_REGNUM || regno == R1_REGNUM)
4114 return true;
4115
4116 /* Up to four fp/simd registers can return a function value, e.g. a
4117 homogeneous floating-point aggregate having four members. */
4118 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4119 return TARGET_FLOAT;
4120
4121 return false;
4122 }
4123
4124 /* Implement TARGET_RETURN_IN_MEMORY.
4125
4126 If the type T of the result of a function is such that
4127 void func (T arg)
4128 would require that arg be passed as a value in a register (or set of
4129 registers) according to the parameter passing rules, then the result
4130 is returned in the same registers as would be used for such an
4131 argument. */
4132
4133 static bool
4134 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4135 {
4136 HOST_WIDE_INT size;
4137 machine_mode ag_mode;
4138 int count;
4139
4140 if (!AGGREGATE_TYPE_P (type)
4141 && TREE_CODE (type) != COMPLEX_TYPE
4142 && TREE_CODE (type) != VECTOR_TYPE)
4143 /* Simple scalar types are always returned in registers. */
4144 return false;
4145
4146 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4147 type,
4148 &ag_mode,
4149 &count,
4150 NULL))
4151 return false;
4152
4153 /* Types larger than 2 registers are returned in memory. */
4154 size = int_size_in_bytes (type);
4155 return (size < 0 || size > 2 * UNITS_PER_WORD);
4156 }
4157
4158 static bool
4159 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4160 const_tree type, int *nregs)
4161 {
4162 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4163 return aarch64_vfp_is_call_or_return_candidate (mode,
4164 type,
4165 &pcum->aapcs_vfp_rmode,
4166 nregs,
4167 NULL);
4168 }
4169
4170 /* Given MODE and TYPE of a function argument, return the alignment in
4171 bits. The idea is to suppress any stronger alignment requested by
4172 the user and opt for the natural alignment (specified in AAPCS64 \S
4173 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4174 calculated in versions of GCC prior to GCC-9. This is a helper
4175 function for local use only. */
4176
4177 static unsigned int
4178 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4179 bool *abi_break)
4180 {
4181 *abi_break = false;
4182 if (!type)
4183 return GET_MODE_ALIGNMENT (mode);
4184
4185 if (integer_zerop (TYPE_SIZE (type)))
4186 return 0;
4187
4188 gcc_assert (TYPE_MODE (type) == mode);
4189
4190 if (!AGGREGATE_TYPE_P (type))
4191 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4192
4193 if (TREE_CODE (type) == ARRAY_TYPE)
4194 return TYPE_ALIGN (TREE_TYPE (type));
4195
4196 unsigned int alignment = 0;
4197 unsigned int bitfield_alignment = 0;
4198 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4199 if (TREE_CODE (field) == FIELD_DECL)
4200 {
4201 alignment = std::max (alignment, DECL_ALIGN (field));
4202 if (DECL_BIT_FIELD_TYPE (field))
4203 bitfield_alignment
4204 = std::max (bitfield_alignment,
4205 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4206 }
4207
4208 if (bitfield_alignment > alignment)
4209 {
4210 *abi_break = true;
4211 return bitfield_alignment;
4212 }
4213
4214 return alignment;
4215 }
4216
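/* Illustrative sketch (editorial, not part of GCC): the record walk in
   aarch64_function_arg_alignment above, reduced to arrays.
   FIELD_ALIGN[I] is DECL_ALIGN of field I and BITFIELD_TYPE_ALIGN[I] is
   the alignment of the declaring type when field I is a bit-field
   (0 otherwise).  The return value is the alignment AAPCS64 uses;
   *ABI_BREAK is set when GCC 8 and earlier would have used the smaller
   value because they ignored the bit-field's declared type.  */

static unsigned int
sketch_arg_alignment (const unsigned int *field_align,
		      const unsigned int *bitfield_type_align,
		      unsigned int nfields, int *abi_break)
{
  unsigned int alignment = 0, bitfield_alignment = 0;
  *abi_break = 0;
  for (unsigned int i = 0; i < nfields; ++i)
    {
      if (field_align[i] > alignment)
	alignment = field_align[i];
      if (bitfield_type_align[i] > bitfield_alignment)
	bitfield_alignment = bitfield_type_align[i];
    }
  if (bitfield_alignment > alignment)
    {
      *abi_break = 1;
      return bitfield_alignment;
    }
  return alignment;
}
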
4217 /* Layout a function argument according to the AAPCS64 rules. The rule
4218 numbers refer to the rule numbers in the AAPCS64. */
4219
4220 static void
4221 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4222 const_tree type,
4223 bool named ATTRIBUTE_UNUSED)
4224 {
4225 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4226 int ncrn, nvrn, nregs;
4227 bool allocate_ncrn, allocate_nvrn;
4228 HOST_WIDE_INT size;
4229 bool abi_break;
4230
4231 /* We need to do this once per argument. */
4232 if (pcum->aapcs_arg_processed)
4233 return;
4234
4235 pcum->aapcs_arg_processed = true;
4236
4237 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
4238 if (type)
4239 size = int_size_in_bytes (type);
4240 else
4241 /* No frontends can create types with variable-sized modes, so we
4242 shouldn't be asked to pass or return them. */
4243 size = GET_MODE_SIZE (mode).to_constant ();
4244 size = ROUND_UP (size, UNITS_PER_WORD);
4245
4246 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4247 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4248 mode,
4249 type,
4250 &nregs);
4251
4252 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4253 The following code thus handles passing by SIMD/FP registers first. */
4254
4255 nvrn = pcum->aapcs_nvrn;
4256
4257 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4258 and homogeneous short-vector aggregates (HVA). */
4259 if (allocate_nvrn)
4260 {
4261 if (!TARGET_FLOAT)
4262 aarch64_err_no_fpadvsimd (mode);
4263
4264 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4265 {
4266 pcum->aapcs_nextnvrn = nvrn + nregs;
4267 if (!aarch64_composite_type_p (type, mode))
4268 {
4269 gcc_assert (nregs == 1);
4270 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4271 }
4272 else
4273 {
4274 rtx par;
4275 int i;
4276 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4277 for (i = 0; i < nregs; i++)
4278 {
4279 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4280 V0_REGNUM + nvrn + i);
4281 rtx offset = gen_int_mode
4282 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4283 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4284 XVECEXP (par, 0, i) = tmp;
4285 }
4286 pcum->aapcs_reg = par;
4287 }
4288 return;
4289 }
4290 else
4291 {
4292 /* C.3 NSRN is set to 8. */
4293 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4294 goto on_stack;
4295 }
4296 }
4297
4298 ncrn = pcum->aapcs_ncrn;
4299 nregs = size / UNITS_PER_WORD;
4300
4301 /* C6 - C9, though the sign and zero extension semantics are
4302 handled elsewhere. This is the case where the argument fits
4303 entirely in general registers. */
4304 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4305 {
4306 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4307
4308 /* C.8 if the argument has an alignment of 16 then the NGRN is
4309 rounded up to the next even number. */
4310 if (nregs == 2
4311 && ncrn % 2
4312 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4313 comparison is there because for > 16 * BITS_PER_UNIT
4314 alignment nregs should be > 2 and therefore it should be
4315 passed by reference rather than value. */
4316 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4317 == 16 * BITS_PER_UNIT))
4318 {
4319 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4320 inform (input_location, "parameter passing for argument of type "
4321 "%qT changed in GCC 9.1", type);
4322 ++ncrn;
4323 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4324 }
4325
4326 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4327 A reg is still generated for it, but the caller should be smart
4328 enough not to use it. */
4329 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4330 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4331 else
4332 {
4333 rtx par;
4334 int i;
4335
4336 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4337 for (i = 0; i < nregs; i++)
4338 {
4339 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4340 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4341 GEN_INT (i * UNITS_PER_WORD));
4342 XVECEXP (par, 0, i) = tmp;
4343 }
4344 pcum->aapcs_reg = par;
4345 }
4346
4347 pcum->aapcs_nextncrn = ncrn + nregs;
4348 return;
4349 }
4350
4351 /* C.11 */
4352 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4353
4354 /* The argument is passed on the stack; record the needed number of words
4355 for this argument and align the total size if necessary. */
4356 on_stack:
4357 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4358
4359 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4360 == 16 * BITS_PER_UNIT)
4361 {
4362 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4363 if (pcum->aapcs_stack_size != new_size)
4364 {
4365 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4366 inform (input_location, "parameter passing for argument of type "
4367 "%qT changed in GCC 9.1", type);
4368 pcum->aapcs_stack_size = new_size;
4369 }
4370 }
4371 return;
4372 }
4373
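/* Illustrative sketch (editorial, not part of GCC): rule C.8 as applied
   above.  NCRN is the next core register number (x0 = 0), SIZE_BYTES the
   rounded argument size and ALIGN_BITS its AAPCS64 alignment.  The
   return value is the register the argument actually starts in: a
   16-byte-aligned two-register argument skips an odd register so that it
   lands on an even/odd pair.  */

static unsigned int
sketch_round_ngrn (unsigned int ncrn, unsigned int size_bytes,
		   unsigned int align_bits)
{
  unsigned int nregs = (size_bytes + 7) / 8;	/* ROUND_UP to 8-byte words.  */
  if (nregs == 2 && (ncrn % 2) != 0 && align_bits == 128)
    ++ncrn;
  return ncrn;
}
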
4374 /* Implement TARGET_FUNCTION_ARG. */
4375
4376 static rtx
4377 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4378 const_tree type, bool named)
4379 {
4380 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4381 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4382
4383 if (mode == VOIDmode)
4384 return NULL_RTX;
4385
4386 aarch64_layout_arg (pcum_v, mode, type, named);
4387 return pcum->aapcs_reg;
4388 }
4389
4390 void
4391 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4392 const_tree fntype ATTRIBUTE_UNUSED,
4393 rtx libname ATTRIBUTE_UNUSED,
4394 const_tree fndecl ATTRIBUTE_UNUSED,
4395 unsigned n_named ATTRIBUTE_UNUSED)
4396 {
4397 pcum->aapcs_ncrn = 0;
4398 pcum->aapcs_nvrn = 0;
4399 pcum->aapcs_nextncrn = 0;
4400 pcum->aapcs_nextnvrn = 0;
4401 pcum->pcs_variant = ARM_PCS_AAPCS64;
4402 pcum->aapcs_reg = NULL_RTX;
4403 pcum->aapcs_arg_processed = false;
4404 pcum->aapcs_stack_words = 0;
4405 pcum->aapcs_stack_size = 0;
4406
4407 if (!TARGET_FLOAT
4408 && fndecl && TREE_PUBLIC (fndecl)
4409 && fntype && fntype != error_mark_node)
4410 {
4411 const_tree type = TREE_TYPE (fntype);
4412 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4413 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4414 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4415 &mode, &nregs, NULL))
4416 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4417 }
4418 return;
4419 }
4420
4421 static void
4422 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4423 machine_mode mode,
4424 const_tree type,
4425 bool named)
4426 {
4427 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4428 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4429 {
4430 aarch64_layout_arg (pcum_v, mode, type, named);
4431 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4432 != (pcum->aapcs_stack_words != 0));
4433 pcum->aapcs_arg_processed = false;
4434 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4435 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4436 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4437 pcum->aapcs_stack_words = 0;
4438 pcum->aapcs_reg = NULL_RTX;
4439 }
4440 }
4441
4442 bool
4443 aarch64_function_arg_regno_p (unsigned regno)
4444 {
4445 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4446 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4447 }
4448
4449 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4450 PARM_BOUNDARY bits of alignment, but will be given anything up
4451 to STACK_BOUNDARY bits if the type requires it. This makes sure
4452 that both before and after the layout of each argument, the Next
4453 Stacked Argument Address (NSAA) will have a minimum alignment of
4454 8 bytes. */
4455
4456 static unsigned int
4457 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4458 {
4459 bool abi_break;
4460 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4461 &abi_break);
4462 if (abi_break && warn_psabi)
4463 inform (input_location, "parameter passing for argument of type "
4464 "%qT changed in GCC 9.1", type);
4465
4466 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4467 }
4468
4469 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4470
4471 static fixed_size_mode
4472 aarch64_get_reg_raw_mode (int regno)
4473 {
4474 if (TARGET_SVE && FP_REGNUM_P (regno))
4475 /* Don't use the SVE part of the register for __builtin_apply and
4476 __builtin_return. The SVE registers aren't used by the normal PCS,
4477 so using them there would be a waste of time. The PCS extensions
4478 for SVE types are fundamentally incompatible with the
4479 __builtin_return/__builtin_apply interface. */
4480 return as_a <fixed_size_mode> (V16QImode);
4481 return default_get_reg_raw_mode (regno);
4482 }
4483
4484 /* Implement TARGET_FUNCTION_ARG_PADDING.
4485
4486 Small aggregate types are placed in the lowest memory address.
4487
4488 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4489
4490 static pad_direction
4491 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4492 {
4493 /* On little-endian targets, the least significant byte of every stack
4494 argument is passed at the lowest byte address of the stack slot. */
4495 if (!BYTES_BIG_ENDIAN)
4496 return PAD_UPWARD;
4497
4498 /* Otherwise, integral, floating-point and pointer types are padded downward:
4499 the least significant byte of a stack argument is passed at the highest
4500 byte address of the stack slot. */
4501 if (type
4502 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4503 || POINTER_TYPE_P (type))
4504 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4505 return PAD_DOWNWARD;
4506
4507 /* Everything else is padded upward, i.e. data in first byte of stack slot. */
4508 return PAD_UPWARD;
4509 }
4510
4511 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4512
4513 It specifies the padding for the last (possibly the only) element
4514 of a block move between registers and memory. Viewing the block as
4515 it sits in memory, padding upward means that the last element is
4516 padded beyond its most significant byte, while padding downward
4517 means that the last element is padded on its least significant
4518 byte side.
4519
4520 Small aggregates and small complex types are always padded
4521 upwards.
4522
4523 We don't need to worry about homogeneous floating-point or
4524 short-vector aggregates; their move is not affected by the
4525 padding direction determined here. Regardless of endianness,
4526 each element of such an aggregate is put in the least
4527 significant bits of a fp/simd register.
4528
4529 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4530 register has useful data, and return the opposite if the most
4531 significant byte does. */
4532
4533 bool
4534 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4535 bool first ATTRIBUTE_UNUSED)
4536 {
4537
4538 /* Small composite types are always padded upward. */
4539 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4540 {
4541 HOST_WIDE_INT size;
4542 if (type)
4543 size = int_size_in_bytes (type);
4544 else
4545 /* No frontends can create types with variable-sized modes, so we
4546 shouldn't be asked to pass or return them. */
4547 size = GET_MODE_SIZE (mode).to_constant ();
4548 if (size < 2 * UNITS_PER_WORD)
4549 return true;
4550 }
4551
4552 /* Otherwise, use the default padding. */
4553 return !BYTES_BIG_ENDIAN;
4554 }
4555
4556 static scalar_int_mode
4557 aarch64_libgcc_cmp_return_mode (void)
4558 {
4559 return SImode;
4560 }
4561
4562 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4563
4564 /* We use the 12-bit shifted immediate arithmetic instructions so values
4565 must be multiple of (1 << 12), i.e. 4096. */
4566 #define ARITH_FACTOR 4096
4567
4568 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4569 #error Cannot use simple address calculation for stack probing
4570 #endif
4571
4572 /* The pair of scratch registers used for stack probing. */
4573 #define PROBE_STACK_FIRST_REG R9_REGNUM
4574 #define PROBE_STACK_SECOND_REG R10_REGNUM
4575
4576 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4577 inclusive. These are offsets from the current stack pointer. */
4578
4579 static void
4580 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4581 {
4582 HOST_WIDE_INT size;
4583 if (!poly_size.is_constant (&size))
4584 {
4585 sorry ("stack probes for SVE frames");
4586 return;
4587 }
4588
4589 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4590
4591 /* See the same assertion on PROBE_INTERVAL above. */
4592 gcc_assert ((first % ARITH_FACTOR) == 0);
4593
4594 /* See if we have a constant small number of probes to generate. If so,
4595 that's the easy case. */
4596 if (size <= PROBE_INTERVAL)
4597 {
4598 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4599
4600 emit_set_insn (reg1,
4601 plus_constant (Pmode,
4602 stack_pointer_rtx, -(first + base)));
4603 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4604 }
4605
4606 /* The run-time loop is made up of 8 insns in the generic case while the
4607 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4608 else if (size <= 4 * PROBE_INTERVAL)
4609 {
4610 HOST_WIDE_INT i, rem;
4611
4612 emit_set_insn (reg1,
4613 plus_constant (Pmode,
4614 stack_pointer_rtx,
4615 -(first + PROBE_INTERVAL)));
4616 emit_stack_probe (reg1);
4617
4618 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4619 it exceeds SIZE. If only two probes are needed, this will not
4620 generate any code. Then probe at FIRST + SIZE. */
4621 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4622 {
4623 emit_set_insn (reg1,
4624 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4625 emit_stack_probe (reg1);
4626 }
4627
4628 rem = size - (i - PROBE_INTERVAL);
4629 if (rem > 256)
4630 {
4631 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4632
4633 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4634 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4635 }
4636 else
4637 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4638 }
4639
4640 /* Otherwise, do the same as above, but in a loop. Note that we must be
4641 extra careful with variables wrapping around because we might be at
4642 the very top (or the very bottom) of the address space and we have
4643 to be able to handle this case properly; in particular, we use an
4644 equality test for the loop condition. */
4645 else
4646 {
4647 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4648
4649 /* Step 1: round SIZE to the previous multiple of the interval. */
4650
4651 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4652
4653
4654 /* Step 2: compute initial and final value of the loop counter. */
4655
4656 /* TEST_ADDR = SP + FIRST. */
4657 emit_set_insn (reg1,
4658 plus_constant (Pmode, stack_pointer_rtx, -first));
4659
4660 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4661 HOST_WIDE_INT adjustment = - (first + rounded_size);
4662 if (! aarch64_uimm12_shift (adjustment))
4663 {
4664 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4665 true, Pmode);
4666 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4667 }
4668 else
4669 emit_set_insn (reg2,
4670 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4671
4672 /* Step 3: the loop
4673
4674 do
4675 {
4676 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4677 probe at TEST_ADDR
4678 }
4679 while (TEST_ADDR != LAST_ADDR)
4680
4681 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4682 until it is equal to ROUNDED_SIZE. */
4683
4684 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4685
4686
4687 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4688 that SIZE is equal to ROUNDED_SIZE. */
4689
4690 if (size != rounded_size)
4691 {
4692 HOST_WIDE_INT rem = size - rounded_size;
4693
4694 if (rem > 256)
4695 {
4696 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4697
4698 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4699 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4700 }
4701 else
4702 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4703 }
4704 }
4705
4706 /* Make sure nothing is scheduled before we are done. */
4707 emit_insn (gen_blockage ());
4708 }
4709
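/* Illustrative sketch (editorial, not part of GCC): the probe offsets,
   measured from the incoming stack pointer, that the two non-looping
   cases above touch.  FIRST and SIZE are as in the function; up to five
   offsets are written to OFFSETS[] and the count is returned (0 means
   the looping case applies).  Assumes the default PROBE_INTERVAL of
   4096 bytes.  */

static unsigned int
sketch_probe_offsets (unsigned long first, unsigned long size,
		      unsigned long offsets[5])
{
  const unsigned long interval = 4096;
  unsigned int n = 0;
  if (size <= interval)
    offsets[n++] = first + size;	/* Single probe at FIRST + SIZE.  */
  else if (size <= 4 * interval)
    {
      unsigned long i;
      for (i = interval; i < size; i += interval)
	offsets[n++] = first + i;	/* FIRST + N * PROBE_INTERVAL.  */
      offsets[n++] = first + size;	/* Final probe at FIRST + SIZE.  */
    }
  return n;
}
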
4710 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4711 absolute addresses. */
4712
4713 const char *
4714 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4715 {
4716 static int labelno = 0;
4717 char loop_lab[32];
4718 rtx xops[2];
4719
4720 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4721
4722 /* Loop. */
4723 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4724
4725 HOST_WIDE_INT stack_clash_probe_interval
4726 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4727
4728 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4729 xops[0] = reg1;
4730 HOST_WIDE_INT interval;
4731 if (flag_stack_clash_protection)
4732 interval = stack_clash_probe_interval;
4733 else
4734 interval = PROBE_INTERVAL;
4735
4736 gcc_assert (aarch64_uimm12_shift (interval));
4737 xops[1] = GEN_INT (interval);
4738
4739 output_asm_insn ("sub\t%0, %0, %1", xops);
4740
4741 /* If doing stack clash protection then we probe up by the ABI specified
4742 amount. We do this because we're dropping full pages at a time in the
4743 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4744 if (flag_stack_clash_protection)
4745 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4746 else
4747 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4748
4749 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4750 by this amount for each iteration. */
4751 output_asm_insn ("str\txzr, [%0, %1]", xops);
4752
4753 /* Test if TEST_ADDR == LAST_ADDR. */
4754 xops[1] = reg2;
4755 output_asm_insn ("cmp\t%0, %1", xops);
4756
4757 /* Branch. */
4758 fputs ("\tb.ne\t", asm_out_file);
4759 assemble_name_raw (asm_out_file, loop_lab);
4760 fputc ('\n', asm_out_file);
4761
4762 return "";
4763 }
4764
4765 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4766 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4767 of GUARD_SIZE. When a probe is emitted it is done at most
4768 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4769 at most MIN_PROBE_THRESHOLD. By the end of this function
4770 BASE = BASE - ADJUSTMENT. */
4771
4772 const char *
4773 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4774 rtx min_probe_threshold, rtx guard_size)
4775 {
4776 /* This function is not allowed to use any instruction generation function
4777 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4778 so instead emit the code you want using output_asm_insn. */
4779 gcc_assert (flag_stack_clash_protection);
4780 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4781 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4782
4783 /* The minimum required allocation before the residual requires probing. */
4784 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4785
4786 /* Clamp the value down to the nearest value that can be used with a cmp. */
4787 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4788 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4789
4790 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4791 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4792
4793 static int labelno = 0;
4794 char loop_start_lab[32];
4795 char loop_end_lab[32];
4796 rtx xops[2];
4797
4798 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4799 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4800
4801 /* Emit loop start label. */
4802 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4803
4804 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4805 xops[0] = adjustment;
4806 xops[1] = probe_offset_value_rtx;
4807 output_asm_insn ("cmp\t%0, %1", xops);
4808
4809 /* Branch to end if not enough adjustment to probe. */
4810 fputs ("\tb.lt\t", asm_out_file);
4811 assemble_name_raw (asm_out_file, loop_end_lab);
4812 fputc ('\n', asm_out_file);
4813
4814 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4815 xops[0] = base;
4816 xops[1] = probe_offset_value_rtx;
4817 output_asm_insn ("sub\t%0, %0, %1", xops);
4818
4819 /* Probe at BASE. */
4820 xops[1] = const0_rtx;
4821 output_asm_insn ("str\txzr, [%0, %1]", xops);
4822
4823 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4824 xops[0] = adjustment;
4825 xops[1] = probe_offset_value_rtx;
4826 output_asm_insn ("sub\t%0, %0, %1", xops);
4827
4828 /* Branch to start if still more bytes to allocate. */
4829 fputs ("\tb\t", asm_out_file);
4830 assemble_name_raw (asm_out_file, loop_start_lab);
4831 fputc ('\n', asm_out_file);
4832
4833 /* No probe needed; leave the loop. */
4834 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4835
4836 /* BASE = BASE - ADJUSTMENT. */
4837 xops[0] = base;
4838 xops[1] = adjustment;
4839 output_asm_insn ("sub\t%0, %0, %1", xops);
4840 return "";
4841 }
4842
4843 /* Determine whether a frame chain needs to be generated. */
4844 static bool
4845 aarch64_needs_frame_chain (void)
4846 {
4847 /* Force a frame chain for EH returns so the return address is at FP+8. */
4848 if (frame_pointer_needed || crtl->calls_eh_return)
4849 return true;
4850
4851 /* A leaf function cannot have calls or write LR. */
4852 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4853
4854 /* Don't use a frame chain in leaf functions if leaf frame pointers
4855 are disabled. */
4856 if (flag_omit_leaf_frame_pointer && is_leaf)
4857 return false;
4858
4859 return aarch64_use_frame_pointer;
4860 }
4861
4862 /* Mark the registers that need to be saved by the callee and calculate
4863 the size of the callee-saved registers area and frame record (both FP
4864 and LR may be omitted). */
4865 static void
4866 aarch64_layout_frame (void)
4867 {
4868 HOST_WIDE_INT offset = 0;
4869 int regno, last_fp_reg = INVALID_REGNUM;
4870 bool simd_function = aarch64_simd_decl_p (cfun->decl);
4871
4872 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4873
4874 /* Adjust the outgoing arguments size if required. Keep it in sync with what
4875 the mid-end is doing. */
4876 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4877
4878 #define SLOT_NOT_REQUIRED (-2)
4879 #define SLOT_REQUIRED (-1)
4880
4881 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4882 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4883
4884 /* If this is a non-leaf simd function with calls we assume that
4885 at least one of those calls is to a non-simd function and thus
4886 we must save V8 to V23 in the prologue. */
4887
4888 if (simd_function && !crtl->is_leaf)
4889 {
4890 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4891 if (FP_SIMD_SAVED_REGNUM_P (regno))
4892 df_set_regs_ever_live (regno, true);
4893 }
4894
4895 /* First mark all the registers that really need to be saved... */
4896 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4897 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4898
4899 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4900 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4901
4902 /* ... that includes the eh data registers (if needed)... */
4903 if (crtl->calls_eh_return)
4904 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4905 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4906 = SLOT_REQUIRED;
4907
4908 /* ... and any callee saved register that dataflow says is live. */
4909 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4910 if (df_regs_ever_live_p (regno)
4911 && (regno == R30_REGNUM
4912 || !call_used_regs[regno]))
4913 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4914
4915 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4916 if (df_regs_ever_live_p (regno)
4917 && (!call_used_regs[regno]
4918 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4919 {
4920 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4921 last_fp_reg = regno;
4922 }
4923
4924 if (cfun->machine->frame.emit_frame_chain)
4925 {
4926 /* FP and LR are placed in the linkage record. */
4927 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4928 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4929 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4930 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4931 offset = 2 * UNITS_PER_WORD;
4932 }
4933
4934 /* With stack-clash, LR must be saved in non-leaf functions. */
4935 gcc_assert (crtl->is_leaf
4936 || (cfun->machine->frame.reg_offset[R30_REGNUM]
4937 != SLOT_NOT_REQUIRED));
4938
4939 /* Now assign stack slots for them. */
4940 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4941 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4942 {
4943 cfun->machine->frame.reg_offset[regno] = offset;
4944 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4945 cfun->machine->frame.wb_candidate1 = regno;
4946 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4947 cfun->machine->frame.wb_candidate2 = regno;
4948 offset += UNITS_PER_WORD;
4949 }
4950
4951 HOST_WIDE_INT max_int_offset = offset;
4952 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4953 bool has_align_gap = offset != max_int_offset;
4954
4955 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4956 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4957 {
4958 /* If there is an alignment gap between integer and fp callee-saves,
4959 allocate the last fp register to it if possible. */
4960 if (regno == last_fp_reg
4961 && has_align_gap
4962 && !simd_function
4963 && (offset & 8) == 0)
4964 {
4965 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4966 break;
4967 }
4968
4969 cfun->machine->frame.reg_offset[regno] = offset;
4970 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4971 cfun->machine->frame.wb_candidate1 = regno;
4972 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4973 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4974 cfun->machine->frame.wb_candidate2 = regno;
4975 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4976 }
4977
4978 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4979
4980 cfun->machine->frame.saved_regs_size = offset;
4981
4982 HOST_WIDE_INT varargs_and_saved_regs_size
4983 = offset + cfun->machine->frame.saved_varargs_size;
4984
4985 cfun->machine->frame.hard_fp_offset
4986 = aligned_upper_bound (varargs_and_saved_regs_size
4987 + get_frame_size (),
4988 STACK_BOUNDARY / BITS_PER_UNIT);
4989
4990 /* Both these values are already aligned. */
4991 gcc_assert (multiple_p (crtl->outgoing_args_size,
4992 STACK_BOUNDARY / BITS_PER_UNIT));
4993 cfun->machine->frame.frame_size
4994 = (cfun->machine->frame.hard_fp_offset
4995 + crtl->outgoing_args_size);
4996
4997 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4998
4999 cfun->machine->frame.initial_adjust = 0;
5000 cfun->machine->frame.final_adjust = 0;
5001 cfun->machine->frame.callee_adjust = 0;
5002 cfun->machine->frame.callee_offset = 0;
5003
5004 HOST_WIDE_INT max_push_offset = 0;
5005 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5006 max_push_offset = 512;
5007 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5008 max_push_offset = 256;
5009
5010 HOST_WIDE_INT const_size, const_fp_offset;
5011 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5012 && const_size < max_push_offset
5013 && known_eq (crtl->outgoing_args_size, 0))
5014 {
5015 /* Simple, small frame with no outgoing arguments:
5016 stp reg1, reg2, [sp, -frame_size]!
5017 stp reg3, reg4, [sp, 16] */
5018 cfun->machine->frame.callee_adjust = const_size;
5019 }
5020 else if (known_lt (crtl->outgoing_args_size
5021 + cfun->machine->frame.saved_regs_size, 512)
5022 && !(cfun->calls_alloca
5023 && known_lt (cfun->machine->frame.hard_fp_offset,
5024 max_push_offset)))
5025 {
5026 /* Frame with small outgoing arguments:
5027 sub sp, sp, frame_size
5028 stp reg1, reg2, [sp, outgoing_args_size]
5029 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5030 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5031 cfun->machine->frame.callee_offset
5032 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5033 }
5034 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5035 && const_fp_offset < max_push_offset)
5036 {
5037 /* Frame with large outgoing arguments but a small local area:
5038 stp reg1, reg2, [sp, -hard_fp_offset]!
5039 stp reg3, reg4, [sp, 16]
5040 sub sp, sp, outgoing_args_size */
5041 cfun->machine->frame.callee_adjust = const_fp_offset;
5042 cfun->machine->frame.final_adjust
5043 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5044 }
5045 else
5046 {
5047 /* Frame with large local area and outgoing arguments using frame pointer:
5048 sub sp, sp, hard_fp_offset
5049 stp x29, x30, [sp, 0]
5050 add x29, sp, 0
5051 stp reg3, reg4, [sp, 16]
5052 sub sp, sp, outgoing_args_size */
5053 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5054 cfun->machine->frame.final_adjust
5055 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5056 }
5057
5058 cfun->machine->frame.laid_out = true;
5059 }
5060
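/* Illustrative sketch (editorial, not part of GCC): the four prologue
   shapes chosen at the end of aarch64_layout_frame above, using plain
   integers (so it models only constant-sized, non-SVE frames).  All
   values are byte counts; MAX_PUSH_OFFSET is 512 or 256 depending on
   whether a pair or a single register can be pushed with write-back.  */

struct sketch_frame_layout
{
  long initial_adjust;	/* First "sub sp, sp, #N".  */
  long callee_adjust;	/* Pre-decrement folded into the first stp.  */
  long callee_offset;	/* Offset of the frame record from sp.  */
  long final_adjust;	/* Final "sub sp, sp, #N" for outgoing args.  */
};

static struct sketch_frame_layout
sketch_choose_frame_layout (long frame_size, long outgoing_args,
			    long saved_regs, long hard_fp_offset,
			    long max_push_offset, int calls_alloca)
{
  struct sketch_frame_layout l = { 0, 0, 0, 0 };
  if (frame_size < max_push_offset && outgoing_args == 0)
    /* Simple, small frame: fold the whole allocation into the push.  */
    l.callee_adjust = frame_size;
  else if (outgoing_args + saved_regs < 512
	   && !(calls_alloca && hard_fp_offset < max_push_offset))
    {
      /* Small outgoing arguments: allocate everything up front.  */
      l.initial_adjust = frame_size;
      l.callee_offset = frame_size - hard_fp_offset;
    }
  else if (hard_fp_offset < max_push_offset)
    {
      /* Large outgoing arguments, small local area.  */
      l.callee_adjust = hard_fp_offset;
      l.final_adjust = frame_size - hard_fp_offset;
    }
  else
    {
      /* General case: large locals and outgoing arguments.  */
      l.initial_adjust = hard_fp_offset;
      l.final_adjust = frame_size - hard_fp_offset;
    }
  return l;
}
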
5061 /* Return true if the register REGNO is saved on entry to
5062 the current function. */
5063
5064 static bool
5065 aarch64_register_saved_on_entry (int regno)
5066 {
5067 return cfun->machine->frame.reg_offset[regno] >= 0;
5068 }
5069
5070 /* Return the first register, starting at REGNO and not exceeding LIMIT,
5071 that the callee needs to save. */
5072
5073 static unsigned
5074 aarch64_next_callee_save (unsigned regno, unsigned limit)
5075 {
5076 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5077 regno ++;
5078 return regno;
5079 }
5080
5081 /* Push the register number REGNO of mode MODE to the stack with write-back
5082 adjusting the stack by ADJUSTMENT. */
5083
5084 static void
5085 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5086 HOST_WIDE_INT adjustment)
5087 {
5088 rtx base_rtx = stack_pointer_rtx;
5089 rtx insn, reg, mem;
5090
5091 reg = gen_rtx_REG (mode, regno);
5092 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5093 plus_constant (Pmode, base_rtx, -adjustment));
5094 mem = gen_frame_mem (mode, mem);
5095
5096 insn = emit_move_insn (mem, reg);
5097 RTX_FRAME_RELATED_P (insn) = 1;
5098 }
5099
5100 /* Generate and return an instruction to store the pair of registers
5101 REG and REG2 of mode MODE to location BASE with write-back adjusting
5102 the stack location BASE by ADJUSTMENT. */
5103
5104 static rtx
5105 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5106 HOST_WIDE_INT adjustment)
5107 {
5108 switch (mode)
5109 {
5110 case E_DImode:
5111 return gen_storewb_pairdi_di (base, base, reg, reg2,
5112 GEN_INT (-adjustment),
5113 GEN_INT (UNITS_PER_WORD - adjustment));
5114 case E_DFmode:
5115 return gen_storewb_pairdf_di (base, base, reg, reg2,
5116 GEN_INT (-adjustment),
5117 GEN_INT (UNITS_PER_WORD - adjustment));
5118 case E_TFmode:
5119 return gen_storewb_pairtf_di (base, base, reg, reg2,
5120 GEN_INT (-adjustment),
5121 GEN_INT (UNITS_PER_VREG - adjustment));
5122 default:
5123 gcc_unreachable ();
5124 }
5125 }
5126
5127 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5128 stack pointer by ADJUSTMENT. */
5129
5130 static void
5131 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5132 {
5133 rtx_insn *insn;
5134 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5135
5136 if (regno2 == INVALID_REGNUM)
5137 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5138
5139 rtx reg1 = gen_rtx_REG (mode, regno1);
5140 rtx reg2 = gen_rtx_REG (mode, regno2);
5141
5142 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5143 reg2, adjustment));
5144 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5145 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5146 RTX_FRAME_RELATED_P (insn) = 1;
5147 }
5148
5149 /* Generate and return an instruction to load the pair of registers
5150 REG and REG2 of mode MODE from stack location BASE, with write-back
5151 adjusting BASE by ADJUSTMENT afterwards. */
5151
5152 static rtx
5153 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5154 HOST_WIDE_INT adjustment)
5155 {
5156 switch (mode)
5157 {
5158 case E_DImode:
5159 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5160 GEN_INT (UNITS_PER_WORD));
5161 case E_DFmode:
5162 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5163 GEN_INT (UNITS_PER_WORD));
5164 case E_TFmode:
5165 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5166 GEN_INT (UNITS_PER_VREG));
5167 default:
5168 gcc_unreachable ();
5169 }
5170 }
5171
5172 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5173 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5174 into CFI_OPS. */
5175
5176 static void
5177 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5178 rtx *cfi_ops)
5179 {
5180 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5181 rtx reg1 = gen_rtx_REG (mode, regno1);
5182
5183 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5184
5185 if (regno2 == INVALID_REGNUM)
5186 {
5187 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5188 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5189 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5190 }
5191 else
5192 {
5193 rtx reg2 = gen_rtx_REG (mode, regno2);
5194 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5195 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5196 reg2, adjustment));
5197 }
5198 }
5199
5200 /* Generate and return a store pair instruction of mode MODE to store
5201 register REG1 to MEM1 and register REG2 to MEM2. */
5202
5203 static rtx
5204 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5205 rtx reg2)
5206 {
5207 switch (mode)
5208 {
5209 case E_DImode:
5210 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5211
5212 case E_DFmode:
5213 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5214
5215 case E_TFmode:
5216 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5217
5218 default:
5219 gcc_unreachable ();
5220 }
5221 }
5222
5223 /* Generate and return a load pair instruction of mode MODE to load register
5224 REG1 from MEM1 and register REG2 from MEM2. */
5225
5226 static rtx
5227 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5228 rtx mem2)
5229 {
5230 switch (mode)
5231 {
5232 case E_DImode:
5233 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5234
5235 case E_DFmode:
5236 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5237
5238 case E_TFmode:
5239 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5240
5241 default:
5242 gcc_unreachable ();
5243 }
5244 }
5245
5246 /* Return TRUE if return address signing should be enabled for the current
5247 function, otherwise return FALSE. */
5248
5249 bool
5250 aarch64_return_address_signing_enabled (void)
5251 {
5252 /* This function should only be called after the frame is laid out. */
5253 gcc_assert (cfun->machine->frame.laid_out);
5254
5255 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5256 if its LR is pushed onto stack. */
5257 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5258 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5259 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5260 }
5261
5262 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5263 bool
5264 aarch64_bti_enabled (void)
5265 {
5266 return (aarch64_enable_bti == 1);
5267 }
5268
5269 /* Emit code to save the callee-saved registers of mode MODE from register
5270 number START to LIMIT to the stack at the location starting at offset
5271 START_OFFSET, skipping any write-back candidates if SKIP_WB is true. */
5272
5273 static void
5274 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5275 unsigned start, unsigned limit, bool skip_wb)
5276 {
5277 rtx_insn *insn;
5278 unsigned regno;
5279 unsigned regno2;
5280
5281 for (regno = aarch64_next_callee_save (start, limit);
5282 regno <= limit;
5283 regno = aarch64_next_callee_save (regno + 1, limit))
5284 {
5285 rtx reg, mem;
5286 poly_int64 offset;
5287 int offset_diff;
5288
5289 if (skip_wb
5290 && (regno == cfun->machine->frame.wb_candidate1
5291 || regno == cfun->machine->frame.wb_candidate2))
5292 continue;
5293
5294 if (cfun->machine->reg_is_wrapped_separately[regno])
5295 continue;
5296
5297 reg = gen_rtx_REG (mode, regno);
5298 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5299 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5300 offset));
5301
5302 regno2 = aarch64_next_callee_save (regno + 1, limit);
5303 offset_diff = cfun->machine->frame.reg_offset[regno2]
5304 - cfun->machine->frame.reg_offset[regno];
5305
5306 if (regno2 <= limit
5307 && !cfun->machine->reg_is_wrapped_separately[regno2]
5308 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5309 {
5310 rtx reg2 = gen_rtx_REG (mode, regno2);
5311 rtx mem2;
5312
5313 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5314 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5315 offset));
5316 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5317 reg2));
5318
5319 /* The first part of a frame-related parallel insn is
5320 always assumed to be relevant to the frame
5321 calculations; subsequent parts are only
5322 frame-related if explicitly marked. */
5323 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5324 regno = regno2;
5325 }
5326 else
5327 insn = emit_move_insn (mem, reg);
5328
5329 RTX_FRAME_RELATED_P (insn) = 1;
5330 }
5331 }
5332
5333 /* Emit code to restore the callee-saved registers of mode MODE from register
5334 number START up to and including LIMIT. Restore from the stack offset
5335 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5336 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5337
5338 static void
5339 aarch64_restore_callee_saves (machine_mode mode,
5340 poly_int64 start_offset, unsigned start,
5341 unsigned limit, bool skip_wb, rtx *cfi_ops)
5342 {
5343 rtx base_rtx = stack_pointer_rtx;
5344 unsigned regno;
5345 unsigned regno2;
5346 poly_int64 offset;
5347
5348 for (regno = aarch64_next_callee_save (start, limit);
5349 regno <= limit;
5350 regno = aarch64_next_callee_save (regno + 1, limit))
5351 {
5352 if (cfun->machine->reg_is_wrapped_separately[regno])
5353 continue;
5354
5355 rtx reg, mem;
5356 int offset_diff;
5357
5358 if (skip_wb
5359 && (regno == cfun->machine->frame.wb_candidate1
5360 || regno == cfun->machine->frame.wb_candidate2))
5361 continue;
5362
5363 reg = gen_rtx_REG (mode, regno);
5364 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5365 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5366
5367 regno2 = aarch64_next_callee_save (regno + 1, limit);
5368 offset_diff = cfun->machine->frame.reg_offset[regno2]
5369 - cfun->machine->frame.reg_offset[regno];
5370
5371 if (regno2 <= limit
5372 && !cfun->machine->reg_is_wrapped_separately[regno2]
5373 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5374 {
5375 rtx reg2 = gen_rtx_REG (mode, regno2);
5376 rtx mem2;
5377
5378 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5379 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5380 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5381
5382 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5383 regno = regno2;
5384 }
5385 else
5386 emit_move_insn (reg, mem);
5387 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5388 }
5389 }
5390
5391 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5392 of MODE. */
5393
5394 static inline bool
5395 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5396 {
5397 HOST_WIDE_INT multiple;
5398 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5399 && IN_RANGE (multiple, -8, 7));
5400 }
5401
5402 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5403 of MODE. */
5404
5405 static inline bool
5406 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5407 {
5408 HOST_WIDE_INT multiple;
5409 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5410 && IN_RANGE (multiple, 0, 63));
5411 }
5412
5413 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5414 of MODE. */
5415
5416 bool
5417 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5418 {
5419 HOST_WIDE_INT multiple;
5420 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5421 && IN_RANGE (multiple, -64, 63));
5422 }
5423
5424 /* Return true if OFFSET is a signed 9-bit value. */
5425
5426 bool
5427 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5428 poly_int64 offset)
5429 {
5430 HOST_WIDE_INT const_offset;
5431 return (offset.is_constant (&const_offset)
5432 && IN_RANGE (const_offset, -256, 255));
5433 }
5434
5435 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5436 of MODE. */
5437
5438 static inline bool
5439 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5440 {
5441 HOST_WIDE_INT multiple;
5442 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5443 && IN_RANGE (multiple, -256, 255));
5444 }
5445
5446 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5447 of MODE. */
5448
5449 static inline bool
5450 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5451 {
5452 HOST_WIDE_INT multiple;
5453 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5454 && IN_RANGE (multiple, 0, 4095));
5455 }
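
/* A standalone sketch of the byte-offset ranges implied by the predicates
   above for a DImode (8-byte) access.  The helper name is hypothetical and
   the function is never called; it only records a few boundary values.  */

static ATTRIBUTE_UNUSED void
example_dimode_offset_ranges (void)
{
  /* 12-bit unsigned scaled: offsets 0, 8, ..., 4095 * 8 = 32760.  */
  gcc_checking_assert (offset_12bit_unsigned_scaled_p (DImode, 32760));
  gcc_checking_assert (!offset_12bit_unsigned_scaled_p (DImode, 32768));
  /* 7-bit signed scaled (LDP/STP): -64 * 8 = -512 up to 63 * 8 = 504.  */
  gcc_checking_assert (aarch64_offset_7bit_signed_scaled_p (DImode, -512));
  gcc_checking_assert (!aarch64_offset_7bit_signed_scaled_p (DImode, 512));
  /* 9-bit signed unscaled (LDUR/STUR): any byte offset in [-256, 255].  */
  gcc_checking_assert (aarch64_offset_9bit_signed_unscaled_p (DImode, -256));
}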
5456
5457 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5458
5459 static sbitmap
5460 aarch64_get_separate_components (void)
5461 {
5462 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5463 bitmap_clear (components);
5464
5465 /* The registers we need saved to the frame. */
5466 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5467 if (aarch64_register_saved_on_entry (regno))
5468 {
5469 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5470 if (!frame_pointer_needed)
5471 offset += cfun->machine->frame.frame_size
5472 - cfun->machine->frame.hard_fp_offset;
5473 /* Check that we can access the stack slot of the register with one
5474 direct load with no adjustments needed. */
5475 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5476 bitmap_set_bit (components, regno);
5477 }
5478
5479 /* Don't mess with the hard frame pointer. */
5480 if (frame_pointer_needed)
5481 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5482
5483 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5484 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5485 /* If registers have been chosen to be stored/restored with
5486 writeback, don't interfere with them to avoid having to output explicit
5487 stack adjustment instructions. */
5488 if (reg2 != INVALID_REGNUM)
5489 bitmap_clear_bit (components, reg2);
5490 if (reg1 != INVALID_REGNUM)
5491 bitmap_clear_bit (components, reg1);
5492
5493 bitmap_clear_bit (components, LR_REGNUM);
5494 bitmap_clear_bit (components, SP_REGNUM);
5495
5496 return components;
5497 }
5498
5499 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5500
5501 static sbitmap
5502 aarch64_components_for_bb (basic_block bb)
5503 {
5504 bitmap in = DF_LIVE_IN (bb);
5505 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5506 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5507 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5508
5509 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5510 bitmap_clear (components);
5511
5512 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5513 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5514 if ((!call_used_regs[regno]
5515 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5516 && (bitmap_bit_p (in, regno)
5517 || bitmap_bit_p (gen, regno)
5518 || bitmap_bit_p (kill, regno)))
5519 {
5520 unsigned regno2, offset, offset2;
5521 bitmap_set_bit (components, regno);
5522
5523 /* If there is a callee-save at an adjacent offset, add it as well,
5524 to increase the use of LDP/STP. */
5525 offset = cfun->machine->frame.reg_offset[regno];
5526 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5527
5528 if (regno2 <= LAST_SAVED_REGNUM)
5529 {
5530 offset2 = cfun->machine->frame.reg_offset[regno2];
5531 if ((offset & ~8) == (offset2 & ~8))
5532 bitmap_set_bit (components, regno2);
5533 }
5534 }
5535
5536 return components;
5537 }
5538
5539 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5540 Nothing to do for aarch64. */
5541
5542 static void
5543 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5544 {
5545 }
5546
5547 /* Return the next set bit in BMP from START onwards. Return the total number
5548 of bits in BMP if no set bit is found at or after START. */
5549
5550 static unsigned int
5551 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5552 {
5553 unsigned int nbits = SBITMAP_SIZE (bmp);
5554 if (start == nbits)
5555 return start;
5556
5557 gcc_assert (start < nbits);
5558 for (unsigned int i = start; i < nbits; i++)
5559 if (bitmap_bit_p (bmp, i))
5560 return i;
5561
5562 return nbits;
5563 }
5564
5565 /* Do the work for aarch64_emit_prologue_components and
5566 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5567 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5568 for these components or the epilogue sequence. That is, it determines
5569 whether we should emit stores or loads and what kind of CFA notes to attach
5570 to the insns. Otherwise the logic for the two sequences is very
5571 similar. */
5572
5573 static void
5574 aarch64_process_components (sbitmap components, bool prologue_p)
5575 {
5576 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5577 ? HARD_FRAME_POINTER_REGNUM
5578 : STACK_POINTER_REGNUM);
5579
5580 unsigned last_regno = SBITMAP_SIZE (components);
5581 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5582 rtx_insn *insn = NULL;
5583
5584 while (regno != last_regno)
5585 {
5586 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5587 so DFmode for the vector registers is enough. For simd functions
5588 we want to save the low 128 bits. */
5589 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5590
5591 rtx reg = gen_rtx_REG (mode, regno);
5592 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5593 if (!frame_pointer_needed)
5594 offset += cfun->machine->frame.frame_size
5595 - cfun->machine->frame.hard_fp_offset;
5596 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5597 rtx mem = gen_frame_mem (mode, addr);
5598
5599 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5600 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5601 /* No more registers to handle after REGNO.
5602 Emit a single save/restore and exit. */
5603 if (regno2 == last_regno)
5604 {
5605 insn = emit_insn (set);
5606 RTX_FRAME_RELATED_P (insn) = 1;
5607 if (prologue_p)
5608 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5609 else
5610 add_reg_note (insn, REG_CFA_RESTORE, reg);
5611 break;
5612 }
5613
5614 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5615 /* The next register is not of the same class or its offset is not
5616 mergeable with the current one into a pair. */
5617 if (!satisfies_constraint_Ump (mem)
5618 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5619 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5620 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5621 GET_MODE_SIZE (mode)))
5622 {
5623 insn = emit_insn (set);
5624 RTX_FRAME_RELATED_P (insn) = 1;
5625 if (prologue_p)
5626 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5627 else
5628 add_reg_note (insn, REG_CFA_RESTORE, reg);
5629
5630 regno = regno2;
5631 continue;
5632 }
5633
5634 /* REGNO2 can be saved/restored in a pair with REGNO. */
5635 rtx reg2 = gen_rtx_REG (mode, regno2);
5636 if (!frame_pointer_needed)
5637 offset2 += cfun->machine->frame.frame_size
5638 - cfun->machine->frame.hard_fp_offset;
5639 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5640 rtx mem2 = gen_frame_mem (mode, addr2);
5641 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5642 : gen_rtx_SET (reg2, mem2);
5643
5644 if (prologue_p)
5645 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5646 else
5647 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5648
5649 RTX_FRAME_RELATED_P (insn) = 1;
5650 if (prologue_p)
5651 {
5652 add_reg_note (insn, REG_CFA_OFFSET, set);
5653 add_reg_note (insn, REG_CFA_OFFSET, set2);
5654 }
5655 else
5656 {
5657 add_reg_note (insn, REG_CFA_RESTORE, reg);
5658 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5659 }
5660
5661 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5662 }
5663 }
5664
5665 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5666
5667 static void
5668 aarch64_emit_prologue_components (sbitmap components)
5669 {
5670 aarch64_process_components (components, true);
5671 }
5672
5673 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5674
5675 static void
5676 aarch64_emit_epilogue_components (sbitmap components)
5677 {
5678 aarch64_process_components (components, false);
5679 }
5680
5681 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5682
5683 static void
5684 aarch64_set_handled_components (sbitmap components)
5685 {
5686 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5687 if (bitmap_bit_p (components, regno))
5688 cfun->machine->reg_is_wrapped_separately[regno] = true;
5689 }
5690
5691 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
5692 determine the probe offset for alloca. */
5693
5694 static HOST_WIDE_INT
5695 aarch64_stack_clash_protection_alloca_probe_range (void)
5696 {
5697 return STACK_CLASH_CALLER_GUARD;
5698 }
5699
5700
5701 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5702 registers. If POLY_SIZE is not large enough to require a probe, this
5703 function only adjusts the stack. When allocating the stack space,
5704 FRAME_RELATED_P is used to indicate whether the allocation is frame related.
5705 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5706 arguments area. If we are, we ensure that any allocation larger than the ABI
5707 defined buffer requires a probe, so that the invariant of having a 1KB buffer
5708 is maintained.
5709
5710 We emit barriers after each stack adjustment to prevent optimizations from
5711 breaking the invariant that we never drop the stack by more than a page.
5712 This invariant makes it easier to handle asynchronous events correctly:
5713 if we allowed the stack to be dropped by more than a page before the
5714 corresponding probes, and a signal arrived somewhere in between, the signal
5715 handler would not know the state of the stack and could make no assumptions
5716 about which pages had been probed. */
5717
5718 static void
5719 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5720 poly_int64 poly_size,
5721 bool frame_related_p,
5722 bool final_adjustment_p)
5723 {
5724 HOST_WIDE_INT guard_size
5725 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5726 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5727 /* When doing the final adjustment for the outgoing argument size we can't
5728 assume that LR was saved at position 0. So subtract its offset from the
5729 ABI safe buffer so that we don't accidentally allow an adjustment that
5730 would result in an allocation larger than the ABI buffer without
5731 probing. */
5732 HOST_WIDE_INT min_probe_threshold
5733 = final_adjustment_p
5734 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5735 : guard_size - guard_used_by_caller;
5736
5737 poly_int64 frame_size = cfun->machine->frame.frame_size;
5738
5739 /* We should always have a positive probe threshold. */
5740 gcc_assert (min_probe_threshold > 0);
5741
5742 if (flag_stack_clash_protection && !final_adjustment_p)
5743 {
5744 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5745 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5746
5747 if (known_eq (frame_size, 0))
5748 {
5749 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5750 }
5751 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5752 && known_lt (final_adjust, guard_used_by_caller))
5753 {
5754 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5755 }
5756 }
5757
5758 /* If SIZE is not large enough to require probing, just adjust the stack and
5759 exit. */
5760 if (known_lt (poly_size, min_probe_threshold)
5761 || !flag_stack_clash_protection)
5762 {
5763 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5764 return;
5765 }
5766
5767 HOST_WIDE_INT size;
5768 /* Handle the SVE non-constant case first. */
5769 if (!poly_size.is_constant (&size))
5770 {
5771 if (dump_file)
5772 {
5773 fprintf (dump_file, "Stack clash SVE prologue: ");
5774 print_dec (poly_size, dump_file);
5775 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5776 }
5777
5778 /* First calculate the amount of bytes we're actually spilling. */
5779 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5780 poly_size, temp1, temp2, false, true);
5781
5782 rtx_insn *insn = get_last_insn ();
5783
5784 if (frame_related_p)
5785 {
5786 /* This is done to provide unwinding information for the stack
5787 adjustments we're about to do, however to prevent the optimizers
5788 from removing the R11 move and leaving the CFA note (which would be
5789 very wrong) we tie the old and new stack pointer together.
5790 The tie will expand to nothing but the optimizers will not touch
5791 the instruction. */
5792 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5793 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5794 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5795
5796 /* We want the CFA independent of the stack pointer for the
5797 duration of the loop. */
5798 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5799 RTX_FRAME_RELATED_P (insn) = 1;
5800 }
5801
5802 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5803 rtx guard_const = gen_int_mode (guard_size, Pmode);
5804
5805 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5806 stack_pointer_rtx, temp1,
5807 probe_const, guard_const));
5808
5809 /* Now reset the CFA register if needed. */
5810 if (frame_related_p)
5811 {
5812 add_reg_note (insn, REG_CFA_DEF_CFA,
5813 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5814 gen_int_mode (poly_size, Pmode)));
5815 RTX_FRAME_RELATED_P (insn) = 1;
5816 }
5817
5818 return;
5819 }
5820
5821 if (dump_file)
5822 fprintf (dump_file,
5823 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5824 " bytes, probing will be required.\n", size);
5825
5826 /* Round size down to a multiple of guard_size, and calculate the
5827 residual as the difference between the original size and the rounded
5828 size. */
5829 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5830 HOST_WIDE_INT residual = size - rounded_size;
5831
5832 /* We can handle a small number of allocations/probes inline. Otherwise
5833 punt to a loop. */
5834 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5835 {
5836 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5837 {
5838 aarch64_sub_sp (NULL, temp2, guard_size, true);
5839 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5840 guard_used_by_caller));
5841 emit_insn (gen_blockage ());
5842 }
5843 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5844 }
5845 else
5846 {
5847 /* Compute the ending address. */
5848 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5849 temp1, NULL, false, true);
5850 rtx_insn *insn = get_last_insn ();
5851
5852 /* For the initial allocation, we don't have a frame pointer
5853 set up, so we always need CFI notes. If we're doing the
5854 final allocation, then we may have a frame pointer, in which
5855 case it is the CFA, otherwise we need CFI notes.
5856
5857 We can determine which allocation we are doing by looking at
5858 the value of FRAME_RELATED_P since the final allocations are not
5859 frame related. */
5860 if (frame_related_p)
5861 {
5862 /* We want the CFA independent of the stack pointer for the
5863 duration of the loop. */
5864 add_reg_note (insn, REG_CFA_DEF_CFA,
5865 plus_constant (Pmode, temp1, rounded_size));
5866 RTX_FRAME_RELATED_P (insn) = 1;
5867 }
5868
5869 /* This allocates and probes the stack. Note that this re-uses some of
5870 the existing Ada stack protection code. However, we are guaranteed not
5871 to enter the non-loop or residual branches of that code.
5872
5873 The non-loop part won't be entered because if our allocation amount
5874 doesn't require a loop, the case above would handle it.
5875
5876 The residual amount won't be entered because TEMP1 is a multiple of
5877 the allocation size. The residual will always be 0. As such, the only
5878 part we are actually using from that code is the loop setup. The
5879 actual probing is done in aarch64_output_probe_stack_range. */
5880 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5881 stack_pointer_rtx, temp1));
5882
5883 /* Now reset the CFA register if needed. */
5884 if (frame_related_p)
5885 {
5886 add_reg_note (insn, REG_CFA_DEF_CFA,
5887 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5888 RTX_FRAME_RELATED_P (insn) = 1;
5889 }
5890
5891 emit_insn (gen_blockage ());
5892 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5893 }
5894
5895 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5896 be probed. This maintains the requirement that each page is probed at
5897 least once. For initial probing we probe only if the allocation is
5898 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5899 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5900 GUARD_SIZE. This means that for any allocation that is large enough to
5901 trigger a probe here, we'll have at least one, and if an allocation is not
5902 large enough for this code to emit anything for it, the page would have been
5903 probed by the saving of FP/LR, either by this function or any callees. If
5904 we don't have any callees then we won't have more stack adjustments and so
5905 are still safe. */
5906 if (residual)
5907 {
5908 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5909 /* If we're doing final adjustments, and we've done any full page
5910 allocations then any residual needs to be probed. */
5911 if (final_adjustment_p && rounded_size != 0)
5912 min_probe_threshold = 0;
5913 /* If doing a small final adjustment, we always probe at offset 0.
5914 This is done to avoid issues when LR is not at position 0 or when
5915 the final adjustment is smaller than the probing offset. */
5916 else if (final_adjustment_p && rounded_size == 0)
5917 residual_probe_offset = 0;
5918
5919 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5920 if (residual >= min_probe_threshold)
5921 {
5922 if (dump_file)
5923 fprintf (dump_file,
5924 "Stack clash AArch64 prologue residuals: "
5925 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5926 "\n", residual);
5927
5928 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5929 residual_probe_offset));
5930 emit_insn (gen_blockage ());
5931 }
5932 }
5933 }
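
/* A worked example of the splitting arithmetic above, under the default
   assumptions of a 64KB guard and the 1KB caller buffer (the helper name and
   the 260KB figure are purely illustrative).  260KB splits into a 256KB part
   that is probed page by page and a 4KB residual; the residual is below the
   63KB threshold used for the initial adjustment and so relies on the
   subsequent FP/LR saves acting as implicit probes.  */

static ATTRIBUTE_UNUSED void
example_stack_clash_split (void)
{
  HOST_WIDE_INT guard_size = 64 * 1024;
  HOST_WIDE_INT size = 260 * 1024;
  HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
  HOST_WIDE_INT residual = size - rounded_size;
  gcc_checking_assert (rounded_size == 256 * 1024);
  gcc_checking_assert (residual == 4 * 1024);
}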
5934
5935 /* Return 1 if the register is used by the epilogue. We need to say the
5936 return register is used, but only after epilogue generation is complete.
5937 Note that in the case of sibcalls, the values "used by the epilogue" are
5938 considered live at the start of the called function.
5939
5940 For SIMD functions we need to return 1 for FP registers that are saved and
5941 restored by a function but are not zero in call_used_regs. If we do not do
5942 this, optimizations may remove the restore of the register. */
5943
5944 int
5945 aarch64_epilogue_uses (int regno)
5946 {
5947 if (epilogue_completed)
5948 {
5949 if (regno == LR_REGNUM)
5950 return 1;
5951 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5952 return 1;
5953 }
5954 return 0;
5955 }
5956
5957 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5958 is saved at BASE + OFFSET. */
5959
5960 static void
5961 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5962 rtx base, poly_int64 offset)
5963 {
5964 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5965 add_reg_note (insn, REG_CFA_EXPRESSION,
5966 gen_rtx_SET (mem, regno_reg_rtx[reg]));
5967 }
5968
5969 /* AArch64 stack frames generated by this compiler look like:
5970
5971 +-------------------------------+
5972 | |
5973 | incoming stack arguments |
5974 | |
5975 +-------------------------------+
5976 | | <-- incoming stack pointer (aligned)
5977 | callee-allocated save area |
5978 | for register varargs |
5979 | |
5980 +-------------------------------+
5981 | local variables | <-- frame_pointer_rtx
5982 | |
5983 +-------------------------------+
5984 | padding | \
5985 +-------------------------------+ |
5986 | callee-saved registers | | frame.saved_regs_size
5987 +-------------------------------+ |
5988 | LR' | |
5989 +-------------------------------+ |
5990 | FP' | / <- hard_frame_pointer_rtx (aligned)
5991 +-------------------------------+
5992 | dynamic allocation |
5993 +-------------------------------+
5994 | padding |
5995 +-------------------------------+
5996 | outgoing stack arguments | <-- arg_pointer
5997 | |
5998 +-------------------------------+
5999 | | <-- stack_pointer_rtx (aligned)
6000
6001 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6002 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6003 unchanged.
6004
6005 By default for stack-clash we assume the guard is at least 64KB, but this
6006 value is configurable to either 4KB or 64KB. We also force the guard size to
6007 be the same as the probing interval and both values are kept in sync.
6008
6009 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6010 on the guard size) of stack space without probing.
6011
6012 When probing is needed, we emit a probe at the start of the prologue
6013 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6014
6015 We have to track how much space has been allocated and the only stores
6016 to the stack we track as implicit probes are the FP/LR stores.
6017
6018 For outgoing arguments we probe if the size is larger than 1KB, such that
6019 the ABI specified buffer is maintained for the next callee.
6020
6021 The following registers are reserved during frame layout and should not be
6022 used for any other purpose:
6023
6024 - r11: Used by stack clash protection when SVE is enabled.
6025 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6026 - r14 and r15: Used for speculation tracking.
6027 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6028 - r30(LR), r29(FP): Used by standard frame layout.
6029
6030 These registers must be avoided in frame layout related code unless the
6031 explicit intention is to interact with one of the features listed above. */
6032
6033 /* Generate the prologue instructions for entry into a function.
6034 Establish the stack frame by decreasing the stack pointer with a
6035 properly calculated size and, if necessary, create a frame record
6036 filled with the values of LR and previous frame pointer. The
6037 current FP is also set up if it is in use. */
6038
6039 void
6040 aarch64_expand_prologue (void)
6041 {
6042 poly_int64 frame_size = cfun->machine->frame.frame_size;
6043 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6044 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6045 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6046 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6047 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6048 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6049 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6050 rtx_insn *insn;
6051
6052 /* Sign return address for functions. */
6053 if (aarch64_return_address_signing_enabled ())
6054 {
6055 switch (aarch64_ra_sign_key)
6056 {
6057 case AARCH64_KEY_A:
6058 insn = emit_insn (gen_paciasp ());
6059 break;
6060 case AARCH64_KEY_B:
6061 insn = emit_insn (gen_pacibsp ());
6062 break;
6063 default:
6064 gcc_unreachable ();
6065 }
6066 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6067 RTX_FRAME_RELATED_P (insn) = 1;
6068 }
6069
6070 if (flag_stack_usage_info)
6071 current_function_static_stack_size = constant_lower_bound (frame_size);
6072
6073 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6074 {
6075 if (crtl->is_leaf && !cfun->calls_alloca)
6076 {
6077 if (maybe_gt (frame_size, PROBE_INTERVAL)
6078 && maybe_gt (frame_size, get_stack_check_protect ()))
6079 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6080 (frame_size
6081 - get_stack_check_protect ()));
6082 }
6083 else if (maybe_gt (frame_size, 0))
6084 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6085 }
6086
6087 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6088 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6089
6090 /* In theory we should never have both an initial adjustment
6091 and a callee save adjustment. Verify that is the case since the
6092 code below does not handle it for -fstack-clash-protection. */
6093 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6094
6095 /* Will only probe if the initial adjustment is larger than the guard
6096 less the amount of the guard reserved for use by the caller's
6097 outgoing args. */
6098 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6099 true, false);
6100
6101 if (callee_adjust != 0)
6102 aarch64_push_regs (reg1, reg2, callee_adjust);
6103
6104 if (emit_frame_chain)
6105 {
6106 poly_int64 reg_offset = callee_adjust;
6107 if (callee_adjust == 0)
6108 {
6109 reg1 = R29_REGNUM;
6110 reg2 = R30_REGNUM;
6111 reg_offset = callee_offset;
6112 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6113 }
6114 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6115 stack_pointer_rtx, callee_offset,
6116 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6117 if (frame_pointer_needed && !frame_size.is_constant ())
6118 {
6119 /* Variable-sized frames need to describe the save slot
6120 address using DW_CFA_expression rather than DW_CFA_offset.
6121 This means that, without taking further action, the
6122 locations of the registers that we've already saved would
6123 remain based on the stack pointer even after we redefine
6124 the CFA based on the frame pointer. We therefore need new
6125 DW_CFA_expressions to re-express the save slots with addresses
6126 based on the frame pointer. */
6127 rtx_insn *insn = get_last_insn ();
6128 gcc_assert (RTX_FRAME_RELATED_P (insn));
6129
6130 /* Add an explicit CFA definition if this was previously
6131 implicit. */
6132 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6133 {
6134 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6135 callee_offset);
6136 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6137 gen_rtx_SET (hard_frame_pointer_rtx, src));
6138 }
6139
6140 /* Change the save slot expressions for the registers that
6141 we've already saved. */
6142 reg_offset -= callee_offset;
6143 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6144 reg_offset + UNITS_PER_WORD);
6145 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6146 reg_offset);
6147 }
6148 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6149 }
6150
6151 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6152 callee_adjust != 0 || emit_frame_chain);
6153 if (aarch64_simd_decl_p (cfun->decl))
6154 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6155 callee_adjust != 0 || emit_frame_chain);
6156 else
6157 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6158 callee_adjust != 0 || emit_frame_chain);
6159
6160 /* We may need to probe the final adjustment if it is larger than the guard
6161 that is assumed by the callee. */
6162 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6163 !frame_pointer_needed, true);
6164 }
6165
6166 /* Return TRUE if we can use a simple_return insn.
6167
6168 This function checks whether the callee-saved stack is empty, which
6169 means no restore actions are needed. The pro_and_epilogue pass will use
6170 this to check whether the shrink-wrapping optimization is feasible. */
6171
6172 bool
6173 aarch64_use_return_insn_p (void)
6174 {
6175 if (!reload_completed)
6176 return false;
6177
6178 if (crtl->profile)
6179 return false;
6180
6181 return known_eq (cfun->machine->frame.frame_size, 0);
6182 }
6183
6184 /* Return false for non-leaf SIMD functions in order to avoid
6185 shrink-wrapping them. Doing this will lose the necessary
6186 save/restore of FP registers. */
6187
6188 bool
6189 aarch64_use_simple_return_insn_p (void)
6190 {
6191 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6192 return false;
6193
6194 return true;
6195 }
6196
6197 /* Generate the epilogue instructions for returning from a function.
6198 This is almost exactly the reverse of the prologue sequence, except
6199 that we need to insert barriers to avoid scheduling loads that read
6200 from a deallocated stack, and we optimize the unwind records by
6201 emitting them all together if possible. */
6202 void
6203 aarch64_expand_epilogue (bool for_sibcall)
6204 {
6205 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6206 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6207 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6208 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6209 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6210 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6211 rtx cfi_ops = NULL;
6212 rtx_insn *insn;
6213 /* A stack clash protection prologue may not have left EP0_REGNUM or
6214 EP1_REGNUM in a usable state. The same is true for allocations
6215 with an SVE component, since we then need both temporary registers
6216 for each allocation. For stack clash we are in a usable state if
6217 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6218 HOST_WIDE_INT guard_size
6219 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6220 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6221
6222 /* We can re-use the registers when the allocation amount is smaller than
6223 guard_size - guard_used_by_caller because we won't be doing any probes
6224 then. In such situations the register should remain live with the correct
6225 value. */
6226 bool can_inherit_p = (initial_adjust.is_constant ()
6227 && final_adjust.is_constant ())
6228 && (!flag_stack_clash_protection
6229 || known_lt (initial_adjust,
6230 guard_size - guard_used_by_caller));
6231
6232 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6233 bool need_barrier_p
6234 = maybe_ne (get_frame_size ()
6235 + cfun->machine->frame.saved_varargs_size, 0);
6236
6237 /* Emit a barrier to prevent loads from a deallocated stack. */
6238 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6239 || cfun->calls_alloca
6240 || crtl->calls_eh_return)
6241 {
6242 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6243 need_barrier_p = false;
6244 }
6245
6246 /* Restore the stack pointer from the frame pointer if it may not
6247 be the same as the stack pointer. */
6248 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6249 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6250 if (frame_pointer_needed
6251 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6252 /* If writeback is used when restoring callee-saves, the CFA
6253 is restored on the instruction doing the writeback. */
6254 aarch64_add_offset (Pmode, stack_pointer_rtx,
6255 hard_frame_pointer_rtx, -callee_offset,
6256 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6257 else
6258 /* The case where we need to re-use the register here is very rare, so
6259 avoid the complicated condition and just always emit a move if the
6260 immediate doesn't fit. */
6261 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6262
6263 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6264 callee_adjust != 0, &cfi_ops);
6265 if (aarch64_simd_decl_p (cfun->decl))
6266 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6267 callee_adjust != 0, &cfi_ops);
6268 else
6269 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6270 callee_adjust != 0, &cfi_ops);
6271
6272 if (need_barrier_p)
6273 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6274
6275 if (callee_adjust != 0)
6276 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6277
6278 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6279 {
6280 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6281 insn = get_last_insn ();
6282 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6283 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6284 RTX_FRAME_RELATED_P (insn) = 1;
6285 cfi_ops = NULL;
6286 }
6287
6288 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6289 restrict the emit_move optimization to leaf functions. */
6290 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6291 (!can_inherit_p || !crtl->is_leaf
6292 || df_regs_ever_live_p (EP0_REGNUM)));
6293
6294 if (cfi_ops)
6295 {
6296 /* Emit delayed restores and reset the CFA to be SP. */
6297 insn = get_last_insn ();
6298 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6299 REG_NOTES (insn) = cfi_ops;
6300 RTX_FRAME_RELATED_P (insn) = 1;
6301 }
6302
6303 /* We prefer to emit the combined return/authenticate instruction RETAA,
6304 however there are three cases in which we must instead emit an explicit
6305 authentication instruction.
6306
6307 1) Sibcalls don't return in a normal way, so if we're about to call one
6308 we must authenticate.
6309
6310 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6311 generating code for !TARGET_ARMV8_3 we can't use it and must
6312 explicitly authenticate.
6313
6314 3) On an eh_return path we make extra stack adjustments to update the
6315 canonical frame address to be the exception handler's CFA. We want
6316 to authenticate using the CFA of the function which calls eh_return.
6317 */
6318 if (aarch64_return_address_signing_enabled ()
6319 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6320 {
6321 switch (aarch64_ra_sign_key)
6322 {
6323 case AARCH64_KEY_A:
6324 insn = emit_insn (gen_autiasp ());
6325 break;
6326 case AARCH64_KEY_B:
6327 insn = emit_insn (gen_autibsp ());
6328 break;
6329 default:
6330 gcc_unreachable ();
6331 }
6332 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6333 RTX_FRAME_RELATED_P (insn) = 1;
6334 }
6335
6336 /* Stack adjustment for exception handler. */
6337 if (crtl->calls_eh_return && !for_sibcall)
6338 {
6339 /* We need to unwind the stack by the offset computed by
6340 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6341 to be SP; letting the CFA move during this adjustment
6342 is just as correct as retaining the CFA from the body
6343 of the function. Therefore, do nothing special. */
6344 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6345 }
6346
6347 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6348 if (!for_sibcall)
6349 emit_jump_insn (ret_rtx);
6350 }
6351
6352 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6353 normally or return to a previous frame after unwinding.
6354
6355 An EH return uses a single shared return sequence. The epilogue is
6356 exactly like a normal epilogue except that it has an extra input
6357 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6358 that must be applied after the frame has been destroyed. An extra label
6359 is inserted before the epilogue which initializes this register to zero,
6360 and this is the entry point for a normal return.
6361
6362 An actual EH return updates the return address, initializes the stack
6363 adjustment and jumps directly into the epilogue (bypassing the zeroing
6364 of the adjustment). Since the return address is typically saved on the
6365 stack when a function makes a call, the saved LR must be updated outside
6366 the epilogue.
6367
6368 This poses problems as the store is generated well before the epilogue,
6369 so the offset of LR is not known yet. Also optimizations will remove the
6370 store as it appears dead, even after the epilogue is generated (as the
6371 base or offset for loading LR is different in many cases).
6372
6373 To avoid these problems this implementation forces the frame pointer
6374 in eh_return functions so that the location of LR is fixed and known early.
6375 It also marks the store volatile, so no optimization is permitted to
6376 remove the store. */
6377 rtx
6378 aarch64_eh_return_handler_rtx (void)
6379 {
6380 rtx tmp = gen_frame_mem (Pmode,
6381 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6382
6383 /* Mark the store volatile, so no optimization is permitted to remove it. */
6384 MEM_VOLATILE_P (tmp) = true;
6385 return tmp;
6386 }
6387
6388 /* Output code to add DELTA to the first argument, and then jump
6389 to FUNCTION. Used for C++ multiple inheritance. */
6390 static void
6391 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6392 HOST_WIDE_INT delta,
6393 HOST_WIDE_INT vcall_offset,
6394 tree function)
6395 {
6396 /* The this pointer is always in x0. Note that this differs from
6397 Arm where the this pointer may be bumped to r1 if r0 is required
6398 to return a pointer to an aggregate. On AArch64 a result value
6399 pointer will be in x8. */
6400 int this_regno = R0_REGNUM;
6401 rtx this_rtx, temp0, temp1, addr, funexp;
6402 rtx_insn *insn;
6403 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6404
6405 if (aarch64_bti_enabled ())
6406 emit_insn (gen_bti_c ());
6407
6408 reload_completed = 1;
6409 emit_note (NOTE_INSN_PROLOGUE_END);
6410
6411 this_rtx = gen_rtx_REG (Pmode, this_regno);
6412 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6413 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6414
6415 if (vcall_offset == 0)
6416 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6417 else
6418 {
6419 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6420
6421 addr = this_rtx;
6422 if (delta != 0)
6423 {
6424 if (delta >= -256 && delta < 256)
6425 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6426 plus_constant (Pmode, this_rtx, delta));
6427 else
6428 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6429 temp1, temp0, false);
6430 }
6431
6432 if (Pmode == ptr_mode)
6433 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6434 else
6435 aarch64_emit_move (temp0,
6436 gen_rtx_ZERO_EXTEND (Pmode,
6437 gen_rtx_MEM (ptr_mode, addr)));
6438
6439 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6440 addr = plus_constant (Pmode, temp0, vcall_offset);
6441 else
6442 {
6443 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6444 Pmode);
6445 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6446 }
6447
6448 if (Pmode == ptr_mode)
6449 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6450 else
6451 aarch64_emit_move (temp1,
6452 gen_rtx_SIGN_EXTEND (Pmode,
6453 gen_rtx_MEM (ptr_mode, addr)));
6454
6455 emit_insn (gen_add2_insn (this_rtx, temp1));
6456 }
6457
6458 /* Generate a tail call to the target function. */
6459 if (!TREE_USED (function))
6460 {
6461 assemble_external (function);
6462 TREE_USED (function) = 1;
6463 }
6464 funexp = XEXP (DECL_RTL (function), 0);
6465 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6466 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6467 SIBLING_CALL_P (insn) = 1;
6468
6469 insn = get_insns ();
6470 shorten_branches (insn);
6471
6472 assemble_start_function (thunk, fnname);
6473 final_start_function (insn, file, 1);
6474 final (insn, file, 1);
6475 final_end_function ();
6476 assemble_end_function (thunk, fnname);
6477
6478 /* Stop pretending to be a post-reload pass. */
6479 reload_completed = 0;
6480 }
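
/* For reference, a C model of the "this"-pointer adjustment that the RTL
   above performs (the helper name is hypothetical and the real thunk is
   emitted as RTL, not compiled from C; ILP32 extension handling is omitted):
   add DELTA, optionally add the adjustment found in the vtable at
   VCALL_OFFSET, then tail-call FUNCTION.  */

static ATTRIBUTE_UNUSED void *
example_thunk_this_adjustment (void *this_ptr, HOST_WIDE_INT delta,
			       HOST_WIDE_INT vcall_offset)
{
  char *p = (char *) this_ptr + delta;
  if (vcall_offset != 0)
    {
      /* Load the vtable pointer from the (already delta-adjusted) object,
	 then the extra adjustment stored in the vtable.  */
      char *vtable = *(char **) p;
      HOST_WIDE_INT extra = *(HOST_WIDE_INT *) (vtable + vcall_offset);
      p += extra;
    }
  return p;
}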
6481
6482 static bool
6483 aarch64_tls_referenced_p (rtx x)
6484 {
6485 if (!TARGET_HAVE_TLS)
6486 return false;
6487 subrtx_iterator::array_type array;
6488 FOR_EACH_SUBRTX (iter, array, x, ALL)
6489 {
6490 const_rtx x = *iter;
6491 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6492 return true;
6493 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6494 TLS offsets, not real symbol references. */
6495 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6496 iter.skip_subrtxes ();
6497 }
6498 return false;
6499 }
6500
6501
6502 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6503 a left shift of 0 or 12 bits. */
6504 bool
6505 aarch64_uimm12_shift (HOST_WIDE_INT val)
6506 {
6507 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6508 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6509 );
6510 }
6511
6512 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6513 that can be created with a left shift of 0 or 12. */
6514 static HOST_WIDE_INT
6515 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6516 {
6517 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6518 handle correctly. */
6519 gcc_assert ((val & 0xffffff) == val);
6520
6521 if (((val & 0xfff) << 0) == val)
6522 return val;
6523
6524 return val & (0xfff << 12);
6525 }
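
/* Illustrative values for the two helpers above (the function name is
   hypothetical and the routine is never called): 0xabc and 0xabc000 are
   directly encodable ADD/SUB immediates, while 0xabcdef is not and clamps
   to 0xabc000, the part a single ADD can handle.  */

static ATTRIBUTE_UNUSED void
example_uimm12_values (void)
{
  gcc_checking_assert (aarch64_uimm12_shift (0xabc));
  gcc_checking_assert (aarch64_uimm12_shift (0xabc000));
  gcc_checking_assert (!aarch64_uimm12_shift (0xabcdef));
  gcc_checking_assert (aarch64_clamp_to_uimm12_shift (0xabcdef) == 0xabc000);
}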
6526
6527 /* Return true if val is an immediate that can be loaded into a
6528 register by a MOVZ instruction. */
6529 static bool
6530 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6531 {
6532 if (GET_MODE_SIZE (mode) > 4)
6533 {
6534 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6535 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6536 return 1;
6537 }
6538 else
6539 {
6540 /* Ignore sign extension. */
6541 val &= (HOST_WIDE_INT) 0xffffffff;
6542 }
6543 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6544 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6545 }
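
/* A few example values for the MOVZ test above (hypothetical helper, never
   called): any single aligned 16-bit chunk is accepted, while a value that
   spans two chunks is rejected and must instead be built via MOVN, a bitmask
   immediate or a MOV/MOVK sequence.  */

static ATTRIBUTE_UNUSED void
example_movw_values (void)
{
  gcc_checking_assert (aarch64_movw_imm (0x1234, DImode));
  gcc_checking_assert (aarch64_movw_imm (((HOST_WIDE_INT) 0xffff) << 32,
					 DImode));
  gcc_checking_assert (!aarch64_movw_imm (0x123456, DImode));
}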
6546
6547 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6548 64-bit (DImode) integer. */
6549
6550 static unsigned HOST_WIDE_INT
6551 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6552 {
6553 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6554 while (size < 64)
6555 {
6556 val &= (HOST_WIDE_INT_1U << size) - 1;
6557 val |= val << size;
6558 size *= 2;
6559 }
6560 return val;
6561 }
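
/* Example of the replication above (hypothetical, never-called helper):
   a QImode element 0xa5 fills all eight bytes of the DImode result.  */

static ATTRIBUTE_UNUSED void
example_replicate_bitmask (void)
{
  gcc_checking_assert (aarch64_replicate_bitmask_imm (0xa5, QImode)
		       == HOST_WIDE_INT_UC (0xa5a5a5a5a5a5a5a5));
}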
6562
6563 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6564
6565 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6566 {
6567 0x0000000100000001ull,
6568 0x0001000100010001ull,
6569 0x0101010101010101ull,
6570 0x1111111111111111ull,
6571 0x5555555555555555ull,
6572 };
6573
6574
6575 /* Return true if val is a valid bitmask immediate. */
6576
6577 bool
6578 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6579 {
6580 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6581 int bits;
6582
6583 /* Check for a single sequence of one bits and return quickly if so.
6584 The special cases of all ones and all zeroes return false. */
6585 val = aarch64_replicate_bitmask_imm (val_in, mode);
6586 tmp = val + (val & -val);
6587
6588 if (tmp == (tmp & -tmp))
6589 return (val + 1) > 1;
6590
6591 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6592 if (mode == SImode)
6593 val = (val << 32) | (val & 0xffffffff);
6594
6595 /* Invert if the immediate doesn't start with a zero bit - this means we
6596 only need to search for sequences of one bits. */
6597 if (val & 1)
6598 val = ~val;
6599
6600 /* Find the first set bit and set tmp to val with the first sequence of one
6601 bits removed. Return success if there is a single sequence of ones. */
6602 first_one = val & -val;
6603 tmp = val & (val + first_one);
6604
6605 if (tmp == 0)
6606 return true;
6607
6608 /* Find the next set bit and compute the difference in bit position. */
6609 next_one = tmp & -tmp;
6610 bits = clz_hwi (first_one) - clz_hwi (next_one);
6611 mask = val ^ tmp;
6612
6613 /* Check the bit position difference is a power of 2, and that the first
6614 sequence of one bits fits within 'bits' bits. */
6615 if ((mask >> bits) != 0 || bits != (bits & -bits))
6616 return false;
6617
6618 /* Check the sequence of one bits is repeated 64/bits times. */
6619 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6620 }
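
/* Some concrete inputs for the test above (hypothetical helper, not part of
   the normal compilation path): a repeated 16-bit element of eight ones and
   a single contiguous run are accepted, while all-zeros and all-ones are
   rejected as documented.  */

static ATTRIBUTE_UNUSED void
example_bitmask_values (void)
{
  gcc_checking_assert (aarch64_bitmask_imm (HOST_WIDE_INT_UC (0x00ff00ff00ff00ff),
					    DImode));
  gcc_checking_assert (aarch64_bitmask_imm (0x0ff0, DImode));
  gcc_checking_assert (!aarch64_bitmask_imm (0, DImode));
  gcc_checking_assert (!aarch64_bitmask_imm (-1, DImode));
}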
6621
6622 /* Create a mask of ones covering the range from the lowest to the highest
6623 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
6624
6625 unsigned HOST_WIDE_INT
6626 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6627 {
6628 int lowest_bit_set = ctz_hwi (val_in);
6629 int highest_bit_set = floor_log2 (val_in);
6630 gcc_assert (val_in != 0);
6631
6632 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6633 (HOST_WIDE_INT_1U << lowest_bit_set));
6634 }
6635
6636 /* Create a constant in which all bits outside the range from the lowest to
6637 the highest bit set in VAL_IN are set to 1. */
6638
6639 unsigned HOST_WIDE_INT
6640 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6641 {
6642 return val_in | ~aarch64_and_split_imm1 (val_in);
6643 }
6644
6645 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6646
6647 bool
6648 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6649 {
6650 scalar_int_mode int_mode;
6651 if (!is_a <scalar_int_mode> (mode, &int_mode))
6652 return false;
6653
6654 if (aarch64_bitmask_imm (val_in, int_mode))
6655 return false;
6656
6657 if (aarch64_move_imm (val_in, int_mode))
6658 return false;
6659
6660 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6661
6662 return aarch64_bitmask_imm (imm2, int_mode);
6663 }
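
/* A worked example of the AND splitting above (hypothetical helper, purely
   illustrative): 0x00f000f0 is neither a bitmask nor a MOV immediate, but
   both halves of the split are valid bitmasks, so "and x0, x1, #0x00f000f0"
   can be rewritten as two AND-immediate instructions whose masks intersect
   in the original value.  */

static ATTRIBUTE_UNUSED void
example_and_split (void)
{
  unsigned HOST_WIDE_INT val = 0x00f000f0;
  unsigned HOST_WIDE_INT imm1 = aarch64_and_split_imm1 (val);
  unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val);
  gcc_checking_assert (imm1 == 0x00fffff0);
  gcc_checking_assert (imm2 == HOST_WIDE_INT_UC (0xfffffffffff000ff));
  gcc_checking_assert ((imm1 & imm2) == val);
  gcc_checking_assert (aarch64_and_bitmask_imm (val, DImode));
}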
6664
6665 /* Return true if val is an immediate that can be loaded into a
6666 register in a single instruction. */
6667 bool
6668 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6669 {
6670 scalar_int_mode int_mode;
6671 if (!is_a <scalar_int_mode> (mode, &int_mode))
6672 return false;
6673
6674 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6675 return 1;
6676 return aarch64_bitmask_imm (val, int_mode);
6677 }
6678
6679 static bool
6680 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6681 {
6682 rtx base, offset;
6683
6684 if (GET_CODE (x) == HIGH)
6685 return true;
6686
6687 /* There's no way to calculate VL-based values using relocations. */
6688 subrtx_iterator::array_type array;
6689 FOR_EACH_SUBRTX (iter, array, x, ALL)
6690 if (GET_CODE (*iter) == CONST_POLY_INT)
6691 return true;
6692
6693 split_const (x, &base, &offset);
6694 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6695 {
6696 if (aarch64_classify_symbol (base, INTVAL (offset))
6697 != SYMBOL_FORCE_TO_MEM)
6698 return true;
6699 else
6700 /* Avoid generating a 64-bit relocation in ILP32; leave
6701 to aarch64_expand_mov_immediate to handle it properly. */
6702 return mode != ptr_mode;
6703 }
6704
6705 return aarch64_tls_referenced_p (x);
6706 }
6707
6708 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6709 The expansion for a table switch is quite expensive due to the number
6710 of instructions, the table lookup and the hard-to-predict indirect jump.
6711 When optimizing for speed with -O3 enabled, use the per-core tuning if
6712 set, otherwise use tables for > 16 cases as a tradeoff between size and
6713 performance. When optimizing for size, use the default setting. */
6714
6715 static unsigned int
6716 aarch64_case_values_threshold (void)
6717 {
6718 /* Use the specified limit for the number of cases before using jump
6719 tables at higher optimization levels. */
6720 if (optimize > 2
6721 && selected_cpu->tune->max_case_values != 0)
6722 return selected_cpu->tune->max_case_values;
6723 else
6724 return optimize_size ? default_case_values_threshold () : 17;
6725 }
6726
6727 /* Return true if register REGNO is a valid index register.
6728 STRICT_P is true if REG_OK_STRICT is in effect. */
6729
6730 bool
6731 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6732 {
6733 if (!HARD_REGISTER_NUM_P (regno))
6734 {
6735 if (!strict_p)
6736 return true;
6737
6738 if (!reg_renumber)
6739 return false;
6740
6741 regno = reg_renumber[regno];
6742 }
6743 return GP_REGNUM_P (regno);
6744 }
6745
6746 /* Return true if register REGNO is a valid base register for mode MODE.
6747 STRICT_P is true if REG_OK_STRICT is in effect. */
6748
6749 bool
6750 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6751 {
6752 if (!HARD_REGISTER_NUM_P (regno))
6753 {
6754 if (!strict_p)
6755 return true;
6756
6757 if (!reg_renumber)
6758 return false;
6759
6760 regno = reg_renumber[regno];
6761 }
6762
6763 /* The fake registers will be eliminated to either the stack or
6764 hard frame pointer, both of which are usually valid base registers.
6765 Reload deals with the cases where the eliminated form isn't valid. */
6766 return (GP_REGNUM_P (regno)
6767 || regno == SP_REGNUM
6768 || regno == FRAME_POINTER_REGNUM
6769 || regno == ARG_POINTER_REGNUM);
6770 }
6771
6772 /* Return true if X is a valid base register for mode MODE.
6773 STRICT_P is true if REG_OK_STRICT is in effect. */
6774
6775 static bool
6776 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6777 {
6778 if (!strict_p
6779 && GET_CODE (x) == SUBREG
6780 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6781 x = SUBREG_REG (x);
6782
6783 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6784 }
6785
6786 /* Return true if address offset is a valid index. If it is, fill in INFO
6787 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6788
6789 static bool
6790 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6791 machine_mode mode, bool strict_p)
6792 {
6793 enum aarch64_address_type type;
6794 rtx index;
6795 int shift;
6796
6797 /* (reg:P) */
6798 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6799 && GET_MODE (x) == Pmode)
6800 {
6801 type = ADDRESS_REG_REG;
6802 index = x;
6803 shift = 0;
6804 }
6805 /* (sign_extend:DI (reg:SI)) */
6806 else if ((GET_CODE (x) == SIGN_EXTEND
6807 || GET_CODE (x) == ZERO_EXTEND)
6808 && GET_MODE (x) == DImode
6809 && GET_MODE (XEXP (x, 0)) == SImode)
6810 {
6811 type = (GET_CODE (x) == SIGN_EXTEND)
6812 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6813 index = XEXP (x, 0);
6814 shift = 0;
6815 }
6816 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6817 else if (GET_CODE (x) == MULT
6818 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6819 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6820 && GET_MODE (XEXP (x, 0)) == DImode
6821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6822 && CONST_INT_P (XEXP (x, 1)))
6823 {
6824 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6825 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6826 index = XEXP (XEXP (x, 0), 0);
6827 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6828 }
6829 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6830 else if (GET_CODE (x) == ASHIFT
6831 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6832 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6833 && GET_MODE (XEXP (x, 0)) == DImode
6834 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6835 && CONST_INT_P (XEXP (x, 1)))
6836 {
6837 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6838 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6839 index = XEXP (XEXP (x, 0), 0);
6840 shift = INTVAL (XEXP (x, 1));
6841 }
6842 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6843 else if ((GET_CODE (x) == SIGN_EXTRACT
6844 || GET_CODE (x) == ZERO_EXTRACT)
6845 && GET_MODE (x) == DImode
6846 && GET_CODE (XEXP (x, 0)) == MULT
6847 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6848 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6849 {
6850 type = (GET_CODE (x) == SIGN_EXTRACT)
6851 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6852 index = XEXP (XEXP (x, 0), 0);
6853 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6854 if (INTVAL (XEXP (x, 1)) != 32 + shift
6855 || INTVAL (XEXP (x, 2)) != 0)
6856 shift = -1;
6857 }
6858 /* (and:DI (mult:DI (reg:DI) (const_int scale))
6859 (const_int 0xffffffff<<shift)) */
6860 else if (GET_CODE (x) == AND
6861 && GET_MODE (x) == DImode
6862 && GET_CODE (XEXP (x, 0)) == MULT
6863 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6864 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6865 && CONST_INT_P (XEXP (x, 1)))
6866 {
6867 type = ADDRESS_REG_UXTW;
6868 index = XEXP (XEXP (x, 0), 0);
6869 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6870 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6871 shift = -1;
6872 }
6873 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6874 else if ((GET_CODE (x) == SIGN_EXTRACT
6875 || GET_CODE (x) == ZERO_EXTRACT)
6876 && GET_MODE (x) == DImode
6877 && GET_CODE (XEXP (x, 0)) == ASHIFT
6878 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6879 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6880 {
6881 type = (GET_CODE (x) == SIGN_EXTRACT)
6882 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6883 index = XEXP (XEXP (x, 0), 0);
6884 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6885 if (INTVAL (XEXP (x, 1)) != 32 + shift
6886 || INTVAL (XEXP (x, 2)) != 0)
6887 shift = -1;
6888 }
6889 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6890 (const_int 0xffffffff<<shift)) */
6891 else if (GET_CODE (x) == AND
6892 && GET_MODE (x) == DImode
6893 && GET_CODE (XEXP (x, 0)) == ASHIFT
6894 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6895 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6896 && CONST_INT_P (XEXP (x, 1)))
6897 {
6898 type = ADDRESS_REG_UXTW;
6899 index = XEXP (XEXP (x, 0), 0);
6900 shift = INTVAL (XEXP (XEXP (x, 0), 1));
6901 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6902 shift = -1;
6903 }
6904 /* (mult:P (reg:P) (const_int scale)) */
6905 else if (GET_CODE (x) == MULT
6906 && GET_MODE (x) == Pmode
6907 && GET_MODE (XEXP (x, 0)) == Pmode
6908 && CONST_INT_P (XEXP (x, 1)))
6909 {
6910 type = ADDRESS_REG_REG;
6911 index = XEXP (x, 0);
6912 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6913 }
6914 /* (ashift:P (reg:P) (const_int shift)) */
6915 else if (GET_CODE (x) == ASHIFT
6916 && GET_MODE (x) == Pmode
6917 && GET_MODE (XEXP (x, 0)) == Pmode
6918 && CONST_INT_P (XEXP (x, 1)))
6919 {
6920 type = ADDRESS_REG_REG;
6921 index = XEXP (x, 0);
6922 shift = INTVAL (XEXP (x, 1));
6923 }
6924 else
6925 return false;
6926
6927 if (!strict_p
6928 && GET_CODE (index) == SUBREG
6929 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6930 index = SUBREG_REG (index);
6931
6932 if (aarch64_sve_data_mode_p (mode))
6933 {
6934 if (type != ADDRESS_REG_REG
6935 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6936 return false;
6937 }
6938 else
6939 {
6940 if (shift != 0
6941 && !(IN_RANGE (shift, 1, 3)
6942 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6943 return false;
6944 }
6945
6946 if (REG_P (index)
6947 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6948 {
6949 info->type = type;
6950 info->offset = index;
6951 info->shift = shift;
6952 return true;
6953 }
6954
6955 return false;
6956 }
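/* For example (illustrative only): under LP64, an index rtx such as
(ashift:DI (reg:DI x1) (const_int 3)) used with an 8-byte MODE is accepted
above as ADDRESS_REG_REG with shift 3, i.e. the [Xn, Xm, LSL #3] form. */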
6957
6958 /* Return true if MODE is one of the modes for which we
6959 support LDP/STP operations. */
6960
6961 static bool
6962 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6963 {
6964 return mode == SImode || mode == DImode
6965 || mode == SFmode || mode == DFmode
6966 || (aarch64_vector_mode_supported_p (mode)
6967 && (known_eq (GET_MODE_SIZE (mode), 8)
6968 || (known_eq (GET_MODE_SIZE (mode), 16)
6969 && (aarch64_tune_params.extra_tuning_flags
6970 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6971 }
6972
6973 /* Return true if REGNO is a virtual pointer register, or an eliminable
6974 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
6975 include stack_pointer or hard_frame_pointer. */
6976 static bool
6977 virt_or_elim_regno_p (unsigned regno)
6978 {
6979 return ((regno >= FIRST_VIRTUAL_REGISTER
6980 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6981 || regno == FRAME_POINTER_REGNUM
6982 || regno == ARG_POINTER_REGNUM);
6983 }
6984
6985 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6986 If it is, fill in INFO appropriately. STRICT_P is true if
6987 REG_OK_STRICT is in effect. */
6988
6989 bool
6990 aarch64_classify_address (struct aarch64_address_info *info,
6991 rtx x, machine_mode mode, bool strict_p,
6992 aarch64_addr_query_type type)
6993 {
6994 enum rtx_code code = GET_CODE (x);
6995 rtx op0, op1;
6996 poly_int64 offset;
6997
6998 HOST_WIDE_INT const_size;
6999
7000 /* On BE, we use load/store pair for all large int mode load/stores.
7001 TI/TFmode may also use a load/store pair. */
7002 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7003 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7004 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7005 || type == ADDR_QUERY_LDP_STP_N
7006 || mode == TImode
7007 || mode == TFmode
7008 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7009
7010 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7011 to the full size of the memory being loaded/stored, and the mode used for
7012 the address calculation is half that size. */
7013 if (type == ADDR_QUERY_LDP_STP_N
7014 && known_eq (GET_MODE_SIZE (mode), 16))
7015 mode = DFmode;
7016
7017 bool allow_reg_index_p = (!load_store_pair_p
7018 && (known_lt (GET_MODE_SIZE (mode), 16)
7019 || vec_flags == VEC_ADVSIMD
7020 || vec_flags & VEC_SVE_DATA));
7021
7022 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7023 [Rn, #offset, MUL VL]. */
7024 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7025 && (code != REG && code != PLUS))
7026 return false;
7027
7028 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7029 REG addressing. */
7030 if (advsimd_struct_p
7031 && !BYTES_BIG_ENDIAN
7032 && (code != POST_INC && code != REG))
7033 return false;
7034
7035 gcc_checking_assert (GET_MODE (x) == VOIDmode
7036 || SCALAR_INT_MODE_P (GET_MODE (x)));
7037
7038 switch (code)
7039 {
7040 case REG:
7041 case SUBREG:
7042 info->type = ADDRESS_REG_IMM;
7043 info->base = x;
7044 info->offset = const0_rtx;
7045 info->const_offset = 0;
7046 return aarch64_base_register_rtx_p (x, strict_p);
7047
7048 case PLUS:
7049 op0 = XEXP (x, 0);
7050 op1 = XEXP (x, 1);
7051
7052 if (! strict_p
7053 && REG_P (op0)
7054 && virt_or_elim_regno_p (REGNO (op0))
7055 && poly_int_rtx_p (op1, &offset))
7056 {
7057 info->type = ADDRESS_REG_IMM;
7058 info->base = op0;
7059 info->offset = op1;
7060 info->const_offset = offset;
7061
7062 return true;
7063 }
7064
7065 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7066 && aarch64_base_register_rtx_p (op0, strict_p)
7067 && poly_int_rtx_p (op1, &offset))
7068 {
7069 info->type = ADDRESS_REG_IMM;
7070 info->base = op0;
7071 info->offset = op1;
7072 info->const_offset = offset;
7073
7074 /* TImode and TFmode values are allowed in both pairs of X
7075 registers and individual Q registers. The available
7076 address modes are:
7077 X,X: 7-bit signed scaled offset
7078 Q: 9-bit signed offset
7079 We conservatively require an offset representable in either mode.
7080 When performing the check for pairs of X registers, i.e. LDP/STP,
7081 pass down DImode since that is the natural size of the LDP/STP
7082 instruction memory accesses. */
7083 if (mode == TImode || mode == TFmode)
7084 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7085 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7086 || offset_12bit_unsigned_scaled_p (mode, offset)));
7087
7088 /* A 7-bit offset check because OImode will emit an ldp/stp
7089 instruction (only big endian will get here).
7090 For ldp/stp instructions, the offset is scaled for the size of a
7091 single element of the pair. */
7092 if (mode == OImode)
7093 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7094
7095 /* Three 9/12-bit offset checks because CImode will emit three
7096 ldr/str instructions (only big endian will get here). */
7097 if (mode == CImode)
7098 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7099 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7100 offset + 32)
7101 || offset_12bit_unsigned_scaled_p (V16QImode,
7102 offset + 32)));
7103
7104 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7105 instructions (only big endian will get here). */
7106 if (mode == XImode)
7107 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7108 && aarch64_offset_7bit_signed_scaled_p (TImode,
7109 offset + 32));
7110
7111 /* Make "m" use the LD1 offset range for SVE data modes, so
7112 that pre-RTL optimizers like ivopts will work to that range
7113 rather than to the wider LDR/STR range. */
7114 if (vec_flags == VEC_SVE_DATA)
7115 return (type == ADDR_QUERY_M
7116 ? offset_4bit_signed_scaled_p (mode, offset)
7117 : offset_9bit_signed_scaled_p (mode, offset));
7118
7119 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7120 {
7121 poly_int64 end_offset = (offset
7122 + GET_MODE_SIZE (mode)
7123 - BYTES_PER_SVE_VECTOR);
7124 return (type == ADDR_QUERY_M
7125 ? offset_4bit_signed_scaled_p (mode, offset)
7126 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7127 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7128 end_offset)));
7129 }
7130
7131 if (vec_flags == VEC_SVE_PRED)
7132 return offset_9bit_signed_scaled_p (mode, offset);
7133
7134 if (load_store_pair_p)
7135 return ((known_eq (GET_MODE_SIZE (mode), 4)
7136 || known_eq (GET_MODE_SIZE (mode), 8)
7137 || known_eq (GET_MODE_SIZE (mode), 16))
7138 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7139 else
7140 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7141 || offset_12bit_unsigned_scaled_p (mode, offset));
7142 }
7143
7144 if (allow_reg_index_p)
7145 {
7146 /* Look for base + (scaled/extended) index register. */
7147 if (aarch64_base_register_rtx_p (op0, strict_p)
7148 && aarch64_classify_index (info, op1, mode, strict_p))
7149 {
7150 info->base = op0;
7151 return true;
7152 }
7153 if (aarch64_base_register_rtx_p (op1, strict_p)
7154 && aarch64_classify_index (info, op0, mode, strict_p))
7155 {
7156 info->base = op1;
7157 return true;
7158 }
7159 }
7160
7161 return false;
7162
7163 case POST_INC:
7164 case POST_DEC:
7165 case PRE_INC:
7166 case PRE_DEC:
7167 info->type = ADDRESS_REG_WB;
7168 info->base = XEXP (x, 0);
7169 info->offset = NULL_RTX;
7170 return aarch64_base_register_rtx_p (info->base, strict_p);
7171
7172 case POST_MODIFY:
7173 case PRE_MODIFY:
7174 info->type = ADDRESS_REG_WB;
7175 info->base = XEXP (x, 0);
7176 if (GET_CODE (XEXP (x, 1)) == PLUS
7177 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7178 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7179 && aarch64_base_register_rtx_p (info->base, strict_p))
7180 {
7181 info->offset = XEXP (XEXP (x, 1), 1);
7182 info->const_offset = offset;
7183
7184 /* TImode and TFmode values are allowed in both pairs of X
7185 registers and individual Q registers. The available
7186 address modes are:
7187 X,X: 7-bit signed scaled offset
7188 Q: 9-bit signed offset
7189 We conservatively require an offset representable in either mode.
7190 */
7191 if (mode == TImode || mode == TFmode)
7192 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7193 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7194
7195 if (load_store_pair_p)
7196 return ((known_eq (GET_MODE_SIZE (mode), 4)
7197 || known_eq (GET_MODE_SIZE (mode), 8)
7198 || known_eq (GET_MODE_SIZE (mode), 16))
7199 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7200 else
7201 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7202 }
7203 return false;
7204
7205 case CONST:
7206 case SYMBOL_REF:
7207 case LABEL_REF:
7208 /* Load literal: PC-relative constant pool entry. Only supported
7209 for SImode or larger. */
7210 info->type = ADDRESS_SYMBOLIC;
7211
7212 if (!load_store_pair_p
7213 && GET_MODE_SIZE (mode).is_constant (&const_size)
7214 && const_size >= 4)
7215 {
7216 rtx sym, addend;
7217
7218 split_const (x, &sym, &addend);
7219 return ((GET_CODE (sym) == LABEL_REF
7220 || (GET_CODE (sym) == SYMBOL_REF
7221 && CONSTANT_POOL_ADDRESS_P (sym)
7222 && aarch64_pcrelative_literal_loads)));
7223 }
7224 return false;
7225
7226 case LO_SUM:
7227 info->type = ADDRESS_LO_SUM;
7228 info->base = XEXP (x, 0);
7229 info->offset = XEXP (x, 1);
7230 if (allow_reg_index_p
7231 && aarch64_base_register_rtx_p (info->base, strict_p))
7232 {
7233 rtx sym, offs;
7234 split_const (info->offset, &sym, &offs);
7235 if (GET_CODE (sym) == SYMBOL_REF
7236 && (aarch64_classify_symbol (sym, INTVAL (offs))
7237 == SYMBOL_SMALL_ABSOLUTE))
7238 {
7239 /* The symbol and offset must be aligned to the access size. */
7240 unsigned int align;
7241
7242 if (CONSTANT_POOL_ADDRESS_P (sym))
7243 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7244 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7245 {
7246 tree exp = SYMBOL_REF_DECL (sym);
7247 align = TYPE_ALIGN (TREE_TYPE (exp));
7248 align = aarch64_constant_alignment (exp, align);
7249 }
7250 else if (SYMBOL_REF_DECL (sym))
7251 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7252 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7253 && SYMBOL_REF_BLOCK (sym) != NULL)
7254 align = SYMBOL_REF_BLOCK (sym)->alignment;
7255 else
7256 align = BITS_PER_UNIT;
7257
7258 poly_int64 ref_size = GET_MODE_SIZE (mode);
7259 if (known_eq (ref_size, 0))
7260 ref_size = GET_MODE_SIZE (DImode);
7261
7262 return (multiple_p (INTVAL (offs), ref_size)
7263 && multiple_p (align / BITS_PER_UNIT, ref_size));
7264 }
7265 }
7266 return false;
7267
7268 default:
7269 return false;
7270 }
7271 }
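/* For example (illustrative only): with MODE == DImode, an address such as
(plus:DI (reg:DI x0) (const_int 16)) is classified above as ADDRESS_REG_IMM
with const_offset 16, i.e. the [x0, 16] form, since 16 fits both the signed
9-bit unscaled and the unsigned 12-bit scaled offset ranges. */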
7272
7273 /* Return true if the address X is valid for a PRFM instruction.
7274 STRICT_P is true if we should do strict checking with
7275 aarch64_classify_address. */
7276
7277 bool
7278 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7279 {
7280 struct aarch64_address_info addr;
7281
7282 /* PRFM accepts the same addresses as DImode... */
7283 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7284 if (!res)
7285 return false;
7286
7287 /* ... except writeback forms. */
7288 return addr.type != ADDRESS_REG_WB;
7289 }
7290
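/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
possibly with a constant offset added. */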
7291 bool
7292 aarch64_symbolic_address_p (rtx x)
7293 {
7294 rtx offset;
7295
7296 split_const (x, &x, &offset);
7297 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7298 }
7299
7300 /* Classify the base of symbolic expression X. */
7301
7302 enum aarch64_symbol_type
7303 aarch64_classify_symbolic_expression (rtx x)
7304 {
7305 rtx offset;
7306
7307 split_const (x, &x, &offset);
7308 return aarch64_classify_symbol (x, INTVAL (offset));
7309 }
7310
7311
7312 /* Return TRUE if X is a legitimate address for accessing memory in
7313 mode MODE. */
7314 static bool
7315 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7316 {
7317 struct aarch64_address_info addr;
7318
7319 return aarch64_classify_address (&addr, x, mode, strict_p);
7320 }
7321
7322 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7323 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7324 bool
7325 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7326 aarch64_addr_query_type type)
7327 {
7328 struct aarch64_address_info addr;
7329
7330 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7331 }
7332
7333 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7334
7335 static bool
7336 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7337 poly_int64 orig_offset,
7338 machine_mode mode)
7339 {
7340 HOST_WIDE_INT size;
7341 if (GET_MODE_SIZE (mode).is_constant (&size))
7342 {
7343 HOST_WIDE_INT const_offset, second_offset;
7344
7345 /* A general SVE offset is A * VQ + B. Remove the A component from
7346 coefficient 0 in order to get the constant B. */
7347 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7348
7349 /* Split an out-of-range address displacement into a base and
7350 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
7351 range otherwise, to increase the opportunities for sharing the
7352 base address between accesses of different sizes. Unaligned
7353 accesses use the signed 9-bit range; TImode/TFmode use the
7354 intersection of the signed scaled 7-bit and signed 9-bit ranges. */
7355 if (mode == TImode || mode == TFmode)
7356 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7357 else if ((const_offset & (size - 1)) != 0)
7358 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7359 else
7360 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7361
7362 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7363 return false;
7364
7365 /* Split the offset into second_offset and the rest. */
7366 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7367 *offset2 = gen_int_mode (second_offset, Pmode);
7368 return true;
7369 }
7370 else
7371 {
7372 /* Get the mode we should use as the basis of the range. For structure
7373 modes this is the mode of one vector. */
7374 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7375 machine_mode step_mode
7376 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7377
7378 /* Get the "mul vl" multiplier we'd like to use. */
7379 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7380 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7381 if (vec_flags & VEC_SVE_DATA)
7382 /* LDR supports a 9-bit range, but the move patterns for
7383 structure modes require all vectors to be in range of the
7384 same base. The simplest way of accommodating that while still
7385 promoting reuse of anchor points between different modes is
7386 to use an 8-bit range unconditionally. */
7387 vnum = ((vnum + 128) & 255) - 128;
7388 else
7389 /* Predicates are only handled singly, so we might as well use
7390 the full range. */
7391 vnum = ((vnum + 256) & 511) - 256;
7392 if (vnum == 0)
7393 return false;
7394
7395 /* Convert the "mul vl" multiplier into a byte offset. */
7396 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7397 if (known_eq (second_offset, orig_offset))
7398 return false;
7399
7400 /* Split the offset into second_offset and the rest. */
7401 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7402 *offset2 = gen_int_mode (second_offset, Pmode);
7403 return true;
7404 }
7405 }
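/* Worked example of the constant-size case above (illustrative only): for
DImode (size 8) and an aligned displacement of 0x10008, second_offset is
0x10008 & 0x3ffc == 0x8, so the displacement is split into 0x10000 (folded
into the base) and 0x8 (kept in the address). */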
7406
7407 /* Store the binary representation of floating-point constant VALUE in *INTVAL
7408 and return true. If the value cannot be converted, return false without
7409 setting *INTVAL. The conversion is done in the mode of VALUE. */
7410 bool
7411 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7412 {
7413
7414 /* We make a general exception for 0. */
7415 if (aarch64_float_const_zero_rtx_p (value))
7416 {
7417 *intval = 0;
7418 return true;
7419 }
7420
7421 scalar_float_mode mode;
7422 if (GET_CODE (value) != CONST_DOUBLE
7423 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7424 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7425 /* Only support up to DF mode. */
7426 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7427 return false;
7428
7429 unsigned HOST_WIDE_INT ival = 0;
7430
7431 long res[2];
7432 real_to_target (res,
7433 CONST_DOUBLE_REAL_VALUE (value),
7434 REAL_MODE_FORMAT (mode));
7435
7436 if (mode == DFmode)
7437 {
7438 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7439 ival = zext_hwi (res[order], 32);
7440 ival |= (zext_hwi (res[1 - order], 32) << 32);
7441 }
7442 else
7443 ival = zext_hwi (res[0], 32);
7444
7445 *intval = ival;
7446 return true;
7447 }
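/* For example (illustrative only): for the DFmode constant 1.0 the function
above sets *INTVAL to 0x3ff0000000000000. */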
7448
7449 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7450 single MOV(+MOVK) followed by an FMOV. */
7451 bool
7452 aarch64_float_const_rtx_p (rtx x)
7453 {
7454 machine_mode mode = GET_MODE (x);
7455 if (mode == VOIDmode)
7456 return false;
7457
7458 /* Determine whether it's cheaper to materialize float constants as
7459 mov/movk sequences rather than as adrp/ldr literal loads. */
7460 unsigned HOST_WIDE_INT ival;
7461
7462 if (GET_CODE (x) == CONST_DOUBLE
7463 && SCALAR_FLOAT_MODE_P (mode)
7464 && aarch64_reinterpret_float_as_int (x, &ival))
7465 {
7466 scalar_int_mode imode = (mode == HFmode
7467 ? SImode
7468 : int_mode_for_mode (mode).require ());
7469 int num_instr = aarch64_internal_mov_immediate
7470 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7471 return num_instr < 3;
7472 }
7473
7474 return false;
7475 }
7476
7477 /* Return TRUE if rtx X is the immediate constant 0.0. */
7478 bool
7479 aarch64_float_const_zero_rtx_p (rtx x)
7480 {
7481 if (GET_MODE (x) == VOIDmode)
7482 return false;
7483
7484 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7485 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7486 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7487 }
7488
7489 /* Return TRUE if rtx X is an immediate constant that fits in a single
7490 MOVI immediate operation. */
7491 bool
7492 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7493 {
7494 if (!TARGET_SIMD)
7495 return false;
7496
7497 machine_mode vmode;
7498 scalar_int_mode imode;
7499 unsigned HOST_WIDE_INT ival;
7500
7501 if (GET_CODE (x) == CONST_DOUBLE
7502 && SCALAR_FLOAT_MODE_P (mode))
7503 {
7504 if (!aarch64_reinterpret_float_as_int (x, &ival))
7505 return false;
7506
7507 /* We make a general exception for 0. */
7508 if (aarch64_float_const_zero_rtx_p (x))
7509 return true;
7510
7511 imode = int_mode_for_mode (mode).require ();
7512 }
7513 else if (GET_CODE (x) == CONST_INT
7514 && is_a <scalar_int_mode> (mode, &imode))
7515 ival = INTVAL (x);
7516 else
7517 return false;
7518
7519 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7520 a 128-bit vector mode. */
7521 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7522
7523 vmode = aarch64_simd_container_mode (imode, width);
7524 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7525
7526 return aarch64_simd_valid_immediate (v_op, NULL);
7527 }
7528
7529
7530 /* Return the fixed registers used for condition codes. */
7531
7532 static bool
7533 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7534 {
7535 *p1 = CC_REGNUM;
7536 *p2 = INVALID_REGNUM;
7537 return true;
7538 }
7539
7540 /* This function is used by the call expanders of the machine description.
7541 RESULT is the register in which the result is returned. It's NULL for
7542 "call" and "sibcall".
7543 MEM is the location of the function call.
7544 SIBCALL indicates whether this function call is a normal call or a sibling
7545 call; a different pattern is generated accordingly. */
7546
7547 void
7548 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7549 {
7550 rtx call, callee, tmp;
7551 rtvec vec;
7552 machine_mode mode;
7553
7554 gcc_assert (MEM_P (mem));
7555 callee = XEXP (mem, 0);
7556 mode = GET_MODE (callee);
7557 gcc_assert (mode == Pmode);
7558
7559 /* Decide if we should generate indirect calls by loading the
7560 address of the callee into a register before performing
7561 the branch-and-link. */
7562 if (SYMBOL_REF_P (callee)
7563 ? (aarch64_is_long_call_p (callee)
7564 || aarch64_is_noplt_call_p (callee))
7565 : !REG_P (callee))
7566 XEXP (mem, 0) = force_reg (mode, callee);
7567
7568 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7569
7570 if (result != NULL_RTX)
7571 call = gen_rtx_SET (result, call);
7572
7573 if (sibcall)
7574 tmp = ret_rtx;
7575 else
7576 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7577
7578 vec = gen_rtvec (2, call, tmp);
7579 call = gen_rtx_PARALLEL (VOIDmode, vec);
7580
7581 aarch64_emit_call_insn (call);
7582 }
7583
7584 /* Emit call insn with PAT and do aarch64-specific handling. */
7585
7586 void
7587 aarch64_emit_call_insn (rtx pat)
7588 {
7589 rtx insn = emit_call_insn (pat);
7590
7591 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7592 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7593 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7594 }
7595
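/* Return the CC mode to use when comparing X and Y using relational
operator CODE; this implements SELECT_CC_MODE. */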
7596 machine_mode
7597 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7598 {
7599 machine_mode mode_x = GET_MODE (x);
7600 rtx_code code_x = GET_CODE (x);
7601
7602 /* All floating-point comparisons return CCFPmode if they are equality
7603 comparisons, and CCFPEmode otherwise. */
7604 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7605 {
7606 switch (code)
7607 {
7608 case EQ:
7609 case NE:
7610 case UNORDERED:
7611 case ORDERED:
7612 case UNLT:
7613 case UNLE:
7614 case UNGT:
7615 case UNGE:
7616 case UNEQ:
7617 return CCFPmode;
7618
7619 case LT:
7620 case LE:
7621 case GT:
7622 case GE:
7623 case LTGT:
7624 return CCFPEmode;
7625
7626 default:
7627 gcc_unreachable ();
7628 }
7629 }
7630
7631 /* Equality comparisons of short modes against zero can be performed
7632 using the TST instruction with the appropriate bitmask. */
7633 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7634 && (code == EQ || code == NE)
7635 && (mode_x == HImode || mode_x == QImode))
7636 return CC_NZmode;
7637
7638 /* Similarly, comparisons of zero_extends from shorter modes can
7639 be performed using an ANDS with an immediate mask. */
7640 if (y == const0_rtx && code_x == ZERO_EXTEND
7641 && (mode_x == SImode || mode_x == DImode)
7642 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7643 && (code == EQ || code == NE))
7644 return CC_NZmode;
7645
7646 if ((mode_x == SImode || mode_x == DImode)
7647 && y == const0_rtx
7648 && (code == EQ || code == NE || code == LT || code == GE)
7649 && (code_x == PLUS || code_x == MINUS || code_x == AND
7650 || code_x == NEG
7651 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7652 && CONST_INT_P (XEXP (x, 2)))))
7653 return CC_NZmode;
7654
7655 /* A compare with a shifted operand. Because of canonicalization,
7656 the comparison will have to be swapped when we emit the assembly
7657 code. */
7658 if ((mode_x == SImode || mode_x == DImode)
7659 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7660 && (code_x == ASHIFT || code_x == ASHIFTRT
7661 || code_x == LSHIFTRT
7662 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7663 return CC_SWPmode;
7664
7665 /* Similarly for a negated operand, but we can only do this for
7666 equalities. */
7667 if ((mode_x == SImode || mode_x == DImode)
7668 && (REG_P (y) || GET_CODE (y) == SUBREG)
7669 && (code == EQ || code == NE)
7670 && code_x == NEG)
7671 return CC_Zmode;
7672
7673 /* A test for unsigned overflow from an addition. */
7674 if ((mode_x == DImode || mode_x == TImode)
7675 && (code == LTU || code == GEU)
7676 && code_x == PLUS
7677 && rtx_equal_p (XEXP (x, 0), y))
7678 return CC_Cmode;
7679
7680 /* A test for unsigned overflow from an add with carry. */
7681 if ((mode_x == DImode || mode_x == TImode)
7682 && (code == LTU || code == GEU)
7683 && code_x == PLUS
7684 && CONST_SCALAR_INT_P (y)
7685 && (rtx_mode_t (y, mode_x)
7686 == (wi::shwi (1, mode_x)
7687 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7688 return CC_ADCmode;
7689
7690 /* A test for signed overflow. */
7691 if ((mode_x == DImode || mode_x == TImode)
7692 && code == NE
7693 && code_x == PLUS
7694 && GET_CODE (y) == SIGN_EXTEND)
7695 return CC_Vmode;
7696
7697 /* For everything else, return CCmode. */
7698 return CCmode;
7699 }
7700
7701 static int
7702 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7703
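/* Return the AArch64 condition code (AARCH64_EQ etc.) corresponding to
comparison rtx X, or -1 if there is no direct equivalent. */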
7704 int
7705 aarch64_get_condition_code (rtx x)
7706 {
7707 machine_mode mode = GET_MODE (XEXP (x, 0));
7708 enum rtx_code comp_code = GET_CODE (x);
7709
7710 if (GET_MODE_CLASS (mode) != MODE_CC)
7711 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7712 return aarch64_get_condition_code_1 (mode, comp_code);
7713 }
7714
7715 static int
7716 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7717 {
7718 switch (mode)
7719 {
7720 case E_CCFPmode:
7721 case E_CCFPEmode:
7722 switch (comp_code)
7723 {
7724 case GE: return AARCH64_GE;
7725 case GT: return AARCH64_GT;
7726 case LE: return AARCH64_LS;
7727 case LT: return AARCH64_MI;
7728 case NE: return AARCH64_NE;
7729 case EQ: return AARCH64_EQ;
7730 case ORDERED: return AARCH64_VC;
7731 case UNORDERED: return AARCH64_VS;
7732 case UNLT: return AARCH64_LT;
7733 case UNLE: return AARCH64_LE;
7734 case UNGT: return AARCH64_HI;
7735 case UNGE: return AARCH64_PL;
7736 default: return -1;
7737 }
7738 break;
7739
7740 case E_CCmode:
7741 switch (comp_code)
7742 {
7743 case NE: return AARCH64_NE;
7744 case EQ: return AARCH64_EQ;
7745 case GE: return AARCH64_GE;
7746 case GT: return AARCH64_GT;
7747 case LE: return AARCH64_LE;
7748 case LT: return AARCH64_LT;
7749 case GEU: return AARCH64_CS;
7750 case GTU: return AARCH64_HI;
7751 case LEU: return AARCH64_LS;
7752 case LTU: return AARCH64_CC;
7753 default: return -1;
7754 }
7755 break;
7756
7757 case E_CC_SWPmode:
7758 switch (comp_code)
7759 {
7760 case NE: return AARCH64_NE;
7761 case EQ: return AARCH64_EQ;
7762 case GE: return AARCH64_LE;
7763 case GT: return AARCH64_LT;
7764 case LE: return AARCH64_GE;
7765 case LT: return AARCH64_GT;
7766 case GEU: return AARCH64_LS;
7767 case GTU: return AARCH64_CC;
7768 case LEU: return AARCH64_CS;
7769 case LTU: return AARCH64_HI;
7770 default: return -1;
7771 }
7772 break;
7773
7774 case E_CC_NZCmode:
7775 switch (comp_code)
7776 {
7777 case NE: return AARCH64_NE; /* = any */
7778 case EQ: return AARCH64_EQ; /* = none */
7779 case GE: return AARCH64_PL; /* = nfrst */
7780 case LT: return AARCH64_MI; /* = first */
7781 case GEU: return AARCH64_CS; /* = nlast */
7782 case GTU: return AARCH64_HI; /* = pmore */
7783 case LEU: return AARCH64_LS; /* = plast */
7784 case LTU: return AARCH64_CC; /* = last */
7785 default: return -1;
7786 }
7787 break;
7788
7789 case E_CC_NZmode:
7790 switch (comp_code)
7791 {
7792 case NE: return AARCH64_NE;
7793 case EQ: return AARCH64_EQ;
7794 case GE: return AARCH64_PL;
7795 case LT: return AARCH64_MI;
7796 default: return -1;
7797 }
7798 break;
7799
7800 case E_CC_Zmode:
7801 switch (comp_code)
7802 {
7803 case NE: return AARCH64_NE;
7804 case EQ: return AARCH64_EQ;
7805 default: return -1;
7806 }
7807 break;
7808
7809 case E_CC_Cmode:
7810 switch (comp_code)
7811 {
7812 case LTU: return AARCH64_CS;
7813 case GEU: return AARCH64_CC;
7814 default: return -1;
7815 }
7816 break;
7817
7818 case E_CC_ADCmode:
7819 switch (comp_code)
7820 {
7821 case GEU: return AARCH64_CS;
7822 case LTU: return AARCH64_CC;
7823 default: return -1;
7824 }
7825 break;
7826
7827 case E_CC_Vmode:
7828 switch (comp_code)
7829 {
7830 case NE: return AARCH64_VS;
7831 case EQ: return AARCH64_VC;
7832 default: return -1;
7833 }
7834 break;
7835
7836 default:
7837 return -1;
7838 }
7839
7840 return -1;
7841 }
7842
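/* Return true if X is a constant vector whose elements are all the same
CONST_INT, with that value in the range [MINVAL, MAXVAL]. */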
7843 bool
7844 aarch64_const_vec_all_same_in_range_p (rtx x,
7845 HOST_WIDE_INT minval,
7846 HOST_WIDE_INT maxval)
7847 {
7848 rtx elt;
7849 return (const_vec_duplicate_p (x, &elt)
7850 && CONST_INT_P (elt)
7851 && IN_RANGE (INTVAL (elt), minval, maxval));
7852 }
7853
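/* Return true if X is a constant vector in which every element equals the
integer VAL. */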
7854 bool
7855 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7856 {
7857 return aarch64_const_vec_all_same_in_range_p (x, val, val);
7858 }
7859
7860 /* Return true if VEC is a constant in which every element is in the range
7861 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
7862
7863 static bool
7864 aarch64_const_vec_all_in_range_p (rtx vec,
7865 HOST_WIDE_INT minval,
7866 HOST_WIDE_INT maxval)
7867 {
7868 if (GET_CODE (vec) != CONST_VECTOR
7869 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7870 return false;
7871
7872 int nunits;
7873 if (!CONST_VECTOR_STEPPED_P (vec))
7874 nunits = const_vector_encoded_nelts (vec);
7875 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7876 return false;
7877
7878 for (int i = 0; i < nunits; i++)
7879 {
7880 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7881 if (!CONST_INT_P (vec_elem)
7882 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7883 return false;
7884 }
7885 return true;
7886 }
7887
7888 /* N Z C V. */
7889 #define AARCH64_CC_V 1
7890 #define AARCH64_CC_C (1 << 1)
7891 #define AARCH64_CC_Z (1 << 2)
7892 #define AARCH64_CC_N (1 << 3)
7893
7894 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
7895 static const int aarch64_nzcv_codes[] =
7896 {
7897 0, /* EQ, Z == 1. */
7898 AARCH64_CC_Z, /* NE, Z == 0. */
7899 0, /* CS, C == 1. */
7900 AARCH64_CC_C, /* CC, C == 0. */
7901 0, /* MI, N == 1. */
7902 AARCH64_CC_N, /* PL, N == 0. */
7903 0, /* VS, V == 1. */
7904 AARCH64_CC_V, /* VC, V == 0. */
7905 0, /* HI, C == 1 && Z == 0. */
7906 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
7907 AARCH64_CC_V, /* GE, N == V. */
7908 0, /* LT, N != V. */
7909 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
7910 0, /* LE, !(Z == 0 && N == V). */
7911 0, /* AL, Any. */
7912 0 /* NV, Any. */
7913 };
7914
7915 /* Print floating-point vector immediate operand X to F, negating it
7916 first if NEGATE is true. Return true on success, false if it isn't
7917 a constant we can handle. */
7918
7919 static bool
7920 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7921 {
7922 rtx elt;
7923
7924 if (!const_vec_duplicate_p (x, &elt))
7925 return false;
7926
7927 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7928 if (negate)
7929 r = real_value_negate (&r);
7930
7931 /* We only handle the SVE single-bit immediates here. */
7932 if (real_equal (&r, &dconst0))
7933 asm_fprintf (f, "0.0");
7934 else if (real_equal (&r, &dconst1))
7935 asm_fprintf (f, "1.0");
7936 else if (real_equal (&r, &dconsthalf))
7937 asm_fprintf (f, "0.5");
7938 else
7939 return false;
7940
7941 return true;
7942 }
7943
7944 /* Return the element size suffix letter ('b', 'h', 's' or 'd') for SIZE bits. */
7945 static char
7946 sizetochar (int size)
7947 {
7948 switch (size)
7949 {
7950 case 64: return 'd';
7951 case 32: return 's';
7952 case 16: return 'h';
7953 case 8 : return 'b';
7954 default: gcc_unreachable ();
7955 }
7956 }
7957
7958 /* Print operand X to file F in a target specific manner according to CODE.
7959 The acceptable formatting commands given by CODE are:
7960 'c': An integer or symbol address without a preceding #
7961 sign.
7962 'C': Take the duplicated element in a vector constant
7963 and print it in hex.
7964 'D': Take the duplicated element in a vector constant
7965 and print it as an unsigned integer, in decimal.
7966 'e': Print the sign/zero-extend size as a character 8->b,
7967 16->h, 32->w.
7968 'p': Prints N such that 2^N == X (X must be a power of 2 and
7969 a const_int).
7970 'P': Print the number of non-zero bits in X (a const_int).
7971 'H': Print the higher numbered register of a pair (TImode)
7972 of regs.
7973 'm': Print a condition (eq, ne, etc).
7974 'M': Same as 'm', but invert condition.
7975 'N': Take the duplicated element in a vector constant
7976 and print the negative of it in decimal.
7977 'b/h/s/d/q': Print a scalar FP/SIMD register name.
7978 'S/T/U/V': Print a FP/SIMD register name for a register list.
7979 The register printed is the FP/SIMD register name
7980 of X + 0/1/2/3 for S/T/U/V.
7981 'R': Print a scalar FP/SIMD register name + 1.
7982 'X': Print bottom 16 bits of integer constant in hex.
7983 'w/x': Print a general register name or the zero register
7984 (32-bit or 64-bit).
7985 '0': Print a normal operand; if it's a general register,
7986 then DImode is assumed.
7987 'k': Print NZCV for conditional compare instructions.
7988 'A': Output address constant representing the first
7989 argument of X, specifying a relocation offset
7990 if appropriate.
7991 'L': Output constant address specified by X
7992 with a relocation offset if appropriate.
7993 'G': Prints address of X, specifying a PC relative
7994 relocation mode if appropriate.
7995 'y': Output address of LDP or STP - this is used for
7996 some LDP/STPs which don't use a PARALLEL in their
7997 pattern (so the mode needs to be adjusted).
7998 'z': Output address of a typical LDP or STP. */
7999
8000 static void
8001 aarch64_print_operand (FILE *f, rtx x, int code)
8002 {
8003 rtx elt;
8004 switch (code)
8005 {
8006 case 'c':
8007 switch (GET_CODE (x))
8008 {
8009 case CONST_INT:
8010 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8011 break;
8012
8013 case SYMBOL_REF:
8014 output_addr_const (f, x);
8015 break;
8016
8017 case CONST:
8018 if (GET_CODE (XEXP (x, 0)) == PLUS
8019 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8020 {
8021 output_addr_const (f, x);
8022 break;
8023 }
8024 /* Fall through. */
8025
8026 default:
8027 output_operand_lossage ("unsupported operand for code '%c'", code);
8028 }
8029 break;
8030
8031 case 'e':
8032 {
8033 int n;
8034
8035 if (!CONST_INT_P (x)
8036 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8037 {
8038 output_operand_lossage ("invalid operand for '%%%c'", code);
8039 return;
8040 }
8041
8042 switch (n)
8043 {
8044 case 3:
8045 fputc ('b', f);
8046 break;
8047 case 4:
8048 fputc ('h', f);
8049 break;
8050 case 5:
8051 fputc ('w', f);
8052 break;
8053 default:
8054 output_operand_lossage ("invalid operand for '%%%c'", code);
8055 return;
8056 }
8057 }
8058 break;
8059
8060 case 'p':
8061 {
8062 int n;
8063
8064 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8065 {
8066 output_operand_lossage ("invalid operand for '%%%c'", code);
8067 return;
8068 }
8069
8070 asm_fprintf (f, "%d", n);
8071 }
8072 break;
8073
8074 case 'P':
8075 if (!CONST_INT_P (x))
8076 {
8077 output_operand_lossage ("invalid operand for '%%%c'", code);
8078 return;
8079 }
8080
8081 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8082 break;
8083
8084 case 'H':
8085 if (x == const0_rtx)
8086 {
8087 asm_fprintf (f, "xzr");
8088 break;
8089 }
8090
8091 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8092 {
8093 output_operand_lossage ("invalid operand for '%%%c'", code);
8094 return;
8095 }
8096
8097 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8098 break;
8099
8100 case 'M':
8101 case 'm':
8102 {
8103 int cond_code;
8104 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8105 if (x == const_true_rtx)
8106 {
8107 if (code == 'M')
8108 fputs ("nv", f);
8109 return;
8110 }
8111
8112 if (!COMPARISON_P (x))
8113 {
8114 output_operand_lossage ("invalid operand for '%%%c'", code);
8115 return;
8116 }
8117
8118 cond_code = aarch64_get_condition_code (x);
8119 gcc_assert (cond_code >= 0);
8120 if (code == 'M')
8121 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8122 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8123 fputs (aarch64_sve_condition_codes[cond_code], f);
8124 else
8125 fputs (aarch64_condition_codes[cond_code], f);
8126 }
8127 break;
8128
8129 case 'N':
8130 if (!const_vec_duplicate_p (x, &elt))
8131 {
8132 output_operand_lossage ("invalid vector constant");
8133 return;
8134 }
8135
8136 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8137 asm_fprintf (f, "%wd", -INTVAL (elt));
8138 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8139 && aarch64_print_vector_float_operand (f, x, true))
8140 ;
8141 else
8142 {
8143 output_operand_lossage ("invalid vector constant");
8144 return;
8145 }
8146 break;
8147
8148 case 'b':
8149 case 'h':
8150 case 's':
8151 case 'd':
8152 case 'q':
8153 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8154 {
8155 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8156 return;
8157 }
8158 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8159 break;
8160
8161 case 'S':
8162 case 'T':
8163 case 'U':
8164 case 'V':
8165 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8166 {
8167 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8168 return;
8169 }
8170 asm_fprintf (f, "%c%d",
8171 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8172 REGNO (x) - V0_REGNUM + (code - 'S'));
8173 break;
8174
8175 case 'R':
8176 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8177 {
8178 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8179 return;
8180 }
8181 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8182 break;
8183
8184 case 'X':
8185 if (!CONST_INT_P (x))
8186 {
8187 output_operand_lossage ("invalid operand for '%%%c'", code);
8188 return;
8189 }
8190 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8191 break;
8192
8193 case 'C':
8194 {
8195 /* Print a replicated constant in hex. */
8196 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8197 {
8198 output_operand_lossage ("invalid operand for '%%%c'", code);
8199 return;
8200 }
8201 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8202 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8203 }
8204 break;
8205
8206 case 'D':
8207 {
8208 /* Print a replicated constant in decimal, treating it as
8209 unsigned. */
8210 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8211 {
8212 output_operand_lossage ("invalid operand for '%%%c'", code);
8213 return;
8214 }
8215 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8216 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8217 }
8218 break;
8219
8220 case 'w':
8221 case 'x':
8222 if (x == const0_rtx
8223 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8224 {
8225 asm_fprintf (f, "%czr", code);
8226 break;
8227 }
8228
8229 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8230 {
8231 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8232 break;
8233 }
8234
8235 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8236 {
8237 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8238 break;
8239 }
8240
8241 /* Fall through */
8242
8243 case 0:
8244 if (x == NULL)
8245 {
8246 output_operand_lossage ("missing operand");
8247 return;
8248 }
8249
8250 switch (GET_CODE (x))
8251 {
8252 case REG:
8253 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8254 {
8255 if (REG_NREGS (x) == 1)
8256 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8257 else
8258 {
8259 char suffix
8260 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8261 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8262 REGNO (x) - V0_REGNUM, suffix,
8263 END_REGNO (x) - V0_REGNUM - 1, suffix);
8264 }
8265 }
8266 else
8267 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8268 break;
8269
8270 case MEM:
8271 output_address (GET_MODE (x), XEXP (x, 0));
8272 break;
8273
8274 case LABEL_REF:
8275 case SYMBOL_REF:
8276 output_addr_const (asm_out_file, x);
8277 break;
8278
8279 case CONST_INT:
8280 asm_fprintf (f, "%wd", INTVAL (x));
8281 break;
8282
8283 case CONST:
8284 if (!VECTOR_MODE_P (GET_MODE (x)))
8285 {
8286 output_addr_const (asm_out_file, x);
8287 break;
8288 }
8289 /* fall through */
8290
8291 case CONST_VECTOR:
8292 if (!const_vec_duplicate_p (x, &elt))
8293 {
8294 output_operand_lossage ("invalid vector constant");
8295 return;
8296 }
8297
8298 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8299 asm_fprintf (f, "%wd", INTVAL (elt));
8300 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8301 && aarch64_print_vector_float_operand (f, x, false))
8302 ;
8303 else
8304 {
8305 output_operand_lossage ("invalid vector constant");
8306 return;
8307 }
8308 break;
8309
8310 case CONST_DOUBLE:
8311 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8312 be getting CONST_DOUBLEs holding integers. */
8313 gcc_assert (GET_MODE (x) != VOIDmode);
8314 if (aarch64_float_const_zero_rtx_p (x))
8315 {
8316 fputc ('0', f);
8317 break;
8318 }
8319 else if (aarch64_float_const_representable_p (x))
8320 {
8321 #define buf_size 20
8322 char float_buf[buf_size] = {'\0'};
8323 real_to_decimal_for_mode (float_buf,
8324 CONST_DOUBLE_REAL_VALUE (x),
8325 buf_size, buf_size,
8326 1, GET_MODE (x));
8327 asm_fprintf (asm_out_file, "%s", float_buf);
8328 break;
8329 #undef buf_size
8330 }
8331 output_operand_lossage ("invalid constant");
8332 return;
8333 default:
8334 output_operand_lossage ("invalid operand");
8335 return;
8336 }
8337 break;
8338
8339 case 'A':
8340 if (GET_CODE (x) == HIGH)
8341 x = XEXP (x, 0);
8342
8343 switch (aarch64_classify_symbolic_expression (x))
8344 {
8345 case SYMBOL_SMALL_GOT_4G:
8346 asm_fprintf (asm_out_file, ":got:");
8347 break;
8348
8349 case SYMBOL_SMALL_TLSGD:
8350 asm_fprintf (asm_out_file, ":tlsgd:");
8351 break;
8352
8353 case SYMBOL_SMALL_TLSDESC:
8354 asm_fprintf (asm_out_file, ":tlsdesc:");
8355 break;
8356
8357 case SYMBOL_SMALL_TLSIE:
8358 asm_fprintf (asm_out_file, ":gottprel:");
8359 break;
8360
8361 case SYMBOL_TLSLE24:
8362 asm_fprintf (asm_out_file, ":tprel:");
8363 break;
8364
8365 case SYMBOL_TINY_GOT:
8366 gcc_unreachable ();
8367 break;
8368
8369 default:
8370 break;
8371 }
8372 output_addr_const (asm_out_file, x);
8373 break;
8374
8375 case 'L':
8376 switch (aarch64_classify_symbolic_expression (x))
8377 {
8378 case SYMBOL_SMALL_GOT_4G:
8379 asm_fprintf (asm_out_file, ":lo12:");
8380 break;
8381
8382 case SYMBOL_SMALL_TLSGD:
8383 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8384 break;
8385
8386 case SYMBOL_SMALL_TLSDESC:
8387 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8388 break;
8389
8390 case SYMBOL_SMALL_TLSIE:
8391 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8392 break;
8393
8394 case SYMBOL_TLSLE12:
8395 asm_fprintf (asm_out_file, ":tprel_lo12:");
8396 break;
8397
8398 case SYMBOL_TLSLE24:
8399 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8400 break;
8401
8402 case SYMBOL_TINY_GOT:
8403 asm_fprintf (asm_out_file, ":got:");
8404 break;
8405
8406 case SYMBOL_TINY_TLSIE:
8407 asm_fprintf (asm_out_file, ":gottprel:");
8408 break;
8409
8410 default:
8411 break;
8412 }
8413 output_addr_const (asm_out_file, x);
8414 break;
8415
8416 case 'G':
8417 switch (aarch64_classify_symbolic_expression (x))
8418 {
8419 case SYMBOL_TLSLE24:
8420 asm_fprintf (asm_out_file, ":tprel_hi12:");
8421 break;
8422 default:
8423 break;
8424 }
8425 output_addr_const (asm_out_file, x);
8426 break;
8427
8428 case 'k':
8429 {
8430 HOST_WIDE_INT cond_code;
8431
8432 if (!CONST_INT_P (x))
8433 {
8434 output_operand_lossage ("invalid operand for '%%%c'", code);
8435 return;
8436 }
8437
8438 cond_code = INTVAL (x);
8439 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8440 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8441 }
8442 break;
8443
8444 case 'y':
8445 case 'z':
8446 {
8447 machine_mode mode = GET_MODE (x);
8448
8449 if (GET_CODE (x) != MEM
8450 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8451 {
8452 output_operand_lossage ("invalid operand for '%%%c'", code);
8453 return;
8454 }
8455
8456 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8457 code == 'y'
8458 ? ADDR_QUERY_LDP_STP_N
8459 : ADDR_QUERY_LDP_STP))
8460 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8461 }
8462 break;
8463
8464 default:
8465 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8466 return;
8467 }
8468 }
8469
8470 /* Print address 'x' of a memory access with mode 'mode'.
8471 'type' is the aarch64_addr_query_type context required by aarch64_classify_address:
8472 ADDR_QUERY_ANY for a normal access, or an LDP/STP query for pair addresses. */
8473 static bool
8474 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8475 aarch64_addr_query_type type)
8476 {
8477 struct aarch64_address_info addr;
8478 unsigned int size;
8479
8480 /* Check that all addresses are Pmode, including for ILP32. */
8481 if (GET_MODE (x) != Pmode
8482 && (!CONST_INT_P (x)
8483 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8484 {
8485 output_operand_lossage ("invalid address mode");
8486 return false;
8487 }
8488
8489 if (aarch64_classify_address (&addr, x, mode, true, type))
8490 switch (addr.type)
8491 {
8492 case ADDRESS_REG_IMM:
8493 if (known_eq (addr.const_offset, 0))
8494 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8495 else if (aarch64_sve_data_mode_p (mode))
8496 {
8497 HOST_WIDE_INT vnum
8498 = exact_div (addr.const_offset,
8499 BYTES_PER_SVE_VECTOR).to_constant ();
8500 asm_fprintf (f, "[%s, #%wd, mul vl]",
8501 reg_names[REGNO (addr.base)], vnum);
8502 }
8503 else if (aarch64_sve_pred_mode_p (mode))
8504 {
8505 HOST_WIDE_INT vnum
8506 = exact_div (addr.const_offset,
8507 BYTES_PER_SVE_PRED).to_constant ();
8508 asm_fprintf (f, "[%s, #%wd, mul vl]",
8509 reg_names[REGNO (addr.base)], vnum);
8510 }
8511 else
8512 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8513 INTVAL (addr.offset));
8514 return true;
8515
8516 case ADDRESS_REG_REG:
8517 if (addr.shift == 0)
8518 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8519 reg_names [REGNO (addr.offset)]);
8520 else
8521 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8522 reg_names [REGNO (addr.offset)], addr.shift);
8523 return true;
8524
8525 case ADDRESS_REG_UXTW:
8526 if (addr.shift == 0)
8527 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8528 REGNO (addr.offset) - R0_REGNUM);
8529 else
8530 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8531 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8532 return true;
8533
8534 case ADDRESS_REG_SXTW:
8535 if (addr.shift == 0)
8536 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8537 REGNO (addr.offset) - R0_REGNUM);
8538 else
8539 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8540 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8541 return true;
8542
8543 case ADDRESS_REG_WB:
8544 /* Writeback is only supported for fixed-width modes. */
8545 size = GET_MODE_SIZE (mode).to_constant ();
8546 switch (GET_CODE (x))
8547 {
8548 case PRE_INC:
8549 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8550 return true;
8551 case POST_INC:
8552 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8553 return true;
8554 case PRE_DEC:
8555 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8556 return true;
8557 case POST_DEC:
8558 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8559 return true;
8560 case PRE_MODIFY:
8561 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8562 INTVAL (addr.offset));
8563 return true;
8564 case POST_MODIFY:
8565 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8566 INTVAL (addr.offset));
8567 return true;
8568 default:
8569 break;
8570 }
8571 break;
8572
8573 case ADDRESS_LO_SUM:
8574 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8575 output_addr_const (f, addr.offset);
8576 asm_fprintf (f, "]");
8577 return true;
8578
8579 case ADDRESS_SYMBOLIC:
8580 output_addr_const (f, x);
8581 return true;
8582 }
8583
8584 return false;
8585 }
8586
8587 /* Print address 'x' of a memory access with mode 'mode'. */
8588 static void
8589 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8590 {
8591 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8592 output_addr_const (f, x);
8593 }
8594
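/* Return true if X (or any sub-rtx of X) mentions a label. */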
8595 bool
8596 aarch64_label_mentioned_p (rtx x)
8597 {
8598 const char *fmt;
8599 int i;
8600
8601 if (GET_CODE (x) == LABEL_REF)
8602 return true;
8603
8604 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8605 referencing instruction, but they are constant offsets, not
8606 symbols. */
8607 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8608 return false;
8609
8610 fmt = GET_RTX_FORMAT (GET_CODE (x));
8611 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8612 {
8613 if (fmt[i] == 'E')
8614 {
8615 int j;
8616
8617 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8618 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8619 return 1;
8620 }
8621 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8622 return 1;
8623 }
8624
8625 return 0;
8626 }
8627
8628 /* Implement REGNO_REG_CLASS. */
8629
8630 enum reg_class
8631 aarch64_regno_regclass (unsigned regno)
8632 {
8633 if (GP_REGNUM_P (regno))
8634 return GENERAL_REGS;
8635
8636 if (regno == SP_REGNUM)
8637 return STACK_REG;
8638
8639 if (regno == FRAME_POINTER_REGNUM
8640 || regno == ARG_POINTER_REGNUM)
8641 return POINTER_REGS;
8642
8643 if (FP_REGNUM_P (regno))
8644 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8645 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8646
8647 if (PR_REGNUM_P (regno))
8648 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8649
8650 return NO_REGS;
8651 }
8652
8653 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8654 If OFFSET is out of range, return an offset of an anchor point
8655 that is in range. Return 0 otherwise. */
8656
8657 static HOST_WIDE_INT
8658 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8659 machine_mode mode)
8660 {
8661 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8662 if (size > 16)
8663 return (offset + 0x400) & ~0x7f0;
8664
8665 /* For offsets that aren't a multiple of the access size, the limit is
8666 -256...255. */
8667 if (offset & (size - 1))
8668 {
8669 /* BLKmode typically uses LDP of X-registers. */
8670 if (mode == BLKmode)
8671 return (offset + 512) & ~0x3ff;
8672 return (offset + 0x100) & ~0x1ff;
8673 }
8674
8675 /* Small negative offsets are supported. */
8676 if (IN_RANGE (offset, -256, 0))
8677 return 0;
8678
8679 if (mode == TImode || mode == TFmode)
8680 return (offset + 0x100) & ~0x1ff;
8681
8682 /* Use the unsigned 12-bit offset range, scaled by the access size. */
8683 return offset & (~0xfff * size);
8684 }
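/* Worked example of the anchoring above (illustrative only): for an SImode
access at offset 0x101, the offset is misaligned, so the anchor is
(0x101 + 0x100) & ~0x1ff == 0x200; the caller is then left with a residual
offset of -0xff, which fits the signed 9-bit unscaled range. */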
8685
8686 static rtx
8687 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8688 {
8689 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8690 where mask is selected by alignment and size of the offset.
8691 We try to pick as large a range for the offset as possible to
8692 maximize the chance of a CSE. However, for aligned addresses
8693 we limit the range to 4k so that structures with different sized
8694 elements are likely to use the same base. We need to be careful
8695 not to split a CONST for some forms of address expression, otherwise
8696 it will generate sub-optimal code. */
8697
8698 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8699 {
8700 rtx base = XEXP (x, 0);
8701 rtx offset_rtx = XEXP (x, 1);
8702 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8703
8704 if (GET_CODE (base) == PLUS)
8705 {
8706 rtx op0 = XEXP (base, 0);
8707 rtx op1 = XEXP (base, 1);
8708
8709 /* Force any scaling into a temp for CSE. */
8710 op0 = force_reg (Pmode, op0);
8711 op1 = force_reg (Pmode, op1);
8712
8713 /* Let the pointer register be in op0. */
8714 if (REG_POINTER (op1))
8715 std::swap (op0, op1);
8716
8717 /* If the pointer is virtual or frame related, then we know that
8718 virtual register instantiation or register elimination is going
8719 to apply a second constant. We want the two constants folded
8720 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8721 if (virt_or_elim_regno_p (REGNO (op0)))
8722 {
8723 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8724 NULL_RTX, true, OPTAB_DIRECT);
8725 return gen_rtx_PLUS (Pmode, base, op1);
8726 }
8727
8728 /* Otherwise, in order to encourage CSE (and thence loop strength
8729 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8730 base = expand_binop (Pmode, add_optab, op0, op1,
8731 NULL_RTX, true, OPTAB_DIRECT);
8732 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8733 }
8734
8735 HOST_WIDE_INT size;
8736 if (GET_MODE_SIZE (mode).is_constant (&size))
8737 {
8738 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8739 mode);
8740 if (base_offset != 0)
8741 {
8742 base = plus_constant (Pmode, base, base_offset);
8743 base = force_operand (base, NULL_RTX);
8744 return plus_constant (Pmode, base, offset - base_offset);
8745 }
8746 }
8747 }
8748
8749 return x;
8750 }
8751
8752 static reg_class_t
8753 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8754 reg_class_t rclass,
8755 machine_mode mode,
8756 secondary_reload_info *sri)
8757 {
8758 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8759 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8760 comment at the head of aarch64-sve.md for more details about the
8761 big-endian handling. */
8762 if (BYTES_BIG_ENDIAN
8763 && reg_class_subset_p (rclass, FP_REGS)
8764 && !((REG_P (x) && HARD_REGISTER_P (x))
8765 || aarch64_simd_valid_immediate (x, NULL))
8766 && aarch64_sve_data_mode_p (mode))
8767 {
8768 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8769 return NO_REGS;
8770 }
8771
8772 /* If we have to disable direct literal pool loads and stores because the
8773 function is too big, then we need a scratch register. */
8774 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8775 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8776 || targetm.vector_mode_supported_p (GET_MODE (x)))
8777 && !aarch64_pcrelative_literal_loads)
8778 {
8779 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8780 return NO_REGS;
8781 }
8782
8783 /* Without the TARGET_SIMD instructions we cannot move a Q register
8784 to a Q register directly. We need a scratch. */
8785 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8786 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8787 && reg_class_subset_p (rclass, FP_REGS))
8788 {
8789 sri->icode = code_for_aarch64_reload_mov (mode);
8790 return NO_REGS;
8791 }
8792
8793 /* A TFmode or TImode memory access should be handled via FP_REGS
8794 because AArch64 has richer addressing modes for LDR/STR instructions
8795 than for LDP/STP instructions. */
8796 if (TARGET_FLOAT && rclass == GENERAL_REGS
8797 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8798 return FP_REGS;
8799
8800 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8801 return GENERAL_REGS;
8802
8803 return NO_REGS;
8804 }
8805
8806 static bool
8807 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8808 {
8809 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8810
8811 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8812 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8813 if (frame_pointer_needed)
8814 return to == HARD_FRAME_POINTER_REGNUM;
8815 return true;
8816 }
8817
8818 poly_int64
8819 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8820 {
8821 if (to == HARD_FRAME_POINTER_REGNUM)
8822 {
8823 if (from == ARG_POINTER_REGNUM)
8824 return cfun->machine->frame.hard_fp_offset;
8825
8826 if (from == FRAME_POINTER_REGNUM)
8827 return cfun->machine->frame.hard_fp_offset
8828 - cfun->machine->frame.locals_offset;
8829 }
8830
8831 if (to == STACK_POINTER_REGNUM)
8832 {
8833 if (from == FRAME_POINTER_REGNUM)
8834 return cfun->machine->frame.frame_size
8835 - cfun->machine->frame.locals_offset;
8836 }
8837
8838 return cfun->machine->frame.frame_size;
8839 }
8840
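/* Editor's note: a worked example of the formulas above, using purely
   hypothetical frame values.  If frame_size == 96, hard_fp_offset == 80
   and locals_offset == 16, then:
     ARG_POINTER   -> HARD_FRAME_POINTER : 80
     FRAME_POINTER -> HARD_FRAME_POINTER : 80 - 16 = 64
     FRAME_POINTER -> STACK_POINTER      : 96 - 16 = 80
   and any other elimination simply returns frame_size (96).  */
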
8841 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8842 previous frame. */
8843
8844 rtx
8845 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8846 {
8847 if (count != 0)
8848 return const0_rtx;
8849 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8850 }
8851
8852
8853 static void
8854 aarch64_asm_trampoline_template (FILE *f)
8855 {
8856 int offset1 = 16;
8857 int offset2 = 20;
8858
8859 if (aarch64_bti_enabled ())
8860 {
8861 asm_fprintf (f, "\thint\t34 // bti c\n");
8862 offset1 -= 4;
8863 offset2 -= 4;
8864 }
8865
8866 if (TARGET_ILP32)
8867 {
8868 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8869 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8870 offset1);
8871 }
8872 else
8873 {
8874 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8875 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8876 offset2);
8877 }
8878 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8879
8880 /* The trampoline needs an extra padding instruction.  If BTI is
8881 enabled, the padding instruction is replaced by the BTI instruction
8882 at the beginning. */
8883 if (!aarch64_bti_enabled ())
8884 assemble_aligned_integer (4, const0_rtx);
8885
8886 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8887 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8888 }
8889
8890 static void
8891 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8892 {
8893 rtx fnaddr, mem, a_tramp;
8894 const int tramp_code_sz = 16;
8895
8896 /* Don't need to copy the trailing D-words; we fill those in below. */
8897 emit_block_move (m_tramp, assemble_trampoline_template (),
8898 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8899 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8900 fnaddr = XEXP (DECL_RTL (fndecl), 0);
8901 if (GET_MODE (fnaddr) != ptr_mode)
8902 fnaddr = convert_memory_address (ptr_mode, fnaddr);
8903 emit_move_insn (mem, fnaddr);
8904
8905 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8906 emit_move_insn (mem, chain_value);
8907
8908 /* XXX We should really define a "clear_cache" pattern and use
8909 gen_clear_cache(). */
8910 a_tramp = XEXP (m_tramp, 0);
8911 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8912 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8913 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8914 ptr_mode);
8915 }
8916
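/* Editor's note: an illustrative sketch of the LP64 trampoline layout
   implied by the two routines above (tramp_code_sz == 16 and
   POINTER_BYTES == 8).  Byte offsets are for the non-BTI case and are
   meant as a reading aid, not a specification; register names are kept
   symbolic.

	 0: ldr	xIP1, .+16		; load the target address
	 4: ldr	xSTATIC_CHAIN, .+20	; load the static chain value
	 8: br	xIP1
	12: <padding word>
	16: <address of the nested function>
	24: <static chain value>	*/
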
8917 static unsigned char
8918 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8919 {
8920 /* ??? Logically we should only need to provide a value when
8921 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8922 can hold MODE, but at the moment we need to handle all modes.
8923 Just ignore any runtime parts for registers that can't store them. */
8924 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8925 unsigned int nregs;
8926 switch (regclass)
8927 {
8928 case TAILCALL_ADDR_REGS:
8929 case POINTER_REGS:
8930 case GENERAL_REGS:
8931 case ALL_REGS:
8932 case POINTER_AND_FP_REGS:
8933 case FP_REGS:
8934 case FP_LO_REGS:
8935 case FP_LO8_REGS:
8936 if (aarch64_sve_data_mode_p (mode)
8937 && constant_multiple_p (GET_MODE_SIZE (mode),
8938 BYTES_PER_SVE_VECTOR, &nregs))
8939 return nregs;
8940 return (aarch64_vector_data_mode_p (mode)
8941 ? CEIL (lowest_size, UNITS_PER_VREG)
8942 : CEIL (lowest_size, UNITS_PER_WORD));
8943 case STACK_REG:
8944 case PR_REGS:
8945 case PR_LO_REGS:
8946 case PR_HI_REGS:
8947 return 1;
8948
8949 case NO_REGS:
8950 return 0;
8951
8952 default:
8953 break;
8954 }
8955 gcc_unreachable ();
8956 }
8957
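/* Editor's note: two worked examples of the calculation above, assuming
   the usual UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16 for AArch64:
     - TImode (16 bytes) in GENERAL_REGS needs CEIL (16, 8)  == 2 registers;
     - V4SImode (16 bytes) in FP_REGS needs    CEIL (16, 16) == 1 register.  */
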
8958 static reg_class_t
8959 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8960 {
8961 if (regclass == POINTER_REGS)
8962 return GENERAL_REGS;
8963
8964 if (regclass == STACK_REG)
8965 {
8966 if (REG_P(x)
8967 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8968 return regclass;
8969
8970 return NO_REGS;
8971 }
8972
8973 /* Register elimination can result in a request for
8974 SP+constant->FP_REGS. We cannot support such operations, which
8975 use SP as the source and an FP_REG as the destination, so reject
8976 them outright. */
8977 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8978 {
8979 rtx lhs = XEXP (x, 0);
8980
8981 /* Look through a possible SUBREG introduced by ILP32. */
8982 if (GET_CODE (lhs) == SUBREG)
8983 lhs = SUBREG_REG (lhs);
8984
8985 gcc_assert (REG_P (lhs));
8986 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8987 POINTER_REGS));
8988 return NO_REGS;
8989 }
8990
8991 return regclass;
8992 }
8993
8994 void
8995 aarch64_asm_output_labelref (FILE* f, const char *name)
8996 {
8997 asm_fprintf (f, "%U%s", name);
8998 }
8999
9000 static void
9001 aarch64_elf_asm_constructor (rtx symbol, int priority)
9002 {
9003 if (priority == DEFAULT_INIT_PRIORITY)
9004 default_ctor_section_asm_out_constructor (symbol, priority);
9005 else
9006 {
9007 section *s;
9008 /* While priority is known to be in the range [0, 65535], so 18 bytes
9009 would be enough, the compiler might not know that. To avoid a
9010 -Wformat-truncation false positive, use a larger size. */
9011 char buf[23];
9012 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9013 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9014 switch_to_section (s);
9015 assemble_align (POINTER_SIZE);
9016 assemble_aligned_integer (POINTER_BYTES, symbol);
9017 }
9018 }
9019
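/* Editor's note: as an example of the section name built above, a
   constructor with priority 101 is placed in ".init_array.00101"
   (the "%.5u" conversion zero-pads the priority to five digits).  */
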
9020 static void
9021 aarch64_elf_asm_destructor (rtx symbol, int priority)
9022 {
9023 if (priority == DEFAULT_INIT_PRIORITY)
9024 default_dtor_section_asm_out_destructor (symbol, priority);
9025 else
9026 {
9027 section *s;
9028 /* While priority is known to be in the range [0, 65535], so 18 bytes
9029 would be enough, the compiler might not know that. To avoid a
9030 -Wformat-truncation false positive, use a larger size. */
9031 char buf[23];
9032 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9033 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9034 switch_to_section (s);
9035 assemble_align (POINTER_SIZE);
9036 assemble_aligned_integer (POINTER_BYTES, symbol);
9037 }
9038 }
9039
9040 const char*
9041 aarch64_output_casesi (rtx *operands)
9042 {
9043 char buf[100];
9044 char label[100];
9045 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9046 int index;
9047 static const char *const patterns[4][2] =
9048 {
9049 {
9050 "ldrb\t%w3, [%0,%w1,uxtw]",
9051 "add\t%3, %4, %w3, sxtb #2"
9052 },
9053 {
9054 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9055 "add\t%3, %4, %w3, sxth #2"
9056 },
9057 {
9058 "ldr\t%w3, [%0,%w1,uxtw #2]",
9059 "add\t%3, %4, %w3, sxtw #2"
9060 },
9061 /* We assume that DImode is only generated when not optimizing and
9062 that we don't really need 64-bit address offsets. That would
9063 imply an object file with 8GB of code in a single function! */
9064 {
9065 "ldr\t%w3, [%0,%w1,uxtw #2]",
9066 "add\t%3, %4, %w3, sxtw #2"
9067 }
9068 };
9069
9070 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9071
9072 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9073 index = exact_log2 (GET_MODE_SIZE (mode));
9074
9075 gcc_assert (index >= 0 && index <= 3);
9076
9077 /* Need to implement table size reduction, by changing the code below. */
9078 output_asm_insn (patterns[index][0], operands);
9079 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9080 snprintf (buf, sizeof (buf),
9081 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9082 output_asm_insn (buf, operands);
9083 output_asm_insn (patterns[index][1], operands);
9084 output_asm_insn ("br\t%3", operands);
9085 assemble_label (asm_out_file, label);
9086 return "";
9087 }
9088
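/* Editor's note: an illustrative example of the dispatch sequence emitted
   above for a HImode difference vector (index == 1).  The operand numbers
   have been replaced by hypothetical registers and the label is made up:

	ldrh	w3, [x0, w1, uxtw #1]	; load the scaled table entry
	adr	x4, .Lrtx7		; address of the table
	add	x3, x4, w3, sxth #2	; entry is a scaled offset from there
	br	x3
   .Lrtx7:				*/
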
9089
9090 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9091 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9092 operator. */
9093
9094 int
9095 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9096 {
9097 if (shift >= 0 && shift <= 3)
9098 {
9099 int size;
9100 for (size = 8; size <= 32; size *= 2)
9101 {
9102 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9103 if (mask == bits << shift)
9104 return size;
9105 }
9106 }
9107 return 0;
9108 }
9109
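/* Editor's note: some worked examples of the mapping implemented above:
     aarch64_uxt_size (0, 0xff)   == 8	  (0xff   == 0xff   << 0)
     aarch64_uxt_size (1, 0x1fe)  == 8	  (0x1fe  == 0xff   << 1)
     aarch64_uxt_size (0, 0xffff) == 16	  (0xffff == 0xffff << 0)
     aarch64_uxt_size (3, 0x7f8)  == 8	  (0x7f8  == 0xff   << 3)
   Any mask/shift pair that does not match one of the three widths, or a
   shift outside 0..3, yields 0.  */
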
9110 /* Constant pools are per function only when PC relative
9111 literal loads are true or we are in the large memory
9112 model. */
9113
9114 static inline bool
9115 aarch64_can_use_per_function_literal_pools_p (void)
9116 {
9117 return (aarch64_pcrelative_literal_loads
9118 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9119 }
9120
9121 static bool
9122 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9123 {
9124 /* We can't use blocks for constants when we're using a per-function
9125 constant pool. */
9126 return !aarch64_can_use_per_function_literal_pools_p ();
9127 }
9128
9129 /* Select appropriate section for constants depending
9130 on where we place literal pools. */
9131
9132 static section *
9133 aarch64_select_rtx_section (machine_mode mode,
9134 rtx x,
9135 unsigned HOST_WIDE_INT align)
9136 {
9137 if (aarch64_can_use_per_function_literal_pools_p ())
9138 return function_section (current_function_decl);
9139
9140 return default_elf_select_rtx_section (mode, x, align);
9141 }
9142
9143 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9144 void
9145 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9146 HOST_WIDE_INT offset)
9147 {
9148 /* When using per-function literal pools, we must ensure that any code
9149 section is aligned to the minimal instruction length, lest we get
9150 errors from the assembler re "unaligned instructions". */
9151 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9152 ASM_OUTPUT_ALIGN (f, 2);
9153 }
9154
9155 /* Costs. */
9156
9157 /* Helper function for rtx cost calculation. Strip a shift expression
9158 from X. Returns the inner operand if successful, or the original
9159 expression on failure. */
9160 static rtx
9161 aarch64_strip_shift (rtx x)
9162 {
9163 rtx op = x;
9164
9165 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9166 we can convert both to ROR during final output. */
9167 if ((GET_CODE (op) == ASHIFT
9168 || GET_CODE (op) == ASHIFTRT
9169 || GET_CODE (op) == LSHIFTRT
9170 || GET_CODE (op) == ROTATERT
9171 || GET_CODE (op) == ROTATE)
9172 && CONST_INT_P (XEXP (op, 1)))
9173 return XEXP (op, 0);
9174
9175 if (GET_CODE (op) == MULT
9176 && CONST_INT_P (XEXP (op, 1))
9177 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9178 return XEXP (op, 0);
9179
9180 return x;
9181 }
9182
9183 /* Helper function for rtx cost calculation. Strip an extend
9184 expression from X. Returns the inner operand if successful, or the
9185 original expression on failure. We deal with a number of possible
9186 canonicalization variations here. If STRIP_SHIFT is true, then
9187 we can strip off a shift also. */
9188 static rtx
9189 aarch64_strip_extend (rtx x, bool strip_shift)
9190 {
9191 scalar_int_mode mode;
9192 rtx op = x;
9193
9194 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9195 return op;
9196
9197 /* Zero and sign extraction of a widened value. */
9198 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9199 && XEXP (op, 2) == const0_rtx
9200 && GET_CODE (XEXP (op, 0)) == MULT
9201 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9202 XEXP (op, 1)))
9203 return XEXP (XEXP (op, 0), 0);
9204
9205 /* It can also be represented (for zero-extend) as an AND with an
9206 immediate. */
9207 if (GET_CODE (op) == AND
9208 && GET_CODE (XEXP (op, 0)) == MULT
9209 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9210 && CONST_INT_P (XEXP (op, 1))
9211 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9212 INTVAL (XEXP (op, 1))) != 0)
9213 return XEXP (XEXP (op, 0), 0);
9214
9215 /* Now handle extended register, as this may also have an optional
9216 left shift by 1..4. */
9217 if (strip_shift
9218 && GET_CODE (op) == ASHIFT
9219 && CONST_INT_P (XEXP (op, 1))
9220 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9221 op = XEXP (op, 0);
9222
9223 if (GET_CODE (op) == ZERO_EXTEND
9224 || GET_CODE (op) == SIGN_EXTEND)
9225 op = XEXP (op, 0);
9226
9227 if (op != x)
9228 return op;
9229
9230 return x;
9231 }
9232
9233 /* Return true iff CODE is a shift supported in combination
9234 with arithmetic instructions. */
9235
9236 static bool
9237 aarch64_shift_p (enum rtx_code code)
9238 {
9239 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9240 }
9241
9242
9243 /* Return true iff X is a cheap shift without a sign extend. */
9244
9245 static bool
9246 aarch64_cheap_mult_shift_p (rtx x)
9247 {
9248 rtx op0, op1;
9249
9250 op0 = XEXP (x, 0);
9251 op1 = XEXP (x, 1);
9252
9253 if (!(aarch64_tune_params.extra_tuning_flags
9254 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9255 return false;
9256
9257 if (GET_CODE (op0) == SIGN_EXTEND)
9258 return false;
9259
9260 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9261 && UINTVAL (op1) <= 4)
9262 return true;
9263
9264 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9265 return false;
9266
9267 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9268
9269 if (l2 > 0 && l2 <= 4)
9270 return true;
9271
9272 return false;
9273 }
9274
9275 /* Helper function for rtx cost calculation. Calculate the cost of
9276 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9277 Return the calculated cost of the expression, recursing manually in to
9278 operands where needed. */
9279
9280 static int
9281 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9282 {
9283 rtx op0, op1;
9284 const struct cpu_cost_table *extra_cost
9285 = aarch64_tune_params.insn_extra_cost;
9286 int cost = 0;
9287 bool compound_p = (outer == PLUS || outer == MINUS);
9288 machine_mode mode = GET_MODE (x);
9289
9290 gcc_checking_assert (code == MULT);
9291
9292 op0 = XEXP (x, 0);
9293 op1 = XEXP (x, 1);
9294
9295 if (VECTOR_MODE_P (mode))
9296 mode = GET_MODE_INNER (mode);
9297
9298 /* Integer multiply/fma. */
9299 if (GET_MODE_CLASS (mode) == MODE_INT)
9300 {
9301 /* The multiply will be canonicalized as a shift, cost it as such. */
9302 if (aarch64_shift_p (GET_CODE (x))
9303 || (CONST_INT_P (op1)
9304 && exact_log2 (INTVAL (op1)) > 0))
9305 {
9306 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9307 || GET_CODE (op0) == SIGN_EXTEND;
9308 if (speed)
9309 {
9310 if (compound_p)
9311 {
9312 /* If the shift is considered cheap,
9313 then don't add any cost. */
9314 if (aarch64_cheap_mult_shift_p (x))
9315 ;
9316 else if (REG_P (op1))
9317 /* ARITH + shift-by-register. */
9318 cost += extra_cost->alu.arith_shift_reg;
9319 else if (is_extend)
9320 /* ARITH + extended register. We don't have a cost field
9321 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9322 cost += extra_cost->alu.extend_arith;
9323 else
9324 /* ARITH + shift-by-immediate. */
9325 cost += extra_cost->alu.arith_shift;
9326 }
9327 else
9328 /* LSL (immediate). */
9329 cost += extra_cost->alu.shift;
9330
9331 }
9332 /* Strip extends as we will have costed them in the case above. */
9333 if (is_extend)
9334 op0 = aarch64_strip_extend (op0, true);
9335
9336 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9337
9338 return cost;
9339 }
9340
9341 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9342 compound and let the below cases handle it. After all, MNEG is a
9343 special-case alias of MSUB. */
9344 if (GET_CODE (op0) == NEG)
9345 {
9346 op0 = XEXP (op0, 0);
9347 compound_p = true;
9348 }
9349
9350 /* Integer multiplies or FMAs have zero/sign extending variants. */
9351 if ((GET_CODE (op0) == ZERO_EXTEND
9352 && GET_CODE (op1) == ZERO_EXTEND)
9353 || (GET_CODE (op0) == SIGN_EXTEND
9354 && GET_CODE (op1) == SIGN_EXTEND))
9355 {
9356 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9357 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9358
9359 if (speed)
9360 {
9361 if (compound_p)
9362 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9363 cost += extra_cost->mult[0].extend_add;
9364 else
9365 /* MUL/SMULL/UMULL. */
9366 cost += extra_cost->mult[0].extend;
9367 }
9368
9369 return cost;
9370 }
9371
9372 /* This is either an integer multiply or a MADD. In both cases
9373 we want to recurse and cost the operands. */
9374 cost += rtx_cost (op0, mode, MULT, 0, speed);
9375 cost += rtx_cost (op1, mode, MULT, 1, speed);
9376
9377 if (speed)
9378 {
9379 if (compound_p)
9380 /* MADD/MSUB. */
9381 cost += extra_cost->mult[mode == DImode].add;
9382 else
9383 /* MUL. */
9384 cost += extra_cost->mult[mode == DImode].simple;
9385 }
9386
9387 return cost;
9388 }
9389 else
9390 {
9391 if (speed)
9392 {
9393 /* Floating-point FMA/FMUL can also support negations of the
9394 operands, unless the rounding mode is upward or downward, in
9395 which case FNMUL is different from FMUL with operand negation. */
9396 bool neg0 = GET_CODE (op0) == NEG;
9397 bool neg1 = GET_CODE (op1) == NEG;
9398 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9399 {
9400 if (neg0)
9401 op0 = XEXP (op0, 0);
9402 if (neg1)
9403 op1 = XEXP (op1, 0);
9404 }
9405
9406 if (compound_p)
9407 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9408 cost += extra_cost->fp[mode == DFmode].fma;
9409 else
9410 /* FMUL/FNMUL. */
9411 cost += extra_cost->fp[mode == DFmode].mult;
9412 }
9413
9414 cost += rtx_cost (op0, mode, MULT, 0, speed);
9415 cost += rtx_cost (op1, mode, MULT, 1, speed);
9416 return cost;
9417 }
9418 }
9419
9420 static int
9421 aarch64_address_cost (rtx x,
9422 machine_mode mode,
9423 addr_space_t as ATTRIBUTE_UNUSED,
9424 bool speed)
9425 {
9426 enum rtx_code c = GET_CODE (x);
9427 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9428 struct aarch64_address_info info;
9429 int cost = 0;
9430 info.shift = 0;
9431
9432 if (!aarch64_classify_address (&info, x, mode, false))
9433 {
9434 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9435 {
9436 /* This is a CONST or SYMBOL ref which will be split
9437 in a different way depending on the code model in use.
9438 Cost it through the generic infrastructure. */
9439 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9440 /* Divide through by the cost of one instruction to
9441 bring it to the same units as the address costs. */
9442 cost_symbol_ref /= COSTS_N_INSNS (1);
9443 /* The cost is then the cost of preparing the address,
9444 followed by an immediate (possibly 0) offset. */
9445 return cost_symbol_ref + addr_cost->imm_offset;
9446 }
9447 else
9448 {
9449 /* This is most likely a jump table from a case
9450 statement. */
9451 return addr_cost->register_offset;
9452 }
9453 }
9454
9455 switch (info.type)
9456 {
9457 case ADDRESS_LO_SUM:
9458 case ADDRESS_SYMBOLIC:
9459 case ADDRESS_REG_IMM:
9460 cost += addr_cost->imm_offset;
9461 break;
9462
9463 case ADDRESS_REG_WB:
9464 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9465 cost += addr_cost->pre_modify;
9466 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9467 cost += addr_cost->post_modify;
9468 else
9469 gcc_unreachable ();
9470
9471 break;
9472
9473 case ADDRESS_REG_REG:
9474 cost += addr_cost->register_offset;
9475 break;
9476
9477 case ADDRESS_REG_SXTW:
9478 cost += addr_cost->register_sextend;
9479 break;
9480
9481 case ADDRESS_REG_UXTW:
9482 cost += addr_cost->register_zextend;
9483 break;
9484
9485 default:
9486 gcc_unreachable ();
9487 }
9488
9489
9490 if (info.shift > 0)
9491 {
9492 /* For the sake of calculating the cost of the shifted register
9493 component, we can treat same sized modes in the same way. */
9494 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9495 cost += addr_cost->addr_scale_costs.hi;
9496 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9497 cost += addr_cost->addr_scale_costs.si;
9498 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9499 cost += addr_cost->addr_scale_costs.di;
9500 else
9501 /* We can't tell, or this is a 128-bit vector. */
9502 cost += addr_cost->addr_scale_costs.ti;
9503 }
9504
9505 return cost;
9506 }
9507
9508 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9509 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9510 to be taken. */
9511
9512 int
9513 aarch64_branch_cost (bool speed_p, bool predictable_p)
9514 {
9515 /* When optimizing for speed, use the cost of unpredictable branches. */
9516 const struct cpu_branch_cost *branch_costs =
9517 aarch64_tune_params.branch_costs;
9518
9519 if (!speed_p || predictable_p)
9520 return branch_costs->predictable;
9521 else
9522 return branch_costs->unpredictable;
9523 }
9524
9525 /* Return true if the RTX X in mode MODE is a zero or sign extract
9526 usable in an ADD or SUB (extended register) instruction. */
9527 static bool
9528 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9529 {
9530 /* Catch add with a sign extract.
9531 This is add_<optab><mode>_multp2. */
9532 if (GET_CODE (x) == SIGN_EXTRACT
9533 || GET_CODE (x) == ZERO_EXTRACT)
9534 {
9535 rtx op0 = XEXP (x, 0);
9536 rtx op1 = XEXP (x, 1);
9537 rtx op2 = XEXP (x, 2);
9538
9539 if (GET_CODE (op0) == MULT
9540 && CONST_INT_P (op1)
9541 && op2 == const0_rtx
9542 && CONST_INT_P (XEXP (op0, 1))
9543 && aarch64_is_extend_from_extract (mode,
9544 XEXP (op0, 1),
9545 op1))
9546 {
9547 return true;
9548 }
9549 }
9550 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9551 No shift. */
9552 else if (GET_CODE (x) == SIGN_EXTEND
9553 || GET_CODE (x) == ZERO_EXTEND)
9554 return REG_P (XEXP (x, 0));
9555
9556 return false;
9557 }
9558
9559 static bool
9560 aarch64_frint_unspec_p (unsigned int u)
9561 {
9562 switch (u)
9563 {
9564 case UNSPEC_FRINTZ:
9565 case UNSPEC_FRINTP:
9566 case UNSPEC_FRINTM:
9567 case UNSPEC_FRINTA:
9568 case UNSPEC_FRINTN:
9569 case UNSPEC_FRINTX:
9570 case UNSPEC_FRINTI:
9571 return true;
9572
9573 default:
9574 return false;
9575 }
9576 }
9577
9578 /* Return true iff X is an rtx that will match an extr instruction
9579 i.e. as described in the *extr<mode>5_insn family of patterns.
9580 OP0 and OP1 will be set to the operands of the shifts involved
9581 on success and will be NULL_RTX otherwise. */
9582
9583 static bool
9584 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9585 {
9586 rtx op0, op1;
9587 scalar_int_mode mode;
9588 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9589 return false;
9590
9591 *res_op0 = NULL_RTX;
9592 *res_op1 = NULL_RTX;
9593
9594 if (GET_CODE (x) != IOR)
9595 return false;
9596
9597 op0 = XEXP (x, 0);
9598 op1 = XEXP (x, 1);
9599
9600 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9601 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9602 {
9603 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9604 if (GET_CODE (op1) == ASHIFT)
9605 std::swap (op0, op1);
9606
9607 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9608 return false;
9609
9610 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9611 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9612
9613 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9614 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9615 {
9616 *res_op0 = XEXP (op0, 0);
9617 *res_op1 = XEXP (op1, 0);
9618 return true;
9619 }
9620 }
9621
9622 return false;
9623 }
9624
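/* Editor's note: a minimal standalone sketch (not part of GCC) of the
   value computed by the IOR-of-shifts form recognized above, written for
   DImode; the function name is made up and the block is guarded with
   #if 0.  When the two shift amounts sum to the register width, this is
   what a single EXTR produces (or an ROR when both inputs are the same
   register).  */
#if 0
#include <stdint.h>

static uint64_t
extr_like (uint64_t x, uint64_t y, unsigned ashift_amount)
{
  /* Assumes 0 < ashift_amount < 64 so that both shifts are well defined
     in C.  */
  return (x << ashift_amount) | (y >> (64 - ashift_amount));
}
#endif
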
9625 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9626 storing it in *COST. Result is true if the total cost of the operation
9627 has now been calculated. */
9628 static bool
9629 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9630 {
9631 rtx inner;
9632 rtx comparator;
9633 enum rtx_code cmpcode;
9634
9635 if (COMPARISON_P (op0))
9636 {
9637 inner = XEXP (op0, 0);
9638 comparator = XEXP (op0, 1);
9639 cmpcode = GET_CODE (op0);
9640 }
9641 else
9642 {
9643 inner = op0;
9644 comparator = const0_rtx;
9645 cmpcode = NE;
9646 }
9647
9648 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9649 {
9650 /* Conditional branch. */
9651 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9652 return true;
9653 else
9654 {
9655 if (cmpcode == NE || cmpcode == EQ)
9656 {
9657 if (comparator == const0_rtx)
9658 {
9659 /* TBZ/TBNZ/CBZ/CBNZ. */
9660 if (GET_CODE (inner) == ZERO_EXTRACT)
9661 /* TBZ/TBNZ. */
9662 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9663 ZERO_EXTRACT, 0, speed);
9664 else
9665 /* CBZ/CBNZ. */
9666 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9667
9668 return true;
9669 }
9670 }
9671 else if (cmpcode == LT || cmpcode == GE)
9672 {
9673 /* TBZ/TBNZ. */
9674 if (comparator == const0_rtx)
9675 return true;
9676 }
9677 }
9678 }
9679 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9680 {
9681 /* CCMP. */
9682 if (GET_CODE (op1) == COMPARE)
9683 {
9684 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9685 if (XEXP (op1, 1) == const0_rtx)
9686 *cost += 1;
9687 if (speed)
9688 {
9689 machine_mode mode = GET_MODE (XEXP (op1, 0));
9690 const struct cpu_cost_table *extra_cost
9691 = aarch64_tune_params.insn_extra_cost;
9692
9693 if (GET_MODE_CLASS (mode) == MODE_INT)
9694 *cost += extra_cost->alu.arith;
9695 else
9696 *cost += extra_cost->fp[mode == DFmode].compare;
9697 }
9698 return true;
9699 }
9700
9701 /* It's a conditional operation based on the status flags,
9702 so it must be some flavor of CSEL. */
9703
9704 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9705 if (GET_CODE (op1) == NEG
9706 || GET_CODE (op1) == NOT
9707 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9708 op1 = XEXP (op1, 0);
9709 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9710 {
9711 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9712 op1 = XEXP (op1, 0);
9713 op2 = XEXP (op2, 0);
9714 }
9715
9716 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9717 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9718 return true;
9719 }
9720
9721 /* We don't know what this is, cost all operands. */
9722 return false;
9723 }
9724
9725 /* Check whether X is a bitfield operation of the form shift + extend that
9726 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9727 operand to which the bitfield operation is applied. Otherwise return
9728 NULL_RTX. */
9729
9730 static rtx
9731 aarch64_extend_bitfield_pattern_p (rtx x)
9732 {
9733 rtx_code outer_code = GET_CODE (x);
9734 machine_mode outer_mode = GET_MODE (x);
9735
9736 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9737 && outer_mode != SImode && outer_mode != DImode)
9738 return NULL_RTX;
9739
9740 rtx inner = XEXP (x, 0);
9741 rtx_code inner_code = GET_CODE (inner);
9742 machine_mode inner_mode = GET_MODE (inner);
9743 rtx op = NULL_RTX;
9744
9745 switch (inner_code)
9746 {
9747 case ASHIFT:
9748 if (CONST_INT_P (XEXP (inner, 1))
9749 && (inner_mode == QImode || inner_mode == HImode))
9750 op = XEXP (inner, 0);
9751 break;
9752 case LSHIFTRT:
9753 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9754 && (inner_mode == QImode || inner_mode == HImode))
9755 op = XEXP (inner, 0);
9756 break;
9757 case ASHIFTRT:
9758 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9759 && (inner_mode == QImode || inner_mode == HImode))
9760 op = XEXP (inner, 0);
9761 break;
9762 default:
9763 break;
9764 }
9765
9766 return op;
9767 }
9768
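/* Editor's note: as an example of the shapes matched above,
   (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) maps to a UBFX
   of the underlying register, while
   (sign_extend:SI (ashiftrt:HI (reg:HI x) (const_int 3))) maps to an SBFX;
   in both cases the function returns the operand of the inner shift
   (the register).  */
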
9769 /* Return true if the mask and a shift amount from an RTX of the form
9770 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9771 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
9772
9773 bool
9774 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9775 rtx shft_amnt)
9776 {
9777 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9778 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9779 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9780 && (INTVAL (mask)
9781 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9782 }
9783
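/* Editor's note: a standalone sketch (not part of GCC) of the check above,
   written for a fixed 64-bit mask and guarded with #if 0; the function
   name is made up.  For example, with mode size 32, mask == 0x00ffff00 and
   shift == 8 the test succeeds: (0x00ffff00 >> 8) + 1 == 0x10000 is a
   power of two and no mask bits lie below the shift amount.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
ubfiz_mask_and_shift_ok (unsigned bitsize, uint64_t mask, unsigned shift)
{
  uint64_t high = mask >> shift;
  return shift < bitsize
	 /* (mask >> shift) + 1 is a power of two, i.e. the shifted mask is
	    a contiguous run of set bits starting at bit 0.  */
	 && (high & (high + 1)) == 0
	 /* No stray bits below the shift amount.  */
	 && (mask & ((UINT64_C (1) << shift) - 1)) == 0;
}
#endif
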
9784 /* Return true if the masks and a shift amount from an RTX of the form
9785 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9786 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
9787
9788 bool
9789 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9790 unsigned HOST_WIDE_INT mask1,
9791 unsigned HOST_WIDE_INT shft_amnt,
9792 unsigned HOST_WIDE_INT mask2)
9793 {
9794 unsigned HOST_WIDE_INT t;
9795
9796 /* Verify that there is no overlap in what bits are set in the two masks. */
9797 if (mask1 != ~mask2)
9798 return false;
9799
9800 /* Verify that mask2 is not all zeros or ones. */
9801 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9802 return false;
9803
9804 /* The shift amount should always be less than the mode size. */
9805 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9806
9807 /* Verify that the mask being shifted is contiguous and would be in the
9808 least significant bits after shifting by shft_amnt. */
9809 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9810 return (t == (t & -t));
9811 }
9812
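/* Editor's note: a worked example of the contiguity test above.  For
   DImode with shft_amnt == 8 and mask2 == 0xff00 (so mask1 == ~0xff00):
     t = 0xff00 + (1 << 8) = 0x10000,
   and 0x10000 == (0x10000 & -0x10000), i.e. t is a power of two, so the
   shifted-in field is a contiguous run of bits starting at bit 8 and the
   combination is accepted as a BFI.  */
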
9813 /* Calculate the cost of calculating X, storing it in *COST. Result
9814 is true if the total cost of the operation has now been calculated. */
9815 static bool
9816 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9817 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9818 {
9819 rtx op0, op1, op2;
9820 const struct cpu_cost_table *extra_cost
9821 = aarch64_tune_params.insn_extra_cost;
9822 int code = GET_CODE (x);
9823 scalar_int_mode int_mode;
9824
9825 /* By default, assume that everything has equivalent cost to the
9826 cheapest instruction. Any additional costs are applied as a delta
9827 above this default. */
9828 *cost = COSTS_N_INSNS (1);
9829
9830 switch (code)
9831 {
9832 case SET:
9833 /* The cost depends entirely on the operands to SET. */
9834 *cost = 0;
9835 op0 = SET_DEST (x);
9836 op1 = SET_SRC (x);
9837
9838 switch (GET_CODE (op0))
9839 {
9840 case MEM:
9841 if (speed)
9842 {
9843 rtx address = XEXP (op0, 0);
9844 if (VECTOR_MODE_P (mode))
9845 *cost += extra_cost->ldst.storev;
9846 else if (GET_MODE_CLASS (mode) == MODE_INT)
9847 *cost += extra_cost->ldst.store;
9848 else if (mode == SFmode)
9849 *cost += extra_cost->ldst.storef;
9850 else if (mode == DFmode)
9851 *cost += extra_cost->ldst.stored;
9852
9853 *cost +=
9854 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9855 0, speed));
9856 }
9857
9858 *cost += rtx_cost (op1, mode, SET, 1, speed);
9859 return true;
9860
9861 case SUBREG:
9862 if (! REG_P (SUBREG_REG (op0)))
9863 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9864
9865 /* Fall through. */
9866 case REG:
9867 /* The cost is one per vector-register copied. */
9868 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9869 {
9870 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9871 *cost = COSTS_N_INSNS (nregs);
9872 }
9873 /* const0_rtx is in general free, but we will use an
9874 instruction to set a register to 0. */
9875 else if (REG_P (op1) || op1 == const0_rtx)
9876 {
9877 /* The cost is 1 per register copied. */
9878 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9879 *cost = COSTS_N_INSNS (nregs);
9880 }
9881 else
9882 /* Cost is just the cost of the RHS of the set. */
9883 *cost += rtx_cost (op1, mode, SET, 1, speed);
9884 return true;
9885
9886 case ZERO_EXTRACT:
9887 case SIGN_EXTRACT:
9888 /* Bit-field insertion. Strip any redundant widening of
9889 the RHS to meet the width of the target. */
9890 if (GET_CODE (op1) == SUBREG)
9891 op1 = SUBREG_REG (op1);
9892 if ((GET_CODE (op1) == ZERO_EXTEND
9893 || GET_CODE (op1) == SIGN_EXTEND)
9894 && CONST_INT_P (XEXP (op0, 1))
9895 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9896 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9897 op1 = XEXP (op1, 0);
9898
9899 if (CONST_INT_P (op1))
9900 {
9901 /* MOV immediate is assumed to always be cheap. */
9902 *cost = COSTS_N_INSNS (1);
9903 }
9904 else
9905 {
9906 /* BFM. */
9907 if (speed)
9908 *cost += extra_cost->alu.bfi;
9909 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9910 }
9911
9912 return true;
9913
9914 default:
9915 /* We can't make sense of this, assume default cost. */
9916 *cost = COSTS_N_INSNS (1);
9917 return false;
9918 }
9919 return false;
9920
9921 case CONST_INT:
9922 /* If an instruction can incorporate a constant within the
9923 instruction, the instruction's expression avoids calling
9924 rtx_cost() on the constant. If rtx_cost() is called on a
9925 constant, then it is usually because the constant must be
9926 moved into a register by one or more instructions.
9927
9928 The exception is constant 0, which can be expressed
9929 as XZR/WZR and is therefore free. The exception to this is
9930 if we have (set (reg) (const0_rtx)) in which case we must cost
9931 the move. However, we can catch that when we cost the SET, so
9932 we don't need to consider that here. */
9933 if (x == const0_rtx)
9934 *cost = 0;
9935 else
9936 {
9937 /* To an approximation, building any other constant is
9938 proportionally expensive to the number of instructions
9939 required to build that constant. This is true whether we
9940 are compiling for SPEED or otherwise. */
9941 if (!is_a <scalar_int_mode> (mode, &int_mode))
9942 int_mode = word_mode;
9943 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9944 (NULL_RTX, x, false, int_mode));
9945 }
9946 return true;
9947
9948 case CONST_DOUBLE:
9949
9950 /* First determine number of instructions to do the move
9951 as an integer constant. */
9952 if (!aarch64_float_const_representable_p (x)
9953 && !aarch64_can_const_movi_rtx_p (x, mode)
9954 && aarch64_float_const_rtx_p (x))
9955 {
9956 unsigned HOST_WIDE_INT ival;
9957 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9958 gcc_assert (succeed);
9959
9960 scalar_int_mode imode = (mode == HFmode
9961 ? SImode
9962 : int_mode_for_mode (mode).require ());
9963 int ncost = aarch64_internal_mov_immediate
9964 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9965 *cost += COSTS_N_INSNS (ncost);
9966 return true;
9967 }
9968
9969 if (speed)
9970 {
9971 /* mov[df,sf]_aarch64. */
9972 if (aarch64_float_const_representable_p (x))
9973 /* FMOV (scalar immediate). */
9974 *cost += extra_cost->fp[mode == DFmode].fpconst;
9975 else if (!aarch64_float_const_zero_rtx_p (x))
9976 {
9977 /* This will be a load from memory. */
9978 if (mode == DFmode)
9979 *cost += extra_cost->ldst.loadd;
9980 else
9981 *cost += extra_cost->ldst.loadf;
9982 }
9983 else
9984 /* Otherwise this is +0.0. We get this using MOVI d0, #0
9985 or MOV v0.s[0], wzr - neither of which are modeled by the
9986 cost tables. Just use the default cost. */
9987 {
9988 }
9989 }
9990
9991 return true;
9992
9993 case MEM:
9994 if (speed)
9995 {
9996 /* For loads we want the base cost of a load, plus an
9997 approximation for the additional cost of the addressing
9998 mode. */
9999 rtx address = XEXP (x, 0);
10000 if (VECTOR_MODE_P (mode))
10001 *cost += extra_cost->ldst.loadv;
10002 else if (GET_MODE_CLASS (mode) == MODE_INT)
10003 *cost += extra_cost->ldst.load;
10004 else if (mode == SFmode)
10005 *cost += extra_cost->ldst.loadf;
10006 else if (mode == DFmode)
10007 *cost += extra_cost->ldst.loadd;
10008
10009 *cost +=
10010 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10011 0, speed));
10012 }
10013
10014 return true;
10015
10016 case NEG:
10017 op0 = XEXP (x, 0);
10018
10019 if (VECTOR_MODE_P (mode))
10020 {
10021 if (speed)
10022 {
10023 /* FNEG. */
10024 *cost += extra_cost->vect.alu;
10025 }
10026 return false;
10027 }
10028
10029 if (GET_MODE_CLASS (mode) == MODE_INT)
10030 {
10031 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10032 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10033 {
10034 /* CSETM. */
10035 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10036 return true;
10037 }
10038
10039 /* Cost this as SUB wzr, X. */
10040 op0 = CONST0_RTX (mode);
10041 op1 = XEXP (x, 0);
10042 goto cost_minus;
10043 }
10044
10045 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10046 {
10047 /* Support (neg(fma...)) as a single instruction only if
10048 sign of zeros is unimportant. This matches the decision
10049 making in aarch64.md. */
10050 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10051 {
10052 /* FNMADD. */
10053 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10054 return true;
10055 }
10056 if (GET_CODE (op0) == MULT)
10057 {
10058 /* FNMUL. */
10059 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10060 return true;
10061 }
10062 if (speed)
10063 /* FNEG. */
10064 *cost += extra_cost->fp[mode == DFmode].neg;
10065 return false;
10066 }
10067
10068 return false;
10069
10070 case CLRSB:
10071 case CLZ:
10072 if (speed)
10073 {
10074 if (VECTOR_MODE_P (mode))
10075 *cost += extra_cost->vect.alu;
10076 else
10077 *cost += extra_cost->alu.clz;
10078 }
10079
10080 return false;
10081
10082 case COMPARE:
10083 op0 = XEXP (x, 0);
10084 op1 = XEXP (x, 1);
10085
10086 if (op1 == const0_rtx
10087 && GET_CODE (op0) == AND)
10088 {
10089 x = op0;
10090 mode = GET_MODE (op0);
10091 goto cost_logic;
10092 }
10093
10094 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10095 {
10096 /* TODO: A write to the CC flags possibly costs extra; this
10097 needs encoding in the cost tables. */
10098
10099 mode = GET_MODE (op0);
10100 /* ANDS. */
10101 if (GET_CODE (op0) == AND)
10102 {
10103 x = op0;
10104 goto cost_logic;
10105 }
10106
10107 if (GET_CODE (op0) == PLUS)
10108 {
10109 /* ADDS (and CMN alias). */
10110 x = op0;
10111 goto cost_plus;
10112 }
10113
10114 if (GET_CODE (op0) == MINUS)
10115 {
10116 /* SUBS. */
10117 x = op0;
10118 goto cost_minus;
10119 }
10120
10121 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10122 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10123 && CONST_INT_P (XEXP (op0, 2)))
10124 {
10125 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10126 Handle it here directly rather than going to cost_logic
10127 since we know the immediate generated for the TST is valid
10128 so we can avoid creating an intermediate rtx for it only
10129 for costing purposes. */
10130 if (speed)
10131 *cost += extra_cost->alu.logical;
10132
10133 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10134 ZERO_EXTRACT, 0, speed);
10135 return true;
10136 }
10137
10138 if (GET_CODE (op1) == NEG)
10139 {
10140 /* CMN. */
10141 if (speed)
10142 *cost += extra_cost->alu.arith;
10143
10144 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10145 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10146 return true;
10147 }
10148
10149 /* CMP.
10150
10151 Compare can freely swap the order of operands, and
10152 canonicalization puts the more complex operation first.
10153 But the integer MINUS logic expects the shift/extend
10154 operation in op1. */
10155 if (! (REG_P (op0)
10156 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10157 {
10158 op0 = XEXP (x, 1);
10159 op1 = XEXP (x, 0);
10160 }
10161 goto cost_minus;
10162 }
10163
10164 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10165 {
10166 /* FCMP. */
10167 if (speed)
10168 *cost += extra_cost->fp[mode == DFmode].compare;
10169
10170 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10171 {
10172 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10173 /* FCMP supports constant 0.0 for no extra cost. */
10174 return true;
10175 }
10176 return false;
10177 }
10178
10179 if (VECTOR_MODE_P (mode))
10180 {
10181 /* Vector compare. */
10182 if (speed)
10183 *cost += extra_cost->vect.alu;
10184
10185 if (aarch64_float_const_zero_rtx_p (op1))
10186 {
10187 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10188 cost. */
10189 return true;
10190 }
10191 return false;
10192 }
10193 return false;
10194
10195 case MINUS:
10196 {
10197 op0 = XEXP (x, 0);
10198 op1 = XEXP (x, 1);
10199
10200 cost_minus:
10201 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10202
10203 /* Detect valid immediates. */
10204 if ((GET_MODE_CLASS (mode) == MODE_INT
10205 || (GET_MODE_CLASS (mode) == MODE_CC
10206 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10207 && CONST_INT_P (op1)
10208 && aarch64_uimm12_shift (INTVAL (op1)))
10209 {
10210 if (speed)
10211 /* SUB(S) (immediate). */
10212 *cost += extra_cost->alu.arith;
10213 return true;
10214 }
10215
10216 /* Look for SUB (extended register). */
10217 if (is_a <scalar_int_mode> (mode, &int_mode)
10218 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10219 {
10220 if (speed)
10221 *cost += extra_cost->alu.extend_arith;
10222
10223 op1 = aarch64_strip_extend (op1, true);
10224 *cost += rtx_cost (op1, VOIDmode,
10225 (enum rtx_code) GET_CODE (op1), 0, speed);
10226 return true;
10227 }
10228
10229 rtx new_op1 = aarch64_strip_extend (op1, false);
10230
10231 /* Cost this as an FMA-alike operation. */
10232 if ((GET_CODE (new_op1) == MULT
10233 || aarch64_shift_p (GET_CODE (new_op1)))
10234 && code != COMPARE)
10235 {
10236 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10237 (enum rtx_code) code,
10238 speed);
10239 return true;
10240 }
10241
10242 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10243
10244 if (speed)
10245 {
10246 if (VECTOR_MODE_P (mode))
10247 {
10248 /* Vector SUB. */
10249 *cost += extra_cost->vect.alu;
10250 }
10251 else if (GET_MODE_CLASS (mode) == MODE_INT)
10252 {
10253 /* SUB(S). */
10254 *cost += extra_cost->alu.arith;
10255 }
10256 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10257 {
10258 /* FSUB. */
10259 *cost += extra_cost->fp[mode == DFmode].addsub;
10260 }
10261 }
10262 return true;
10263 }
10264
10265 case PLUS:
10266 {
10267 rtx new_op0;
10268
10269 op0 = XEXP (x, 0);
10270 op1 = XEXP (x, 1);
10271
10272 cost_plus:
10273 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10274 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10275 {
10276 /* CSINC. */
10277 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10278 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10279 return true;
10280 }
10281
10282 if (GET_MODE_CLASS (mode) == MODE_INT
10283 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10284 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10285 {
10286 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10287
10288 if (speed)
10289 /* ADD (immediate). */
10290 *cost += extra_cost->alu.arith;
10291 return true;
10292 }
10293
10294 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10295
10296 /* Look for ADD (extended register). */
10297 if (is_a <scalar_int_mode> (mode, &int_mode)
10298 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10299 {
10300 if (speed)
10301 *cost += extra_cost->alu.extend_arith;
10302
10303 op0 = aarch64_strip_extend (op0, true);
10304 *cost += rtx_cost (op0, VOIDmode,
10305 (enum rtx_code) GET_CODE (op0), 0, speed);
10306 return true;
10307 }
10308
10309 /* Strip any extend, leave shifts behind as we will
10310 cost them through mult_cost. */
10311 new_op0 = aarch64_strip_extend (op0, false);
10312
10313 if (GET_CODE (new_op0) == MULT
10314 || aarch64_shift_p (GET_CODE (new_op0)))
10315 {
10316 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10317 speed);
10318 return true;
10319 }
10320
10321 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10322
10323 if (speed)
10324 {
10325 if (VECTOR_MODE_P (mode))
10326 {
10327 /* Vector ADD. */
10328 *cost += extra_cost->vect.alu;
10329 }
10330 else if (GET_MODE_CLASS (mode) == MODE_INT)
10331 {
10332 /* ADD. */
10333 *cost += extra_cost->alu.arith;
10334 }
10335 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10336 {
10337 /* FADD. */
10338 *cost += extra_cost->fp[mode == DFmode].addsub;
10339 }
10340 }
10341 return true;
10342 }
10343
10344 case BSWAP:
10345 *cost = COSTS_N_INSNS (1);
10346
10347 if (speed)
10348 {
10349 if (VECTOR_MODE_P (mode))
10350 *cost += extra_cost->vect.alu;
10351 else
10352 *cost += extra_cost->alu.rev;
10353 }
10354 return false;
10355
10356 case IOR:
10357 if (aarch_rev16_p (x))
10358 {
10359 *cost = COSTS_N_INSNS (1);
10360
10361 if (speed)
10362 {
10363 if (VECTOR_MODE_P (mode))
10364 *cost += extra_cost->vect.alu;
10365 else
10366 *cost += extra_cost->alu.rev;
10367 }
10368 return true;
10369 }
10370
10371 if (aarch64_extr_rtx_p (x, &op0, &op1))
10372 {
10373 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10374 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10375 if (speed)
10376 *cost += extra_cost->alu.shift;
10377
10378 return true;
10379 }
10380 /* Fall through. */
10381 case XOR:
10382 case AND:
10383 cost_logic:
10384 op0 = XEXP (x, 0);
10385 op1 = XEXP (x, 1);
10386
10387 if (VECTOR_MODE_P (mode))
10388 {
10389 if (speed)
10390 *cost += extra_cost->vect.alu;
10391 return true;
10392 }
10393
10394 if (code == AND
10395 && GET_CODE (op0) == MULT
10396 && CONST_INT_P (XEXP (op0, 1))
10397 && CONST_INT_P (op1)
10398 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10399 INTVAL (op1)) != 0)
10400 {
10401 /* This is a UBFM/SBFM. */
10402 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10403 if (speed)
10404 *cost += extra_cost->alu.bfx;
10405 return true;
10406 }
10407
10408 if (is_int_mode (mode, &int_mode))
10409 {
10410 if (CONST_INT_P (op1))
10411 {
10412 /* We have a mask + shift version of a UBFIZ
10413 i.e. the *andim_ashift<mode>_bfiz pattern. */
10414 if (GET_CODE (op0) == ASHIFT
10415 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10416 XEXP (op0, 1)))
10417 {
10418 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10419 (enum rtx_code) code, 0, speed);
10420 if (speed)
10421 *cost += extra_cost->alu.bfx;
10422
10423 return true;
10424 }
10425 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10426 {
10427 /* We possibly get the immediate for free; this is not
10428 modelled. */
10429 *cost += rtx_cost (op0, int_mode,
10430 (enum rtx_code) code, 0, speed);
10431 if (speed)
10432 *cost += extra_cost->alu.logical;
10433
10434 return true;
10435 }
10436 }
10437 else
10438 {
10439 rtx new_op0 = op0;
10440
10441 /* Handle ORN, EON, or BIC. */
10442 if (GET_CODE (op0) == NOT)
10443 op0 = XEXP (op0, 0);
10444
10445 new_op0 = aarch64_strip_shift (op0);
10446
10447 /* If we had a shift on op0 then this is a logical-shift-
10448 by-register/immediate operation. Otherwise, this is just
10449 a logical operation. */
10450 if (speed)
10451 {
10452 if (new_op0 != op0)
10453 {
10454 /* Shift by immediate. */
10455 if (CONST_INT_P (XEXP (op0, 1)))
10456 *cost += extra_cost->alu.log_shift;
10457 else
10458 *cost += extra_cost->alu.log_shift_reg;
10459 }
10460 else
10461 *cost += extra_cost->alu.logical;
10462 }
10463
10464 /* In both cases we want to cost both operands. */
10465 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10466 0, speed);
10467 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10468 1, speed);
10469
10470 return true;
10471 }
10472 }
10473 return false;
10474
10475 case NOT:
10476 x = XEXP (x, 0);
10477 op0 = aarch64_strip_shift (x);
10478
10479 if (VECTOR_MODE_P (mode))
10480 {
10481 /* Vector NOT. */
10482 *cost += extra_cost->vect.alu;
10483 return false;
10484 }
10485
10486 /* MVN-shifted-reg. */
10487 if (op0 != x)
10488 {
10489 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10490
10491 if (speed)
10492 *cost += extra_cost->alu.log_shift;
10493
10494 return true;
10495 }
10496 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10497 Handle the second form here taking care that 'a' in the above can
10498 be a shift. */
10499 else if (GET_CODE (op0) == XOR)
10500 {
10501 rtx newop0 = XEXP (op0, 0);
10502 rtx newop1 = XEXP (op0, 1);
10503 rtx op0_stripped = aarch64_strip_shift (newop0);
10504
10505 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10506 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10507
10508 if (speed)
10509 {
10510 if (op0_stripped != newop0)
10511 *cost += extra_cost->alu.log_shift;
10512 else
10513 *cost += extra_cost->alu.logical;
10514 }
10515
10516 return true;
10517 }
10518 /* MVN. */
10519 if (speed)
10520 *cost += extra_cost->alu.logical;
10521
10522 return false;
10523
10524 case ZERO_EXTEND:
10525
10526 op0 = XEXP (x, 0);
10527 /* If a value is written in SI mode, then zero extended to DI
10528 mode, the operation will in general be free as a write to
10529 a 'w' register implicitly zeroes the upper bits of an 'x'
10530 register. However, if this is
10531
10532 (set (reg) (zero_extend (reg)))
10533
10534 we must cost the explicit register move. */
10535 if (mode == DImode
10536 && GET_MODE (op0) == SImode
10537 && outer == SET)
10538 {
10539 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10540
10541 /* If OP_COST is non-zero, then the cost of the zero extend
10542 is effectively the cost of the inner operation. Otherwise
10543 we have a MOV instruction and we take the cost from the MOV
10544 itself. This is true independently of whether we are
10545 optimizing for space or time. */
10546 if (op_cost)
10547 *cost = op_cost;
10548
10549 return true;
10550 }
10551 else if (MEM_P (op0))
10552 {
10553 /* All loads can zero extend to any size for free. */
10554 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10555 return true;
10556 }
10557
10558 op0 = aarch64_extend_bitfield_pattern_p (x);
10559 if (op0)
10560 {
10561 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10562 if (speed)
10563 *cost += extra_cost->alu.bfx;
10564 return true;
10565 }
10566
10567 if (speed)
10568 {
10569 if (VECTOR_MODE_P (mode))
10570 {
10571 /* UMOV. */
10572 *cost += extra_cost->vect.alu;
10573 }
10574 else
10575 {
10576 /* We generate an AND instead of UXTB/UXTH. */
10577 *cost += extra_cost->alu.logical;
10578 }
10579 }
10580 return false;
10581
10582 case SIGN_EXTEND:
10583 if (MEM_P (XEXP (x, 0)))
10584 {
10585 /* LDRSH. */
10586 if (speed)
10587 {
10588 rtx address = XEXP (XEXP (x, 0), 0);
10589 *cost += extra_cost->ldst.load_sign_extend;
10590
10591 *cost +=
10592 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10593 0, speed));
10594 }
10595 return true;
10596 }
10597
10598 op0 = aarch64_extend_bitfield_pattern_p (x);
10599 if (op0)
10600 {
10601 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10602 if (speed)
10603 *cost += extra_cost->alu.bfx;
10604 return true;
10605 }
10606
10607 if (speed)
10608 {
10609 if (VECTOR_MODE_P (mode))
10610 *cost += extra_cost->vect.alu;
10611 else
10612 *cost += extra_cost->alu.extend;
10613 }
10614 return false;
10615
10616 case ASHIFT:
10617 op0 = XEXP (x, 0);
10618 op1 = XEXP (x, 1);
10619
10620 if (CONST_INT_P (op1))
10621 {
10622 if (speed)
10623 {
10624 if (VECTOR_MODE_P (mode))
10625 {
10626 /* Vector shift (immediate). */
10627 *cost += extra_cost->vect.alu;
10628 }
10629 else
10630 {
10631 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10632 aliases. */
10633 *cost += extra_cost->alu.shift;
10634 }
10635 }
10636
10637 /* We can incorporate zero/sign extend for free. */
10638 if (GET_CODE (op0) == ZERO_EXTEND
10639 || GET_CODE (op0) == SIGN_EXTEND)
10640 op0 = XEXP (op0, 0);
10641
10642 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10643 return true;
10644 }
10645 else
10646 {
10647 if (VECTOR_MODE_P (mode))
10648 {
10649 if (speed)
10650 /* Vector shift (register). */
10651 *cost += extra_cost->vect.alu;
10652 }
10653 else
10654 {
10655 if (speed)
10656 /* LSLV. */
10657 *cost += extra_cost->alu.shift_reg;
10658
10659 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10660 && CONST_INT_P (XEXP (op1, 1))
10661 && known_eq (INTVAL (XEXP (op1, 1)),
10662 GET_MODE_BITSIZE (mode) - 1))
10663 {
10664 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10665 /* We already demanded XEXP (op1, 0) to be REG_P, so
10666 don't recurse into it. */
10667 return true;
10668 }
10669 }
10670 return false; /* All arguments need to be in registers. */
10671 }
10672
10673 case ROTATE:
10674 case ROTATERT:
10675 case LSHIFTRT:
10676 case ASHIFTRT:
10677 op0 = XEXP (x, 0);
10678 op1 = XEXP (x, 1);
10679
10680 if (CONST_INT_P (op1))
10681 {
10682 /* ASR (immediate) and friends. */
10683 if (speed)
10684 {
10685 if (VECTOR_MODE_P (mode))
10686 *cost += extra_cost->vect.alu;
10687 else
10688 *cost += extra_cost->alu.shift;
10689 }
10690
10691 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10692 return true;
10693 }
10694 else
10695 {
10696 if (VECTOR_MODE_P (mode))
10697 {
10698 if (speed)
10699 /* Vector shift (register). */
10700 *cost += extra_cost->vect.alu;
10701 }
10702 else
10703 {
10704 if (speed)
10705 /* ASR (register) and friends. */
10706 *cost += extra_cost->alu.shift_reg;
10707
10708 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10709 && CONST_INT_P (XEXP (op1, 1))
10710 && known_eq (INTVAL (XEXP (op1, 1)),
10711 GET_MODE_BITSIZE (mode) - 1))
10712 {
10713 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10714 /* We already demanded XEXP (op1, 0) to be REG_P, so
10715 don't recurse into it. */
10716 return true;
10717 }
10718 }
10719 return false; /* All arguments need to be in registers. */
10720 }
10721
10722 case SYMBOL_REF:
10723
10724 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10725 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10726 {
10727 /* LDR. */
10728 if (speed)
10729 *cost += extra_cost->ldst.load;
10730 }
10731 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10732 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10733 {
10734 /* ADRP, followed by ADD. */
10735 *cost += COSTS_N_INSNS (1);
10736 if (speed)
10737 *cost += 2 * extra_cost->alu.arith;
10738 }
10739 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10740 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10741 {
10742 /* ADR. */
10743 if (speed)
10744 *cost += extra_cost->alu.arith;
10745 }
10746
10747 if (flag_pic)
10748 {
10749 /* One extra load instruction, after accessing the GOT. */
10750 *cost += COSTS_N_INSNS (1);
10751 if (speed)
10752 *cost += extra_cost->ldst.load;
10753 }
10754 return true;
10755
10756 case HIGH:
10757 case LO_SUM:
10758 /* ADRP/ADD (immediate). */
10759 if (speed)
10760 *cost += extra_cost->alu.arith;
10761 return true;
10762
10763 case ZERO_EXTRACT:
10764 case SIGN_EXTRACT:
10765 /* UBFX/SBFX. */
10766 if (speed)
10767 {
10768 if (VECTOR_MODE_P (mode))
10769 *cost += extra_cost->vect.alu;
10770 else
10771 *cost += extra_cost->alu.bfx;
10772 }
10773
10774 /* We can trust that the immediates used will be correct (there
10775 are no by-register forms), so we need only cost op0. */
10776 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10777 return true;
10778
10779 case MULT:
10780 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10781 /* aarch64_rtx_mult_cost always handles recursion to its
10782 operands. */
10783 return true;
10784
10785 case MOD:
10786 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10787 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
10788 an unconditional negate. This case should only ever be reached through
10789 the set_smod_pow2_cheap check in expmed.c. */
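/* Editor's note: as a hedged illustration (the exact registers and the
   condition used are up to the expander), x % 8 for a signed 32-bit x is
   expected to become a sequence along the lines of:
	negs	w1, w0
	and	w0, w0, #7
	and	w1, w1, #7
	csneg	w0, w0, w1, mi
   i.e. the NEGS + two ANDs + CSNEG costed as four instructions below.  */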
10790 if (CONST_INT_P (XEXP (x, 1))
10791 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10792 && (mode == SImode || mode == DImode))
10793 {
10794 /* We expand to 4 instructions. Reset the baseline. */
10795 *cost = COSTS_N_INSNS (4);
10796
10797 if (speed)
10798 *cost += 2 * extra_cost->alu.logical
10799 + 2 * extra_cost->alu.arith;
10800
10801 return true;
10802 }
10803
10804 /* Fall-through. */
10805 case UMOD:
10806 if (speed)
10807 {
10808 /* Slightly prefer UMOD over SMOD. */
10809 if (VECTOR_MODE_P (mode))
10810 *cost += extra_cost->vect.alu;
10811 else if (GET_MODE_CLASS (mode) == MODE_INT)
10812 *cost += (extra_cost->mult[mode == DImode].add
10813 + extra_cost->mult[mode == DImode].idiv
10814 + (code == MOD ? 1 : 0));
10815 }
10816 return false; /* All arguments need to be in registers. */
10817
10818 case DIV:
10819 case UDIV:
10820 case SQRT:
10821 if (speed)
10822 {
10823 if (VECTOR_MODE_P (mode))
10824 *cost += extra_cost->vect.alu;
10825 else if (GET_MODE_CLASS (mode) == MODE_INT)
10826 /* There is no integer SQRT, so only DIV and UDIV can get
10827 here. */
10828 *cost += (extra_cost->mult[mode == DImode].idiv
10829 /* Slightly prefer UDIV over SDIV. */
10830 + (code == DIV ? 1 : 0));
10831 else
10832 *cost += extra_cost->fp[mode == DFmode].div;
10833 }
10834 return false; /* All arguments need to be in registers. */
10835
10836 case IF_THEN_ELSE:
10837 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10838 XEXP (x, 2), cost, speed);
10839
10840 case EQ:
10841 case NE:
10842 case GT:
10843 case GTU:
10844 case LT:
10845 case LTU:
10846 case GE:
10847 case GEU:
10848 case LE:
10849 case LEU:
10850
10851 return false; /* All arguments must be in registers. */
10852
10853 case FMA:
10854 op0 = XEXP (x, 0);
10855 op1 = XEXP (x, 1);
10856 op2 = XEXP (x, 2);
10857
10858 if (speed)
10859 {
10860 if (VECTOR_MODE_P (mode))
10861 *cost += extra_cost->vect.alu;
10862 else
10863 *cost += extra_cost->fp[mode == DFmode].fma;
10864 }
10865
10866 /* FMSUB, FNMADD, and FNMSUB are free. */
10867 if (GET_CODE (op0) == NEG)
10868 op0 = XEXP (op0, 0);
10869
10870 if (GET_CODE (op2) == NEG)
10871 op2 = XEXP (op2, 0);
10872
10873 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10874 and the by-element operand as operand 0. */
10875 if (GET_CODE (op1) == NEG)
10876 op1 = XEXP (op1, 0);
10877
10878 /* Catch vector-by-element operations. The by-element operand can
10879 either be (vec_duplicate (vec_select (x))) or just
10880 (vec_select (x)), depending on whether we are multiplying by
10881 a vector or a scalar.
10882
10883 Canonicalization is not very good in these cases: FMA4 will put the
10884 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
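 /* Informally, a by-element multiply-accumulate might look like
 (a sketch, not an exact pattern from the .md files):

 (fma:V2DF
 (vec_duplicate:V2DF
 (vec_select:DF (reg:V2DF) (parallel [(const_int 0)])))
 (reg:V2DF)
 (reg:V2DF))

 and the code below strips the duplicate/select wrappers so that only the
 underlying register operands are costed. */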
10885 if (GET_CODE (op0) == VEC_DUPLICATE)
10886 op0 = XEXP (op0, 0);
10887 else if (GET_CODE (op1) == VEC_DUPLICATE)
10888 op1 = XEXP (op1, 0);
10889
10890 if (GET_CODE (op0) == VEC_SELECT)
10891 op0 = XEXP (op0, 0);
10892 else if (GET_CODE (op1) == VEC_SELECT)
10893 op1 = XEXP (op1, 0);
10894
10895 /* If the remaining parameters are not registers,
10896 get the cost to put them into registers. */
10897 *cost += rtx_cost (op0, mode, FMA, 0, speed);
10898 *cost += rtx_cost (op1, mode, FMA, 1, speed);
10899 *cost += rtx_cost (op2, mode, FMA, 2, speed);
10900 return true;
10901
10902 case FLOAT:
10903 case UNSIGNED_FLOAT:
10904 if (speed)
10905 *cost += extra_cost->fp[mode == DFmode].fromint;
10906 return false;
10907
10908 case FLOAT_EXTEND:
10909 if (speed)
10910 {
10911 if (VECTOR_MODE_P (mode))
10912 {
10913 /* Vector extend. */
10914 *cost += extra_cost->vect.alu;
10915 }
10916 else
10917 *cost += extra_cost->fp[mode == DFmode].widen;
10918 }
10919 return false;
10920
10921 case FLOAT_TRUNCATE:
10922 if (speed)
10923 {
10924 if (VECTOR_MODE_P (mode))
10925 {
10926 /* Vector truncate. */
10927 *cost += extra_cost->vect.alu;
10928 }
10929 else
10930 *cost += extra_cost->fp[mode == DFmode].narrow;
10931 }
10932 return false;
10933
10934 case FIX:
10935 case UNSIGNED_FIX:
10936 x = XEXP (x, 0);
10937 /* Strip the rounding part. They will all be implemented
10938 by the fcvt* family of instructions anyway. */
10939 if (GET_CODE (x) == UNSPEC)
10940 {
10941 unsigned int uns_code = XINT (x, 1);
10942
10943 if (uns_code == UNSPEC_FRINTA
10944 || uns_code == UNSPEC_FRINTM
10945 || uns_code == UNSPEC_FRINTN
10946 || uns_code == UNSPEC_FRINTP
10947 || uns_code == UNSPEC_FRINTZ)
10948 x = XVECEXP (x, 0, 0);
10949 }
10950
10951 if (speed)
10952 {
10953 if (VECTOR_MODE_P (mode))
10954 *cost += extra_cost->vect.alu;
10955 else
10956 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10957 }
10958
10959 /* We can combine fmul by a power of 2 followed by a fcvt into a single
10960 fixed-point fcvt. */
10961 if (GET_CODE (x) == MULT
10962 && ((VECTOR_MODE_P (mode)
10963 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10964 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10965 {
10966 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10967 0, speed);
10968 return true;
10969 }
10970
10971 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10972 return true;
10973
10974 case ABS:
10975 if (VECTOR_MODE_P (mode))
10976 {
10977 /* ABS (vector). */
10978 if (speed)
10979 *cost += extra_cost->vect.alu;
10980 }
10981 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10982 {
10983 op0 = XEXP (x, 0);
10984
10985 /* FABD, which is analogous to FADD. */
10986 if (GET_CODE (op0) == MINUS)
10987 {
10988 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10989 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10990 if (speed)
10991 *cost += extra_cost->fp[mode == DFmode].addsub;
10992
10993 return true;
10994 }
10995 /* Simple FABS is analogous to FNEG. */
10996 if (speed)
10997 *cost += extra_cost->fp[mode == DFmode].neg;
10998 }
10999 else
11000 {
11001 /* Integer ABS will either be split to
11002 two arithmetic instructions, or will be an ABS
11003 (scalar), which we don't model. */
11004 *cost = COSTS_N_INSNS (2);
11005 if (speed)
11006 *cost += 2 * extra_cost->alu.arith;
11007 }
11008 return false;
11009
11010 case SMAX:
11011 case SMIN:
11012 if (speed)
11013 {
11014 if (VECTOR_MODE_P (mode))
11015 *cost += extra_cost->vect.alu;
11016 else
11017 {
11018 /* FMAXNM/FMINNM/FMAX/FMIN.
11019 TODO: This may not be accurate for all implementations, but
11020 we do not model this in the cost tables. */
11021 *cost += extra_cost->fp[mode == DFmode].addsub;
11022 }
11023 }
11024 return false;
11025
11026 case UNSPEC:
11027 /* The floating point round to integer frint* instructions. */
11028 if (aarch64_frint_unspec_p (XINT (x, 1)))
11029 {
11030 if (speed)
11031 *cost += extra_cost->fp[mode == DFmode].roundint;
11032
11033 return false;
11034 }
11035
11036 if (XINT (x, 1) == UNSPEC_RBIT)
11037 {
11038 if (speed)
11039 *cost += extra_cost->alu.rev;
11040
11041 return false;
11042 }
11043 break;
11044
11045 case TRUNCATE:
11046
11047 /* Decompose <su>muldi3_highpart. */
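 /* Informally, the shape matched piece by piece below is

 (truncate:DI
 (lshiftrt:TI
 (mult:TI (ANY_EXTEND:TI (reg:DI))
 (ANY_EXTEND:TI (reg:DI)))
 (const_int 64)))

 with both extensions of the same signedness. */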
11048 if (/* (truncate:DI */
11049 mode == DImode
11050 /* (lshiftrt:TI */
11051 && GET_MODE (XEXP (x, 0)) == TImode
11052 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11053 /* (mult:TI */
11054 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11055 /* (ANY_EXTEND:TI (reg:DI))
11056 (ANY_EXTEND:TI (reg:DI))) */
11057 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11058 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11059 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11060 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11061 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11062 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11063 /* (const_int 64) */
11064 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11065 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11066 {
11067 /* UMULH/SMULH. */
11068 if (speed)
11069 *cost += extra_cost->mult[mode == DImode].extend;
11070 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11071 mode, MULT, 0, speed);
11072 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11073 mode, MULT, 1, speed);
11074 return true;
11075 }
11076
11077 /* Fall through. */
11078 default:
11079 break;
11080 }
11081
11082 if (dump_file
11083 && flag_aarch64_verbose_cost)
11084 fprintf (dump_file,
11085 "\nFailed to cost RTX. Assuming default cost.\n");
11086
11087 return true;
11088 }
11089
11090 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11091 calculated for X. This cost is stored in *COST. Returns true
11092 if the total cost of X was calculated. */
11093 static bool
11094 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11095 int param, int *cost, bool speed)
11096 {
11097 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11098
11099 if (dump_file
11100 && flag_aarch64_verbose_cost)
11101 {
11102 print_rtl_single (dump_file, x);
11103 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11104 speed ? "Hot" : "Cold",
11105 *cost, result ? "final" : "partial");
11106 }
11107
11108 return result;
11109 }
11110
11111 static int
11112 aarch64_register_move_cost (machine_mode mode,
11113 reg_class_t from_i, reg_class_t to_i)
11114 {
11115 enum reg_class from = (enum reg_class) from_i;
11116 enum reg_class to = (enum reg_class) to_i;
11117 const struct cpu_regmove_cost *regmove_cost
11118 = aarch64_tune_params.regmove_cost;
11119
11120 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11121 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11122 to = GENERAL_REGS;
11123
11124 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11125 from = GENERAL_REGS;
11126
11127 /* Moving between a GPR and the stack register costs the same as GP2GP. */
11128 if ((from == GENERAL_REGS && to == STACK_REG)
11129 || (to == GENERAL_REGS && from == STACK_REG))
11130 return regmove_cost->GP2GP;
11131
11132 /* To/From the stack register, we move via the gprs. */
11133 if (to == STACK_REG || from == STACK_REG)
11134 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11135 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11136
11137 if (known_eq (GET_MODE_SIZE (mode), 16))
11138 {
11139 /* 128-bit operations on general registers require 2 instructions. */
11140 if (from == GENERAL_REGS && to == GENERAL_REGS)
11141 return regmove_cost->GP2GP * 2;
11142 else if (from == GENERAL_REGS)
11143 return regmove_cost->GP2FP * 2;
11144 else if (to == GENERAL_REGS)
11145 return regmove_cost->FP2GP * 2;
11146
11147 /* When AdvSIMD instructions are disabled it is not possible to move
11148 a 128-bit value directly between Q registers. This is handled in
11149 secondary reload. A general register is used as a scratch to move
11150 the upper DI value and the lower DI value is moved directly,
11151 hence the cost is the sum of three moves. */
11152 if (! TARGET_SIMD)
11153 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11154
11155 return regmove_cost->FP2FP;
11156 }
11157
11158 if (from == GENERAL_REGS && to == GENERAL_REGS)
11159 return regmove_cost->GP2GP;
11160 else if (from == GENERAL_REGS)
11161 return regmove_cost->GP2FP;
11162 else if (to == GENERAL_REGS)
11163 return regmove_cost->FP2GP;
11164
11165 return regmove_cost->FP2FP;
11166 }
11167
11168 static int
11169 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11170 reg_class_t rclass ATTRIBUTE_UNUSED,
11171 bool in ATTRIBUTE_UNUSED)
11172 {
11173 return aarch64_tune_params.memmov_cost;
11174 }
11175
11176 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11177 to optimize 1.0/sqrt. */
11178
11179 static bool
11180 use_rsqrt_p (machine_mode mode)
11181 {
11182 return (!flag_trapping_math
11183 && flag_unsafe_math_optimizations
11184 && ((aarch64_tune_params.approx_modes->recip_sqrt
11185 & AARCH64_APPROX_MODE (mode))
11186 || flag_mrecip_low_precision_sqrt));
11187 }
11188
11189 /* Function to decide when to use the approximate reciprocal square root
11190 builtin. */
11191
11192 static tree
11193 aarch64_builtin_reciprocal (tree fndecl)
11194 {
11195 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11196
11197 if (!use_rsqrt_p (mode))
11198 return NULL_TREE;
11199 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11200 }
11201
11202 /* Emit instruction sequence to compute either the approximate square root
11203 or its approximate reciprocal, depending on the flag RECP, and return
11204 whether the sequence was emitted or not. */
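/* Background (informal): FRSQRTE provides an initial estimate E of
 1/sqrt(X), and each FRSQRTS step computes (3 - A*B)/2, so the loop below
 effectively performs the Newton-Raphson refinement

 E' = E * (3 - X * E * E) / 2

 which roughly doubles the number of accurate bits per iteration. */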
11205
11206 bool
11207 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11208 {
11209 machine_mode mode = GET_MODE (dst);
11210
11211 if (GET_MODE_INNER (mode) == HFmode)
11212 {
11213 gcc_assert (!recp);
11214 return false;
11215 }
11216
11217 if (!recp)
11218 {
11219 if (!(flag_mlow_precision_sqrt
11220 || (aarch64_tune_params.approx_modes->sqrt
11221 & AARCH64_APPROX_MODE (mode))))
11222 return false;
11223
11224 if (flag_finite_math_only
11225 || flag_trapping_math
11226 || !flag_unsafe_math_optimizations
11227 || optimize_function_for_size_p (cfun))
11228 return false;
11229 }
11230 else
11231 /* Caller assumes we cannot fail. */
11232 gcc_assert (use_rsqrt_p (mode));
11233
11234 machine_mode mmsk = mode_for_int_vector (mode).require ();
11235 rtx xmsk = gen_reg_rtx (mmsk);
11236 if (!recp)
11237 /* When calculating the approximate square root, compare the
11238 argument with 0.0 and create a mask. */
11239 emit_insn (gen_rtx_SET (xmsk,
11240 gen_rtx_NEG (mmsk,
11241 gen_rtx_EQ (mmsk, src,
11242 CONST0_RTX (mode)))));
11243
11244 /* Estimate the approximate reciprocal square root. */
11245 rtx xdst = gen_reg_rtx (mode);
11246 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11247
11248 /* Iterate over the series twice for SF and thrice for DF. */
11249 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11250
11251 /* Optionally iterate over the series one time fewer, trading some
11252 accuracy for faster performance. */
11253 if ((recp && flag_mrecip_low_precision_sqrt)
11254 || (!recp && flag_mlow_precision_sqrt))
11255 iterations--;
11256
11257 /* Iterate over the series to calculate the approximate reciprocal square
11258 root. */
11259 rtx x1 = gen_reg_rtx (mode);
11260 while (iterations--)
11261 {
11262 rtx x2 = gen_reg_rtx (mode);
11263 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11264
11265 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11266
11267 if (iterations > 0)
11268 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11269 }
11270
11271 if (!recp)
11272 {
11273 /* Qualify the approximate reciprocal square root when the argument is
11274 0.0 by squashing the intermediary result to 0.0. */
11275 rtx xtmp = gen_reg_rtx (mmsk);
11276 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11277 gen_rtx_SUBREG (mmsk, xdst, 0)));
11278 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11279
11280 /* Calculate the approximate square root. */
11281 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11282 }
11283
11284 /* Finalize the approximation. */
11285 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11286
11287 return true;
11288 }
11289
11290 /* Emit the instruction sequence to compute the approximation for the division
11291 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
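/* Background (informal): FRECPE provides an initial estimate E of 1/DEN,
 and each FRECPS step computes 2 - A*B, so the loop below performs the
 Newton-Raphson refinement

 E' = E * (2 - DEN * E)

 after which the quotient is approximated as NUM * (1/DEN). */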
11292
11293 bool
11294 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11295 {
11296 machine_mode mode = GET_MODE (quo);
11297
11298 if (GET_MODE_INNER (mode) == HFmode)
11299 return false;
11300
11301 bool use_approx_division_p = (flag_mlow_precision_div
11302 || (aarch64_tune_params.approx_modes->division
11303 & AARCH64_APPROX_MODE (mode)));
11304
11305 if (!flag_finite_math_only
11306 || flag_trapping_math
11307 || !flag_unsafe_math_optimizations
11308 || optimize_function_for_size_p (cfun)
11309 || !use_approx_division_p)
11310 return false;
11311
11312 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11313 return false;
11314
11315 /* Estimate the approximate reciprocal. */
11316 rtx xrcp = gen_reg_rtx (mode);
11317 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11318
11319 /* Iterate over the series twice for SF and thrice for DF. */
11320 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11321
11322 /* Optionally iterate over the series one time fewer, trading some
11323 accuracy for faster performance. */
11324 if (flag_mlow_precision_div)
11325 iterations--;
11326
11327 /* Iterate over the series to calculate the approximate reciprocal. */
11328 rtx xtmp = gen_reg_rtx (mode);
11329 while (iterations--)
11330 {
11331 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11332
11333 if (iterations > 0)
11334 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11335 }
11336
11337 if (num != CONST1_RTX (mode))
11338 {
11339 /* As the approximate reciprocal of DEN is already calculated, only
11340 calculate the approximate division when NUM is not 1.0. */
11341 rtx xnum = force_reg (mode, num);
11342 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11343 }
11344
11345 /* Finalize the approximation. */
11346 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11347 return true;
11348 }
11349
11350 /* Return the number of instructions that can be issued per cycle. */
11351 static int
11352 aarch64_sched_issue_rate (void)
11353 {
11354 return aarch64_tune_params.issue_rate;
11355 }
11356
11357 static int
11358 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11359 {
11360 int issue_rate = aarch64_sched_issue_rate ();
11361
11362 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11363 }
11364
11365
11366 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11367 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11368 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11369
11370 static int
11371 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11372 int ready_index)
11373 {
11374 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11375 }
11376
11377
11378 /* Vectorizer cost model target hooks. */
11379
11380 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11381 static int
11382 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11383 tree vectype,
11384 int misalign ATTRIBUTE_UNUSED)
11385 {
11386 unsigned elements;
11387 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11388 bool fp = false;
11389
11390 if (vectype != NULL)
11391 fp = FLOAT_TYPE_P (vectype);
11392
11393 switch (type_of_cost)
11394 {
11395 case scalar_stmt:
11396 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11397
11398 case scalar_load:
11399 return costs->scalar_load_cost;
11400
11401 case scalar_store:
11402 return costs->scalar_store_cost;
11403
11404 case vector_stmt:
11405 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11406
11407 case vector_load:
11408 return costs->vec_align_load_cost;
11409
11410 case vector_store:
11411 return costs->vec_store_cost;
11412
11413 case vec_to_scalar:
11414 return costs->vec_to_scalar_cost;
11415
11416 case scalar_to_vec:
11417 return costs->scalar_to_vec_cost;
11418
11419 case unaligned_load:
11420 case vector_gather_load:
11421 return costs->vec_unalign_load_cost;
11422
11423 case unaligned_store:
11424 case vector_scatter_store:
11425 return costs->vec_unalign_store_cost;
11426
11427 case cond_branch_taken:
11428 return costs->cond_taken_branch_cost;
11429
11430 case cond_branch_not_taken:
11431 return costs->cond_not_taken_branch_cost;
11432
11433 case vec_perm:
11434 return costs->vec_permute_cost;
11435
11436 case vec_promote_demote:
11437 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11438
11439 case vec_construct:
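 /* Rough heuristic: presumably about one statement per pair of
 element inserts, plus one to set up the vector. */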
11440 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11441 return elements / 2 + 1;
11442
11443 default:
11444 gcc_unreachable ();
11445 }
11446 }
11447
11448 /* Implement targetm.vectorize.add_stmt_cost. */
11449 static unsigned
11450 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11451 struct _stmt_vec_info *stmt_info, int misalign,
11452 enum vect_cost_model_location where)
11453 {
11454 unsigned *cost = (unsigned *) data;
11455 unsigned retval = 0;
11456
11457 if (flag_vect_cost_model)
11458 {
11459 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11460 int stmt_cost =
11461 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11462
11463 /* Statements in an inner loop relative to the loop being
11464 vectorized are weighted more heavily. The value here is
11465 arbitrary and could potentially be improved with analysis. */
11466 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11467 count *= 50; /* FIXME */
11468
11469 retval = (unsigned) (count * stmt_cost);
11470 cost[where] += retval;
11471 }
11472
11473 return retval;
11474 }
11475
11476 static void initialize_aarch64_code_model (struct gcc_options *);
11477
11478 /* Parse the TO_PARSE string and put the architecture struct that it
11479 selects into RES and the architectural features into ISA_FLAGS.
11480 Return an aarch64_parse_opt_result describing the parse result.
11481 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11482 When the TO_PARSE string contains an invalid extension,
11483 a copy of the string is created and stored to INVALID_EXTENSION. */
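/* For example (illustrative only), given TO_PARSE == "armv8.2-a+crypto",
 LEN covers "armv8.2-a" and EXT points at "+crypto", which is handed off
 to aarch64_parse_extension. */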
11484
11485 static enum aarch64_parse_opt_result
11486 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11487 uint64_t *isa_flags, std::string *invalid_extension)
11488 {
11489 const char *ext;
11490 const struct processor *arch;
11491 size_t len;
11492
11493 ext = strchr (to_parse, '+');
11494
11495 if (ext != NULL)
11496 len = ext - to_parse;
11497 else
11498 len = strlen (to_parse);
11499
11500 if (len == 0)
11501 return AARCH64_PARSE_MISSING_ARG;
11502
11503
11504 /* Loop through the list of supported ARCHes to find a match. */
11505 for (arch = all_architectures; arch->name != NULL; arch++)
11506 {
11507 if (strlen (arch->name) == len
11508 && strncmp (arch->name, to_parse, len) == 0)
11509 {
11510 uint64_t isa_temp = arch->flags;
11511
11512 if (ext != NULL)
11513 {
11514 /* TO_PARSE string contains at least one extension. */
11515 enum aarch64_parse_opt_result ext_res
11516 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11517
11518 if (ext_res != AARCH64_PARSE_OK)
11519 return ext_res;
11520 }
11521 /* Extension parsing was successful. Confirm the result
11522 arch and ISA flags. */
11523 *res = arch;
11524 *isa_flags = isa_temp;
11525 return AARCH64_PARSE_OK;
11526 }
11527 }
11528
11529 /* ARCH name not found in list. */
11530 return AARCH64_PARSE_INVALID_ARG;
11531 }
11532
11533 /* Parse the TO_PARSE string and put the result tuning in RES and the
11534 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11535 describing the parse result. If there is an error parsing, RES and
11536 ISA_FLAGS are left unchanged.
11537 When the TO_PARSE string contains an invalid extension,
11538 a copy of the string is created and stored to INVALID_EXTENSION. */
11539
11540 static enum aarch64_parse_opt_result
11541 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11542 uint64_t *isa_flags, std::string *invalid_extension)
11543 {
11544 const char *ext;
11545 const struct processor *cpu;
11546 size_t len;
11547
11548 ext = strchr (to_parse, '+');
11549
11550 if (ext != NULL)
11551 len = ext - to_parse;
11552 else
11553 len = strlen (to_parse);
11554
11555 if (len == 0)
11556 return AARCH64_PARSE_MISSING_ARG;
11557
11558
11559 /* Loop through the list of supported CPUs to find a match. */
11560 for (cpu = all_cores; cpu->name != NULL; cpu++)
11561 {
11562 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11563 {
11564 uint64_t isa_temp = cpu->flags;
11565
11566
11567 if (ext != NULL)
11568 {
11569 /* TO_PARSE string contains at least one extension. */
11570 enum aarch64_parse_opt_result ext_res
11571 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11572
11573 if (ext_res != AARCH64_PARSE_OK)
11574 return ext_res;
11575 }
11576 /* Extension parsing was successful. Confirm the result
11577 cpu and ISA flags. */
11578 *res = cpu;
11579 *isa_flags = isa_temp;
11580 return AARCH64_PARSE_OK;
11581 }
11582 }
11583
11584 /* CPU name not found in list. */
11585 return AARCH64_PARSE_INVALID_ARG;
11586 }
11587
11588 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11589 Return an aarch64_parse_opt_result describing the parse result.
11590 If the parsing fails the RES does not change. */
11591
11592 static enum aarch64_parse_opt_result
11593 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11594 {
11595 const struct processor *cpu;
11596
11597 /* Loop through the list of supported CPUs to find a match. */
11598 for (cpu = all_cores; cpu->name != NULL; cpu++)
11599 {
11600 if (strcmp (cpu->name, to_parse) == 0)
11601 {
11602 *res = cpu;
11603 return AARCH64_PARSE_OK;
11604 }
11605 }
11606
11607 /* CPU name not found in list. */
11608 return AARCH64_PARSE_INVALID_ARG;
11609 }
11610
11611 /* Parse TOKEN, which has length LENGTH to see if it is an option
11612 described in FLAG. If it is, return the index bit for that fusion type.
11613 If not, error (printing OPTION_NAME) and return zero. */
11614
11615 static unsigned int
11616 aarch64_parse_one_option_token (const char *token,
11617 size_t length,
11618 const struct aarch64_flag_desc *flag,
11619 const char *option_name)
11620 {
11621 for (; flag->name != NULL; flag++)
11622 {
11623 if (length == strlen (flag->name)
11624 && !strncmp (flag->name, token, length))
11625 return flag->flag;
11626 }
11627
11628 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11629 return 0;
11630 }
11631
11632 /* Parse OPTION which is a comma-separated list of flags to enable.
11633 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11634 default state we inherit from the CPU tuning structures. OPTION_NAME
11635 gives the top-level option we are parsing in the -moverride string,
11636 for use in error messages. */
11637
11638 static unsigned int
11639 aarch64_parse_boolean_options (const char *option,
11640 const struct aarch64_flag_desc *flags,
11641 unsigned int initial_state,
11642 const char *option_name)
11643 {
11644 const char separator = '.';
11645 const char* specs = option;
11646 const char* ntoken = option;
11647 unsigned int found_flags = initial_state;
11648
11649 while ((ntoken = strchr (specs, separator)))
11650 {
11651 size_t token_length = ntoken - specs;
11652 unsigned token_ops = aarch64_parse_one_option_token (specs,
11653 token_length,
11654 flags,
11655 option_name);
11656 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11657 in the token stream, reset the supported operations. So:
11658
11659 adrp+add.cmp+branch.none.adrp+add
11660
11661 would have the result of turning on only adrp+add fusion. */
11662 if (!token_ops)
11663 found_flags = 0;
11664
11665 found_flags |= token_ops;
11666 specs = ++ntoken;
11667 }
11668
11669 /* The string ended with a trailing separator; diagnose it. */
11670 if (!(*specs))
11671 {
11672 error ("%s string ill-formed", option_name);
11673 return 0;
11674 }
11675
11676 /* We still have one more token to parse. */
11677 size_t token_length = strlen (specs);
11678 unsigned token_ops = aarch64_parse_one_option_token (specs,
11679 token_length,
11680 flags,
11681 option_name);
11682 if (!token_ops)
11683 found_flags = 0;
11684
11685 found_flags |= token_ops;
11686 return found_flags;
11687 }
11688
11689 /* Support for overriding instruction fusion. */
11690
11691 static void
11692 aarch64_parse_fuse_string (const char *fuse_string,
11693 struct tune_params *tune)
11694 {
11695 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11696 aarch64_fusible_pairs,
11697 tune->fusible_ops,
11698 "fuse=");
11699 }
11700
11701 /* Support for overriding other tuning flags. */
11702
11703 static void
11704 aarch64_parse_tune_string (const char *tune_string,
11705 struct tune_params *tune)
11706 {
11707 tune->extra_tuning_flags
11708 = aarch64_parse_boolean_options (tune_string,
11709 aarch64_tuning_flags,
11710 tune->extra_tuning_flags,
11711 "tune=");
11712 }
11713
11714 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11715 Accept the valid SVE vector widths allowed by
11716 aarch64_sve_vector_bits_enum and use it to override sve_width
11717 in TUNE. */
11718
11719 static void
11720 aarch64_parse_sve_width_string (const char *tune_string,
11721 struct tune_params *tune)
11722 {
11723 int width = -1;
11724
11725 int n = sscanf (tune_string, "%d", &width);
11726 if (n == EOF)
11727 {
11728 error ("invalid format for sve_width");
11729 return;
11730 }
11731 switch (width)
11732 {
11733 case SVE_128:
11734 case SVE_256:
11735 case SVE_512:
11736 case SVE_1024:
11737 case SVE_2048:
11738 break;
11739 default:
11740 error ("invalid sve_width value: %d", width);
11741 }
11742 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11743 }
11744
11745 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11746 we understand. If it is, extract the option string and handoff to
11747 the appropriate function. */
11748
11749 void
11750 aarch64_parse_one_override_token (const char* token,
11751 size_t length,
11752 struct tune_params *tune)
11753 {
11754 const struct aarch64_tuning_override_function *fn
11755 = aarch64_tuning_override_functions;
11756
11757 const char *option_part = strchr (token, '=');
11758 if (!option_part)
11759 {
11760 error ("tuning string missing in option (%s)", token);
11761 return;
11762 }
11763
11764 /* Get the length of the option name. */
11765 length = option_part - token;
11766 /* Skip the '=' to get to the option string. */
11767 option_part++;
11768
11769 for (; fn->name != NULL; fn++)
11770 {
11771 if (!strncmp (fn->name, token, length))
11772 {
11773 fn->parse_override (option_part, tune);
11774 return;
11775 }
11776 }
11777
11778 error ("unknown tuning option (%s)", token);
11779 return;
11780 }
11781
11782 /* Validate and clamp the TLS size based on the code model in OPTS. */
11783
11784 static void
11785 initialize_aarch64_tls_size (struct gcc_options *opts)
11786 {
11787 if (aarch64_tls_size == 0)
11788 aarch64_tls_size = 24;
11789
11790 switch (opts->x_aarch64_cmodel_var)
11791 {
11792 case AARCH64_CMODEL_TINY:
11793 /* Both the default and the maximum TLS size allowed under tiny are 1M,
11794 which needs two instructions to address, so we clamp the size to 24 bits. */
11795 if (aarch64_tls_size > 24)
11796 aarch64_tls_size = 24;
11797 break;
11798 case AARCH64_CMODEL_SMALL:
11799 /* The maximum TLS size allowed under small is 4G. */
11800 if (aarch64_tls_size > 32)
11801 aarch64_tls_size = 32;
11802 break;
11803 case AARCH64_CMODEL_LARGE:
11804 /* The maximum TLS size allowed under large is 16E.
11805 FIXME: 16E would need 64-bit offsets; we only support 48-bit offsets for now. */
11806 if (aarch64_tls_size > 48)
11807 aarch64_tls_size = 48;
11808 break;
11809 default:
11810 gcc_unreachable ();
11811 }
11812
11813 return;
11814 }
11815
11816 /* Parse STRING looking for options in the format:
11817 string :: option:string
11818 option :: name=substring
11819 name :: {a-z}
11820 substring :: defined by option. */
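/* For example (illustrative only), a string such as

 fuse=adrp+add.cmp+branch:sve_width=256

 selects two fusion pairs for the "fuse=" option and then overrides
 sve_width, with ':' separating options and '.' separating the boolean
 flags within an option. */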
11821
11822 static void
11823 aarch64_parse_override_string (const char* input_string,
11824 struct tune_params* tune)
11825 {
11826 const char separator = ':';
11827 size_t string_length = strlen (input_string) + 1;
11828 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11829 char *string = string_root;
11830 strncpy (string, input_string, string_length);
11831 string[string_length - 1] = '\0';
11832
11833 char* ntoken = string;
11834
11835 while ((ntoken = strchr (string, separator)))
11836 {
11837 size_t token_length = ntoken - string;
11838 /* Make this substring look like a string. */
11839 *ntoken = '\0';
11840 aarch64_parse_one_override_token (string, token_length, tune);
11841 string = ++ntoken;
11842 }
11843
11844 /* One last option to parse. */
11845 aarch64_parse_one_override_token (string, strlen (string), tune);
11846 free (string_root);
11847 }
11848
11849
11850 static void
11851 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11852 {
11853 if (accepted_branch_protection_string)
11854 {
11855 opts->x_aarch64_branch_protection_string
11856 = xstrdup (accepted_branch_protection_string);
11857 }
11858
11859 /* PR 70044: We have to be careful about being called multiple times for the
11860 same function. This means all changes should be repeatable. */
11861
11862 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11863 Disable the frame pointer flag so the mid-end will not use a frame
11864 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11865 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11866 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
11867 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11868 if (opts->x_flag_omit_frame_pointer == 0)
11869 opts->x_flag_omit_frame_pointer = 2;
11870
11871 /* If not optimizing for size, set the default
11872 alignment to what the target wants. */
11873 if (!opts->x_optimize_size)
11874 {
11875 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11876 opts->x_str_align_loops = aarch64_tune_params.loop_align;
11877 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11878 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11879 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11880 opts->x_str_align_functions = aarch64_tune_params.function_align;
11881 }
11882
11883 /* We default to no pc-relative literal loads. */
11884
11885 aarch64_pcrelative_literal_loads = false;
11886
11887 /* If -mpc-relative-literal-loads is set on the command line, this
11888 implies that the user asked for PC relative literal loads. */
11889 if (opts->x_pcrelative_literal_loads == 1)
11890 aarch64_pcrelative_literal_loads = true;
11891
11892 /* In the tiny memory model it makes no sense to disallow PC relative
11893 literal pool loads. */
11894 if (aarch64_cmodel == AARCH64_CMODEL_TINY
11895 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11896 aarch64_pcrelative_literal_loads = true;
11897
11898 /* When enabling the lower precision Newton series for the square root, also
11899 enable it for the reciprocal square root, since the latter is an
11900 intermediary step for the former. */
11901 if (flag_mlow_precision_sqrt)
11902 flag_mrecip_low_precision_sqrt = true;
11903 }
11904
11905 /* 'Unpack' the internal tuning structs and update the options
11906 in OPTS. The caller must have set up selected_tune and selected_arch,
11907 as all the other target-specific codegen decisions are
11908 derived from them. */
11909
11910 void
11911 aarch64_override_options_internal (struct gcc_options *opts)
11912 {
11913 aarch64_tune_flags = selected_tune->flags;
11914 aarch64_tune = selected_tune->sched_core;
11915 /* Make a copy of the tuning parameters attached to the core, which
11916 we may later overwrite. */
11917 aarch64_tune_params = *(selected_tune->tune);
11918 aarch64_architecture_version = selected_arch->architecture_version;
11919
11920 if (opts->x_aarch64_override_tune_string)
11921 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11922 &aarch64_tune_params);
11923
11924 /* This target defaults to strict volatile bitfields. */
11925 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11926 opts->x_flag_strict_volatile_bitfields = 1;
11927
11928 if (aarch64_stack_protector_guard == SSP_GLOBAL
11929 && opts->x_aarch64_stack_protector_guard_offset_str)
11930 {
11931 error ("incompatible options %<-mstack-protector-guard=global%> and "
11932 "%<-mstack-protector-guard-offset=%s%>",
11933 aarch64_stack_protector_guard_offset_str);
11934 }
11935
11936 if (aarch64_stack_protector_guard == SSP_SYSREG
11937 && !(opts->x_aarch64_stack_protector_guard_offset_str
11938 && opts->x_aarch64_stack_protector_guard_reg_str))
11939 {
11940 error ("both %<-mstack-protector-guard-offset%> and "
11941 "%<-mstack-protector-guard-reg%> must be used "
11942 "with %<-mstack-protector-guard=sysreg%>");
11943 }
11944
11945 if (opts->x_aarch64_stack_protector_guard_reg_str)
11946 {
11947 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11948 error ("specify a system register with a small string length");
11949 }
11950
11951 if (opts->x_aarch64_stack_protector_guard_offset_str)
11952 {
11953 char *end;
11954 const char *str = aarch64_stack_protector_guard_offset_str;
11955 errno = 0;
11956 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11957 if (!*str || *end || errno)
11958 error ("%qs is not a valid offset in %qs", str,
11959 "-mstack-protector-guard-offset=");
11960 aarch64_stack_protector_guard_offset = offs;
11961 }
11962
11963 initialize_aarch64_code_model (opts);
11964 initialize_aarch64_tls_size (opts);
11965
11966 int queue_depth = 0;
11967 switch (aarch64_tune_params.autoprefetcher_model)
11968 {
11969 case tune_params::AUTOPREFETCHER_OFF:
11970 queue_depth = -1;
11971 break;
11972 case tune_params::AUTOPREFETCHER_WEAK:
11973 queue_depth = 0;
11974 break;
11975 case tune_params::AUTOPREFETCHER_STRONG:
11976 queue_depth = max_insn_queue_index + 1;
11977 break;
11978 default:
11979 gcc_unreachable ();
11980 }
11981
11982 /* We don't mind passing in global_options_set here as we don't use
11983 the *options_set structs anyway. */
11984 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11985 queue_depth,
11986 opts->x_param_values,
11987 global_options_set.x_param_values);
11988
11989 /* Set up parameters to be used in prefetching algorithm. Do not
11990 override the defaults unless we are tuning for a core we have
11991 researched values for. */
11992 if (aarch64_tune_params.prefetch->num_slots > 0)
11993 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11994 aarch64_tune_params.prefetch->num_slots,
11995 opts->x_param_values,
11996 global_options_set.x_param_values);
11997 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11998 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11999 aarch64_tune_params.prefetch->l1_cache_size,
12000 opts->x_param_values,
12001 global_options_set.x_param_values);
12002 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12003 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12004 aarch64_tune_params.prefetch->l1_cache_line_size,
12005 opts->x_param_values,
12006 global_options_set.x_param_values);
12007 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12008 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12009 aarch64_tune_params.prefetch->l2_cache_size,
12010 opts->x_param_values,
12011 global_options_set.x_param_values);
12012 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12013 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12014 0,
12015 opts->x_param_values,
12016 global_options_set.x_param_values);
12017 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12018 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12019 aarch64_tune_params.prefetch->minimum_stride,
12020 opts->x_param_values,
12021 global_options_set.x_param_values);
12022
12023 /* Use the alternative scheduling-pressure algorithm by default. */
12024 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12025 opts->x_param_values,
12026 global_options_set.x_param_values);
12027
12028 /* If the user hasn't changed it via configure then set the default to 64 KB
12029 for the backend (the value is a power-of-two exponent, so 16 means 2^16 bytes). */
12030 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12031 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12032 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12033 opts->x_param_values,
12034 global_options_set.x_param_values);
12035
12036 /* Validate the guard size. */
12037 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12038
12039 /* Enforce that interval is the same size as size so the mid-end does the
12040 right thing. */
12041 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12042 guard_size,
12043 opts->x_param_values,
12044 global_options_set.x_param_values);
12045
12046 /* The maybe_set calls won't update the value if the user has explicitly set
12047 one. Which means we need to validate that probing interval and guard size
12048 are equal. */
12049 int probe_interval
12050 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12051 if (guard_size != probe_interval)
12052 error ("stack clash guard size %<%d%> must be equal to probing interval "
12053 "%<%d%>", guard_size, probe_interval);
12054
12055 /* Enable sw prefetching at specified optimization level for
12056 CPUS that have prefetch. Lower optimization level threshold by 1
12057 when profiling is enabled. */
12058 if (opts->x_flag_prefetch_loop_arrays < 0
12059 && !opts->x_optimize_size
12060 && aarch64_tune_params.prefetch->default_opt_level >= 0
12061 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12062 opts->x_flag_prefetch_loop_arrays = 1;
12063
12064 if (opts->x_aarch64_arch_string == NULL)
12065 opts->x_aarch64_arch_string = selected_arch->name;
12066 if (opts->x_aarch64_cpu_string == NULL)
12067 opts->x_aarch64_cpu_string = selected_cpu->name;
12068 if (opts->x_aarch64_tune_string == NULL)
12069 opts->x_aarch64_tune_string = selected_tune->name;
12070
12071 aarch64_override_options_after_change_1 (opts);
12072 }
12073
12074 /* Print a hint with a suggestion for a core or architecture name that
12075 most closely resembles what the user passed in STR. ARCH is true if
12076 the user is asking for an architecture name. ARCH is false if the user
12077 is asking for a core name. */
12078
12079 static void
12080 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12081 {
12082 auto_vec<const char *> candidates;
12083 const struct processor *entry = arch ? all_architectures : all_cores;
12084 for (; entry->name != NULL; entry++)
12085 candidates.safe_push (entry->name);
12086
12087 #ifdef HAVE_LOCAL_CPU_DETECT
12088 /* Add also "native" as possible value. */
12089 if (arch)
12090 candidates.safe_push ("native");
12091 #endif
12092
12093 char *s;
12094 const char *hint = candidates_list_and_hint (str, s, candidates);
12095 if (hint)
12096 inform (input_location, "valid arguments are: %s;"
12097 " did you mean %qs?", s, hint);
12098 else
12099 inform (input_location, "valid arguments are: %s", s);
12100
12101 XDELETEVEC (s);
12102 }
12103
12104 /* Print a hint with a suggestion for a core name that most closely resembles
12105 what the user passed in STR. */
12106
12107 inline static void
12108 aarch64_print_hint_for_core (const char *str)
12109 {
12110 aarch64_print_hint_for_core_or_arch (str, false);
12111 }
12112
12113 /* Print a hint with a suggestion for an architecture name that most closely
12114 resembles what the user passed in STR. */
12115
12116 inline static void
12117 aarch64_print_hint_for_arch (const char *str)
12118 {
12119 aarch64_print_hint_for_core_or_arch (str, true);
12120 }
12121
12122
12123 /* Print a hint with a suggestion for an extension name
12124 that most closely resembles what the user passed in STR. */
12125
12126 void
12127 aarch64_print_hint_for_extensions (const std::string &str)
12128 {
12129 auto_vec<const char *> candidates;
12130 aarch64_get_all_extension_candidates (&candidates);
12131 char *s;
12132 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12133 if (hint)
12134 inform (input_location, "valid arguments are: %s;"
12135 " did you mean %qs?", s, hint);
12136 else
12137 inform (input_location, "valid arguments are: %s", s);
12138
12139 XDELETEVEC (s);
12140 }
12141
12142 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12143 specified in STR and throw errors if appropriate. Put the results if
12144 they are valid in RES and ISA_FLAGS. Return whether the option is
12145 valid. */
12146
12147 static bool
12148 aarch64_validate_mcpu (const char *str, const struct processor **res,
12149 uint64_t *isa_flags)
12150 {
12151 std::string invalid_extension;
12152 enum aarch64_parse_opt_result parse_res
12153 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12154
12155 if (parse_res == AARCH64_PARSE_OK)
12156 return true;
12157
12158 switch (parse_res)
12159 {
12160 case AARCH64_PARSE_MISSING_ARG:
12161 error ("missing cpu name in %<-mcpu=%s%>", str);
12162 break;
12163 case AARCH64_PARSE_INVALID_ARG:
12164 error ("unknown value %qs for %<-mcpu%>", str);
12165 aarch64_print_hint_for_core (str);
12166 break;
12167 case AARCH64_PARSE_INVALID_FEATURE:
12168 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12169 invalid_extension.c_str (), str);
12170 aarch64_print_hint_for_extensions (invalid_extension);
12171 break;
12172 default:
12173 gcc_unreachable ();
12174 }
12175
12176 return false;
12177 }
12178
12179 /* Parses CONST_STR for branch protection features specified in
12180 aarch64_branch_protect_types, and set any global variables required. Returns
12181 the parsing result and assigns LAST_STR to the last processed token from
12182 CONST_STR so that it can be used for error reporting. */
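/* For example (assuming the usual -mbranch-protection spellings), a string
 such as "pac-ret+leaf+bti" is split on '+' into the type "pac-ret", its
 subtype "leaf", and the further type "bti", each dispatched to its handler
 in aarch64_branch_protect_types. */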
12183
12184 static enum aarch64_parse_opt_result
12185 aarch64_parse_branch_protection (const char *const_str,
12186 char **last_str)
12187 {
12188 char *str_root = xstrdup (const_str);
12189 char* token_save = NULL;
12190 char *str = strtok_r (str_root, "+", &token_save);
12191 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12192 if (!str)
12193 res = AARCH64_PARSE_MISSING_ARG;
12194 else
12195 {
12196 char *next_str = strtok_r (NULL, "+", &token_save);
12197 /* Reset the branch protection features to their defaults. */
12198 aarch64_handle_no_branch_protection (NULL, NULL);
12199
12200 while (str && res == AARCH64_PARSE_OK)
12201 {
12202 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12203 bool found = false;
12204 /* Search for this type. */
12205 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12206 {
12207 if (strcmp (str, type->name) == 0)
12208 {
12209 found = true;
12210 res = type->handler (str, next_str);
12211 str = next_str;
12212 next_str = strtok_r (NULL, "+", &token_save);
12213 }
12214 else
12215 type++;
12216 }
12217 if (found && res == AARCH64_PARSE_OK)
12218 {
12219 bool found_subtype = true;
12220 /* Loop through each token until we find one that isn't a
12221 subtype. */
12222 while (found_subtype)
12223 {
12224 found_subtype = false;
12225 const aarch64_branch_protect_type *subtype = type->subtypes;
12226 /* Search for the subtype. */
12227 while (str && subtype && subtype->name && !found_subtype
12228 && res == AARCH64_PARSE_OK)
12229 {
12230 if (strcmp (str, subtype->name) == 0)
12231 {
12232 found_subtype = true;
12233 res = subtype->handler (str, next_str);
12234 str = next_str;
12235 next_str = strtok_r (NULL, "+", &token_save);
12236 }
12237 else
12238 subtype++;
12239 }
12240 }
12241 }
12242 else if (!found)
12243 res = AARCH64_PARSE_INVALID_ARG;
12244 }
12245 }
12246 /* Copy the last processed token into the argument to pass it back.
12247 Used by option and attribute validation to print the offending token. */
12248 if (last_str)
12249 {
12250 if (str) strcpy (*last_str, str);
12251 else *last_str = NULL;
12252 }
12253 if (res == AARCH64_PARSE_OK)
12254 {
12255 /* If needed, alloc the accepted string then copy in const_str.
12256 Used by override_option_after_change_1. */
12257 if (!accepted_branch_protection_string)
12258 accepted_branch_protection_string = (char *) xmalloc (
12259 BRANCH_PROTECT_STR_MAX
12260 + 1);
12261 strncpy (accepted_branch_protection_string, const_str,
12262 BRANCH_PROTECT_STR_MAX + 1);
12263 /* Forcibly null-terminate. */
12264 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12265 }
12266 return res;
12267 }
12268
12269 static bool
12270 aarch64_validate_mbranch_protection (const char *const_str)
12271 {
12272 char *str = (char *) xmalloc (strlen (const_str) + 1);
12273 enum aarch64_parse_opt_result res =
12274 aarch64_parse_branch_protection (const_str, &str);
12275 if (res == AARCH64_PARSE_INVALID_ARG)
12276 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12277 else if (res == AARCH64_PARSE_MISSING_ARG)
12278 error ("missing argument for %<-mbranch-protection=%>");
12279 free (str);
12280 return res == AARCH64_PARSE_OK;
12281 }
12282
12283 /* Validate a command-line -march option. Parse the arch and extensions
12284 (if any) specified in STR and throw errors if appropriate. Put the
12285 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12286 option is valid. */
12287
12288 static bool
12289 aarch64_validate_march (const char *str, const struct processor **res,
12290 uint64_t *isa_flags)
12291 {
12292 std::string invalid_extension;
12293 enum aarch64_parse_opt_result parse_res
12294 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12295
12296 if (parse_res == AARCH64_PARSE_OK)
12297 return true;
12298
12299 switch (parse_res)
12300 {
12301 case AARCH64_PARSE_MISSING_ARG:
12302 error ("missing arch name in %<-march=%s%>", str);
12303 break;
12304 case AARCH64_PARSE_INVALID_ARG:
12305 error ("unknown value %qs for %<-march%>", str);
12306 aarch64_print_hint_for_arch (str);
12307 break;
12308 case AARCH64_PARSE_INVALID_FEATURE:
12309 error ("invalid feature modifier %qs in %<-march=%s%>",
12310 invalid_extension.c_str (), str);
12311 aarch64_print_hint_for_extensions (invalid_extension);
12312 break;
12313 default:
12314 gcc_unreachable ();
12315 }
12316
12317 return false;
12318 }
12319
12320 /* Validate a command-line -mtune option. Parse the cpu
12321 specified in STR and throw errors if appropriate. Put the
12322 result, if it is valid, in RES. Return whether the option is
12323 valid. */
12324
12325 static bool
12326 aarch64_validate_mtune (const char *str, const struct processor **res)
12327 {
12328 enum aarch64_parse_opt_result parse_res
12329 = aarch64_parse_tune (str, res);
12330
12331 if (parse_res == AARCH64_PARSE_OK)
12332 return true;
12333
12334 switch (parse_res)
12335 {
12336 case AARCH64_PARSE_MISSING_ARG:
12337 error ("missing cpu name in %<-mtune=%s%>", str);
12338 break;
12339 case AARCH64_PARSE_INVALID_ARG:
12340 error ("unknown value %qs for %<-mtune%>", str);
12341 aarch64_print_hint_for_core (str);
12342 break;
12343 default:
12344 gcc_unreachable ();
12345 }
12346 return false;
12347 }
12348
12349 /* Return the CPU corresponding to the enum CPU.
12350 If it doesn't specify a cpu, return the default. */
12351
12352 static const struct processor *
12353 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12354 {
12355 if (cpu != aarch64_none)
12356 return &all_cores[cpu];
12357
12358 /* The & 0x3f is to extract the bottom 6 bits that encode the
12359 default cpu as selected by the --with-cpu GCC configure option
12360 in config.gcc.
12361 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12362 flags mechanism should be reworked to make it more sane. */
12363 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12364 }
12365
12366 /* Return the architecture corresponding to the enum ARCH.
12367 If it doesn't specify a valid architecture, return the default. */
12368
12369 static const struct processor *
12370 aarch64_get_arch (enum aarch64_arch arch)
12371 {
12372 if (arch != aarch64_no_arch)
12373 return &all_architectures[arch];
12374
12375 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12376
12377 return &all_architectures[cpu->arch];
12378 }
12379
12380 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
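/* For example, -msve-vector-bits=256 yields 256 / 64 = 4 granules, while
 both SVE_SCALABLE and (for now) SVE_128 yield the poly_uint16 (2, 2)
 used for vector-length-agnostic code. */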
12381
12382 static poly_uint16
12383 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12384 {
12385 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12386 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12387 deciding which .md file patterns to use and when deciding whether
12388 something is a legitimate address or constant. */
12389 if (value == SVE_SCALABLE || value == SVE_128)
12390 return poly_uint16 (2, 2);
12391 else
12392 return (int) value / 64;
12393 }
12394
12395 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12396 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12397 tuning structs. In particular it must set selected_tune and
12398 aarch64_isa_flags that define the available ISA features and tuning
12399 decisions. It must also set selected_arch as this will be used to
12400 output the .arch asm tags for each function. */
12401
12402 static void
12403 aarch64_override_options (void)
12404 {
12405 uint64_t cpu_isa = 0;
12406 uint64_t arch_isa = 0;
12407 aarch64_isa_flags = 0;
12408
12409 bool valid_cpu = true;
12410 bool valid_tune = true;
12411 bool valid_arch = true;
12412
12413 selected_cpu = NULL;
12414 selected_arch = NULL;
12415 selected_tune = NULL;
12416
12417 if (aarch64_branch_protection_string)
12418 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12419
12420 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12421 If either of -march or -mtune is given, they override their
12422 respective component of -mcpu. */
12423 if (aarch64_cpu_string)
12424 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12425 &cpu_isa);
12426
12427 if (aarch64_arch_string)
12428 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12429 &arch_isa);
12430
12431 if (aarch64_tune_string)
12432 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12433
12434 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12435 SUBTARGET_OVERRIDE_OPTIONS;
12436 #endif
12437
12438 /* If the user did not specify a processor, choose the default
12439 one for them. This will be the CPU set during configuration using
12440 --with-cpu, otherwise it is "generic". */
12441 if (!selected_cpu)
12442 {
12443 if (selected_arch)
12444 {
12445 selected_cpu = &all_cores[selected_arch->ident];
12446 aarch64_isa_flags = arch_isa;
12447 explicit_arch = selected_arch->arch;
12448 }
12449 else
12450 {
12451 /* Get default configure-time CPU. */
12452 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12453 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12454 }
12455
12456 if (selected_tune)
12457 explicit_tune_core = selected_tune->ident;
12458 }
12459 /* If both -mcpu and -march are specified check that they are architecturally
12460 compatible, warn if they're not and prefer the -march ISA flags. */
12461 else if (selected_arch)
12462 {
12463 if (selected_arch->arch != selected_cpu->arch)
12464 {
12465 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12466 all_architectures[selected_cpu->arch].name,
12467 selected_arch->name);
12468 }
12469 aarch64_isa_flags = arch_isa;
12470 explicit_arch = selected_arch->arch;
12471 explicit_tune_core = selected_tune ? selected_tune->ident
12472 : selected_cpu->ident;
12473 }
12474 else
12475 {
12476 /* -mcpu but no -march. */
12477 aarch64_isa_flags = cpu_isa;
12478 explicit_tune_core = selected_tune ? selected_tune->ident
12479 : selected_cpu->ident;
12480 gcc_assert (selected_cpu);
12481 selected_arch = &all_architectures[selected_cpu->arch];
12482 explicit_arch = selected_arch->arch;
12483 }
12484
12485 /* Set the arch as well, as we will need it when outputting
12486 the .arch directive in assembly. */
12487 if (!selected_arch)
12488 {
12489 gcc_assert (selected_cpu);
12490 selected_arch = &all_architectures[selected_cpu->arch];
12491 }
12492
12493 if (!selected_tune)
12494 selected_tune = selected_cpu;
12495
12496 if (aarch64_enable_bti == 2)
12497 {
12498 #ifdef TARGET_ENABLE_BTI
12499 aarch64_enable_bti = 1;
12500 #else
12501 aarch64_enable_bti = 0;
12502 #endif
12503 }
12504
12505 /* Return address signing is currently not supported for ILP32 targets. For
12506 LP64 targets use the configured option in the absence of a command-line
12507 option for -mbranch-protection. */
12508 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12509 {
12510 #ifdef TARGET_ENABLE_PAC_RET
12511 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12512 #else
12513 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12514 #endif
12515 }
12516
12517 #ifndef HAVE_AS_MABI_OPTION
12518 /* The compiler may have been configured with 2.23.* binutils, which does
12519 not have support for ILP32. */
12520 if (TARGET_ILP32)
12521 error ("assembler does not support %<-mabi=ilp32%>");
12522 #endif
12523
12524 /* Convert -msve-vector-bits to a VG count. */
12525 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12526
12527 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12528 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12529
12530 /* Make sure we properly set up the explicit options. */
12531 if ((aarch64_cpu_string && valid_cpu)
12532 || (aarch64_tune_string && valid_tune))
12533 gcc_assert (explicit_tune_core != aarch64_none);
12534
12535 if ((aarch64_cpu_string && valid_cpu)
12536 || (aarch64_arch_string && valid_arch))
12537 gcc_assert (explicit_arch != aarch64_no_arch);
12538
12539 /* The pass to insert speculation tracking runs before
12540 shrink-wrapping and the latter does not know how to update the
12541 tracking status. So disable it in this case. */
12542 if (aarch64_track_speculation)
12543 flag_shrink_wrap = 0;
12544
12545 aarch64_override_options_internal (&global_options);
12546
12547 /* Save these options as the default ones in case we push and pop them later
12548 while processing functions with potential target attributes. */
12549 target_option_default_node = target_option_current_node
12550 = build_target_option_node (&global_options);
12551 }
12552
12553 /* Implement targetm.override_options_after_change. */
12554
12555 static void
12556 aarch64_override_options_after_change (void)
12557 {
12558 aarch64_override_options_after_change_1 (&global_options);
12559 }
12560
12561 static struct machine_function *
12562 aarch64_init_machine_status (void)
12563 {
12564 struct machine_function *machine;
12565 machine = ggc_cleared_alloc<machine_function> ();
12566 return machine;
12567 }
12568
12569 void
12570 aarch64_init_expanders (void)
12571 {
12572 init_machine_status = aarch64_init_machine_status;
12573 }
12574
12575 /* A checking mechanism for the implementation of the various code models. */
12576 static void
12577 initialize_aarch64_code_model (struct gcc_options *opts)
12578 {
12579 if (opts->x_flag_pic)
12580 {
12581 switch (opts->x_aarch64_cmodel_var)
12582 {
12583 case AARCH64_CMODEL_TINY:
12584 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12585 break;
12586 case AARCH64_CMODEL_SMALL:
12587 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12588 aarch64_cmodel = (flag_pic == 2
12589 ? AARCH64_CMODEL_SMALL_PIC
12590 : AARCH64_CMODEL_SMALL_SPIC);
12591 #else
12592 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12593 #endif
12594 break;
12595 case AARCH64_CMODEL_LARGE:
12596 sorry ("code model %qs with %<-f%s%>", "large",
12597 opts->x_flag_pic > 1 ? "PIC" : "pic");
12598 break;
12599 default:
12600 gcc_unreachable ();
12601 }
12602 }
12603 else
12604 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12605 }
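/* A worked example of the mapping above (illustrative only): with
   -mcmodel=small, -fpic (flag_pic == 1) selects AARCH64_CMODEL_SMALL_SPIC
   and -fPIC (flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC when the
   assembler supports the small PIC relocations (HAVE_AS_SMALL_PIC_RELOCS);
   otherwise both fall back to AARCH64_CMODEL_SMALL_PIC.  -mcmodel=large
   combined with any PIC flag is rejected with a "sorry".  */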
12606
12607 /* Implement TARGET_OPTION_SAVE. */
12608
12609 static void
12610 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12611 {
12612 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12613 ptr->x_aarch64_branch_protection_string
12614 = opts->x_aarch64_branch_protection_string;
12615 }
12616
12617 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12618 using the information saved in PTR. */
12619
12620 static void
12621 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12622 {
12623 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12624 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12625 opts->x_explicit_arch = ptr->x_explicit_arch;
12626 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12627 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12628 opts->x_aarch64_branch_protection_string
12629 = ptr->x_aarch64_branch_protection_string;
12630 if (opts->x_aarch64_branch_protection_string)
12631 {
12632 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12633 NULL);
12634 }
12635
12636 aarch64_override_options_internal (opts);
12637 }
12638
12639 /* Implement TARGET_OPTION_PRINT. */
12640
12641 static void
12642 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12643 {
12644 const struct processor *cpu
12645 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12646 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12647 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12648 std::string extension
12649 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12650
12651 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12652 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12653 arch->name, extension.c_str ());
12654 }
12655
12656 static GTY(()) tree aarch64_previous_fndecl;
12657
12658 void
12659 aarch64_reset_previous_fndecl (void)
12660 {
12661 aarch64_previous_fndecl = NULL;
12662 }
12663
12664 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12665 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12666 make sure optab availability predicates are recomputed when necessary. */
12667
12668 void
12669 aarch64_save_restore_target_globals (tree new_tree)
12670 {
12671 if (TREE_TARGET_GLOBALS (new_tree))
12672 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12673 else if (new_tree == target_option_default_node)
12674 restore_target_globals (&default_target_globals);
12675 else
12676 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12677 }
12678
12679 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12680 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12681 of the function, if such exists. This function may be called multiple
12682 times on a single function so use aarch64_previous_fndecl to avoid
12683 setting up identical state. */
12684
12685 static void
12686 aarch64_set_current_function (tree fndecl)
12687 {
12688 if (!fndecl || fndecl == aarch64_previous_fndecl)
12689 return;
12690
12691 tree old_tree = (aarch64_previous_fndecl
12692 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12693 : NULL_TREE);
12694
12695 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12696
12697 /* If current function has no attributes but the previous one did,
12698 use the default node. */
12699 if (!new_tree && old_tree)
12700 new_tree = target_option_default_node;
12701
12702 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12703 the default have been handled by aarch64_save_restore_target_globals from
12704 aarch64_pragma_target_parse. */
12705 if (old_tree == new_tree)
12706 return;
12707
12708 aarch64_previous_fndecl = fndecl;
12709
12710 /* First set the target options. */
12711 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12712
12713 aarch64_save_restore_target_globals (new_tree);
12714 }
12715
12716 /* Enum describing the various ways we can handle attributes.
12717 In many cases we can reuse the generic option handling machinery. */
12718
12719 enum aarch64_attr_opt_type
12720 {
12721 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12722 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12723 aarch64_attr_enum, /* Attribute sets an enum variable. */
12724 aarch64_attr_custom /* Attribute requires a custom handling function. */
12725 };
12726
12727 /* All the information needed to handle a target attribute.
12728 NAME is the name of the attribute.
12729 ATTR_TYPE specifies the type of behavior of the attribute as described
12730 in the definition of enum aarch64_attr_opt_type.
12731 ALLOW_NEG is true if the attribute supports a "no-" form.
12732 HANDLER is the function that takes the attribute string as an argument.
12733 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12734 OPT_NUM is the enum specifying the option that the attribute modifies.
12735 This is needed for attributes that mirror the behavior of a command-line
12736 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
12737 or aarch64_attr_enum. */
12738
12739 struct aarch64_attribute_info
12740 {
12741 const char *name;
12742 enum aarch64_attr_opt_type attr_type;
12743 bool allow_neg;
12744 bool (*handler) (const char *);
12745 enum opt_code opt_num;
12746 };
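/* As an illustration (added commentary, not part of the original source),
   the table entry

     { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align }

   further below lets users write

     __attribute__ ((target ("strict-align"))) void f (void);
     __attribute__ ((target ("no-strict-align"))) void g (void);

   and routes both spellings through the generic option machinery for
   OPT_mstrict_align, mirroring -mstrict-align on the command line.  */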
12747
12748 /* Handle the ARCH_STR argument to the arch= target attribute. */
12749
12750 static bool
12751 aarch64_handle_attr_arch (const char *str)
12752 {
12753 const struct processor *tmp_arch = NULL;
12754 std::string invalid_extension;
12755 enum aarch64_parse_opt_result parse_res
12756 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12757
12758 if (parse_res == AARCH64_PARSE_OK)
12759 {
12760 gcc_assert (tmp_arch);
12761 selected_arch = tmp_arch;
12762 explicit_arch = selected_arch->arch;
12763 return true;
12764 }
12765
12766 switch (parse_res)
12767 {
12768 case AARCH64_PARSE_MISSING_ARG:
12769 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12770 break;
12771 case AARCH64_PARSE_INVALID_ARG:
12772 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12773 aarch64_print_hint_for_arch (str);
12774 break;
12775 case AARCH64_PARSE_INVALID_FEATURE:
12776 error ("invalid feature modifier %s of value (\"%s\") in "
12777 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12778 aarch64_print_hint_for_extensions (invalid_extension);
12779 break;
12780 default:
12781 gcc_unreachable ();
12782 }
12783
12784 return false;
12785 }
12786
12787 /* Handle the argument CPU_STR to the cpu= target attribute. */
12788
12789 static bool
12790 aarch64_handle_attr_cpu (const char *str)
12791 {
12792 const struct processor *tmp_cpu = NULL;
12793 std::string invalid_extension;
12794 enum aarch64_parse_opt_result parse_res
12795 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12796
12797 if (parse_res == AARCH64_PARSE_OK)
12798 {
12799 gcc_assert (tmp_cpu);
12800 selected_tune = tmp_cpu;
12801 explicit_tune_core = selected_tune->ident;
12802
12803 selected_arch = &all_architectures[tmp_cpu->arch];
12804 explicit_arch = selected_arch->arch;
12805 return true;
12806 }
12807
12808 switch (parse_res)
12809 {
12810 case AARCH64_PARSE_MISSING_ARG:
12811 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12812 break;
12813 case AARCH64_PARSE_INVALID_ARG:
12814 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12815 aarch64_print_hint_for_core (str);
12816 break;
12817 case AARCH64_PARSE_INVALID_FEATURE:
12818 error ("invalid feature modifier %s of value (\"%s\") in "
12819 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12820 aarch64_print_hint_for_extensions (invalid_extension);
12821 break;
12822 default:
12823 gcc_unreachable ();
12824 }
12825
12826 return false;
12827 }
12828
12829 /* Handle the argument STR to the branch-protection= attribute. */
12830
12831 static bool
12832 aarch64_handle_attr_branch_protection (const char* str)
12833 {
12834 char *err_str = (char *) xmalloc (strlen (str) + 1);
12835 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12836 &err_str);
12837 bool success = false;
12838 switch (res)
12839 {
12840 case AARCH64_PARSE_MISSING_ARG:
12841 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12842 " attribute");
12843 break;
12844 case AARCH64_PARSE_INVALID_ARG:
12845 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12846 "=\")%> pragma or attribute", err_str);
12847 break;
12848 case AARCH64_PARSE_OK:
12849 success = true;
12850 /* Fall through. */
12851 case AARCH64_PARSE_INVALID_FEATURE:
12852 break;
12853 default:
12854 gcc_unreachable ();
12855 }
12856 free (err_str);
12857 return success;
12858 }
12859
12860 /* Handle the argument STR to the tune= target attribute. */
12861
12862 static bool
12863 aarch64_handle_attr_tune (const char *str)
12864 {
12865 const struct processor *tmp_tune = NULL;
12866 enum aarch64_parse_opt_result parse_res
12867 = aarch64_parse_tune (str, &tmp_tune);
12868
12869 if (parse_res == AARCH64_PARSE_OK)
12870 {
12871 gcc_assert (tmp_tune);
12872 selected_tune = tmp_tune;
12873 explicit_tune_core = selected_tune->ident;
12874 return true;
12875 }
12876
12877 switch (parse_res)
12878 {
12879 case AARCH64_PARSE_INVALID_ARG:
12880 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12881 aarch64_print_hint_for_core (str);
12882 break;
12883 default:
12884 gcc_unreachable ();
12885 }
12886
12887 return false;
12888 }
12889
12890 /* Parse an architecture extensions target attribute string specified in STR.
12891 For example "+fp+nosimd". Show any errors if needed. Return TRUE
12892 if successful. Update aarch64_isa_flags to reflect the ISA features
12893 modified. */
12894
12895 static bool
12896 aarch64_handle_attr_isa_flags (char *str)
12897 {
12898 enum aarch64_parse_opt_result parse_res;
12899 uint64_t isa_flags = aarch64_isa_flags;
12900
12901 /* We allow "+nothing" in the beginning to clear out all architectural
12902 features if the user wants to handpick specific features. */
12903 if (strncmp ("+nothing", str, 8) == 0)
12904 {
12905 isa_flags = 0;
12906 str += 8;
12907 }
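/* For example (illustrative only), a declaration such as

     __attribute__ ((target ("+nothing+fp"))) void f (void);

   starts from an empty feature set and then enables just the "fp"
   extension (plus whatever it implies), instead of adding to the
   features already implied by -march/-mcpu.  */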
12908
12909 std::string invalid_extension;
12910 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12911
12912 if (parse_res == AARCH64_PARSE_OK)
12913 {
12914 aarch64_isa_flags = isa_flags;
12915 return true;
12916 }
12917
12918 switch (parse_res)
12919 {
12920 case AARCH64_PARSE_MISSING_ARG:
12921 error ("missing value in %<target()%> pragma or attribute");
12922 break;
12923
12924 case AARCH64_PARSE_INVALID_FEATURE:
12925 error ("invalid feature modifier %s of value (\"%s\") in "
12926 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12927 break;
12928
12929 default:
12930 gcc_unreachable ();
12931 }
12932
12933 return false;
12934 }
12935
12936 /* The target attributes that we support. On top of these we also support just
12937 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
12938 handled explicitly in aarch64_process_one_target_attr. */
12939
12940 static const struct aarch64_attribute_info aarch64_attributes[] =
12941 {
12942 { "general-regs-only", aarch64_attr_mask, false, NULL,
12943 OPT_mgeneral_regs_only },
12944 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12945 OPT_mfix_cortex_a53_835769 },
12946 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12947 OPT_mfix_cortex_a53_843419 },
12948 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12949 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12950 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12951 OPT_momit_leaf_frame_pointer },
12952 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12953 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12954 OPT_march_ },
12955 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12956 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12957 OPT_mtune_ },
12958 { "branch-protection", aarch64_attr_custom, false,
12959 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12960 { "sign-return-address", aarch64_attr_enum, false, NULL,
12961 OPT_msign_return_address_ },
12962 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12963 };
12964
12965 /* Parse ARG_STR, which contains the definition of one target attribute.
12966 Show appropriate errors if any, and return true if the attribute is valid. */
12967
12968 static bool
12969 aarch64_process_one_target_attr (char *arg_str)
12970 {
12971 bool invert = false;
12972
12973 size_t len = strlen (arg_str);
12974
12975 if (len == 0)
12976 {
12977 error ("malformed %<target()%> pragma or attribute");
12978 return false;
12979 }
12980
12981 char *str_to_check = (char *) alloca (len + 1);
12982 strcpy (str_to_check, arg_str);
12983
12984 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12985 It is easier to detect and handle it explicitly here rather than going
12986 through the machinery for the rest of the target attributes in this
12987 function. */
12988 if (*str_to_check == '+')
12989 return aarch64_handle_attr_isa_flags (str_to_check);
12990
12991 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12992 {
12993 invert = true;
12994 str_to_check += 3;
12995 }
12996 char *arg = strchr (str_to_check, '=');
12997
12998 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12999 and point ARG to "foo". */
13000 if (arg)
13001 {
13002 *arg = '\0';
13003 arg++;
13004 }
13005 const struct aarch64_attribute_info *p_attr;
13006 bool found = false;
13007 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13008 {
13009 /* If the names don't match up, or the user has given an argument
13010 to an attribute that doesn't accept one, or didn't give an argument
13011 to an attribute that expects one, fail to match. */
13012 if (strcmp (str_to_check, p_attr->name) != 0)
13013 continue;
13014
13015 found = true;
13016 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13017 || p_attr->attr_type == aarch64_attr_enum;
13018
13019 if (attr_need_arg_p ^ (arg != NULL))
13020 {
13021 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13022 return false;
13023 }
13024
13025 /* If the name matches but the attribute does not allow "no-" versions
13026 then we can't match. */
13027 if (invert && !p_attr->allow_neg)
13028 {
13029 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13030 return false;
13031 }
13032
13033 switch (p_attr->attr_type)
13034 {
13035 /* Has a custom handler registered.
13036 For example, cpu=, arch=, tune=. */
13037 case aarch64_attr_custom:
13038 gcc_assert (p_attr->handler);
13039 if (!p_attr->handler (arg))
13040 return false;
13041 break;
13042
13043 /* Either set or unset a boolean option. */
13044 case aarch64_attr_bool:
13045 {
13046 struct cl_decoded_option decoded;
13047
13048 generate_option (p_attr->opt_num, NULL, !invert,
13049 CL_TARGET, &decoded);
13050 aarch64_handle_option (&global_options, &global_options_set,
13051 &decoded, input_location);
13052 break;
13053 }
13054 /* Set or unset a bit in the target_flags. aarch64_handle_option
13055 should know what mask to apply given the option number. */
13056 case aarch64_attr_mask:
13057 {
13058 struct cl_decoded_option decoded;
13059 /* We only need to specify the option number.
13060 aarch64_handle_option will know which mask to apply. */
13061 decoded.opt_index = p_attr->opt_num;
13062 decoded.value = !invert;
13063 aarch64_handle_option (&global_options, &global_options_set,
13064 &decoded, input_location);
13065 break;
13066 }
13067 /* Use the option setting machinery to set an option to an enum. */
13068 case aarch64_attr_enum:
13069 {
13070 gcc_assert (arg);
13071 bool valid;
13072 int value;
13073 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13074 &value, CL_TARGET);
13075 if (valid)
13076 {
13077 set_option (&global_options, NULL, p_attr->opt_num, value,
13078 NULL, DK_UNSPECIFIED, input_location,
13079 global_dc);
13080 }
13081 else
13082 {
13083 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13084 }
13085 break;
13086 }
13087 default:
13088 gcc_unreachable ();
13089 }
13090 }
13091
13092 /* If we reached here we either have found an attribute and validated
13093 it or didn't match any. If we matched an attribute but its arguments
13094 were malformed we will have returned false already. */
13095 return found;
13096 }
13097
13098 /* Count how many times the character C appears in
13099 NULL-terminated string STR. */
13100
13101 static unsigned int
13102 num_occurences_in_str (char c, char *str)
13103 {
13104 unsigned int res = 0;
13105 while (*str != '\0')
13106 {
13107 if (*str == c)
13108 res++;
13109
13110 str++;
13111 }
13112
13113 return res;
13114 }
13115
13116 /* Parse the tree in ARGS that contains the target attribute information
13117 and update the global target options space. */
13118
13119 bool
13120 aarch64_process_target_attr (tree args)
13121 {
13122 if (TREE_CODE (args) == TREE_LIST)
13123 {
13124 do
13125 {
13126 tree head = TREE_VALUE (args);
13127 if (head)
13128 {
13129 if (!aarch64_process_target_attr (head))
13130 return false;
13131 }
13132 args = TREE_CHAIN (args);
13133 } while (args);
13134
13135 return true;
13136 }
13137
13138 if (TREE_CODE (args) != STRING_CST)
13139 {
13140 error ("attribute %<target%> argument not a string");
13141 return false;
13142 }
13143
13144 size_t len = strlen (TREE_STRING_POINTER (args));
13145 char *str_to_check = (char *) alloca (len + 1);
13146 strcpy (str_to_check, TREE_STRING_POINTER (args));
13147
13148 if (len == 0)
13149 {
13150 error ("malformed %<target()%> pragma or attribute");
13151 return false;
13152 }
13153
13154 /* Used to catch empty strings between commas, e.g.
13155 attribute ((target ("attr1,,attr2"))). */
13156 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13157
13158 /* Handle multiple target attributes separated by ','. */
13159 char *token = strtok_r (str_to_check, ",", &str_to_check);
13160
13161 unsigned int num_attrs = 0;
13162 while (token)
13163 {
13164 num_attrs++;
13165 if (!aarch64_process_one_target_attr (token))
13166 {
13167 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13168 return false;
13169 }
13170
13171 token = strtok_r (NULL, ",", &str_to_check);
13172 }
13173
13174 if (num_attrs != num_commas + 1)
13175 {
13176 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13177 return false;
13178 }
13179
13180 return true;
13181 }
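/* Note (added commentary): strtok_r collapses consecutive delimiters, so
   for "attr1,,attr2" it produces only two tokens even though the string
   contains two commas; the num_attrs != num_commas + 1 check above is
   what rejects the empty entry.  A minimal standalone sketch of the same
   technique, using hypothetical names that are not part of GCC:

     static bool
     tokens_match_commas (char *s)
     {
       unsigned int commas = 0, tokens = 0;
       for (char *p = s; *p != '\0'; p++)
         if (*p == ',')
           commas++;
       char *save;
       for (char *tok = strtok_r (s, ",", &save); tok;
            tok = strtok_r (NULL, ",", &save))
         tokens++;
       return tokens == commas + 1;
     }  */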
13182
13183 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13184 process attribute ((target ("..."))). */
13185
13186 static bool
13187 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13188 {
13189 struct cl_target_option cur_target;
13190 bool ret;
13191 tree old_optimize;
13192 tree new_target, new_optimize;
13193 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13194
13195 /* If what we're processing is the current pragma string then the
13196 target option node is already stored in target_option_current_node
13197 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13198 having to re-parse the string. This is especially useful to keep
13199 arm_neon.h compile times down since that header contains a lot
13200 of intrinsics enclosed in pragmas. */
13201 if (!existing_target && args == current_target_pragma)
13202 {
13203 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13204 return true;
13205 }
13206 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13207
13208 old_optimize = build_optimization_node (&global_options);
13209 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13210
13211 /* If the function changed the optimization levels as well as setting
13212 target options, start with the optimizations specified. */
13213 if (func_optimize && func_optimize != old_optimize)
13214 cl_optimization_restore (&global_options,
13215 TREE_OPTIMIZATION (func_optimize));
13216
13217 /* Save the current target options to restore at the end. */
13218 cl_target_option_save (&cur_target, &global_options);
13219
13220 /* If fndecl already has some target attributes applied to it, unpack
13221 them so that we add this attribute on top of them, rather than
13222 overwriting them. */
13223 if (existing_target)
13224 {
13225 struct cl_target_option *existing_options
13226 = TREE_TARGET_OPTION (existing_target);
13227
13228 if (existing_options)
13229 cl_target_option_restore (&global_options, existing_options);
13230 }
13231 else
13232 cl_target_option_restore (&global_options,
13233 TREE_TARGET_OPTION (target_option_current_node));
13234
13235 ret = aarch64_process_target_attr (args);
13236
13237 /* Set up any additional state. */
13238 if (ret)
13239 {
13240 aarch64_override_options_internal (&global_options);
13241 /* Initialize SIMD builtins if we haven't already.
13242 Set current_target_pragma to NULL for the duration so that
13243 the builtin initialization code doesn't try to tag the functions
13244 being built with the attributes specified by any current pragma, thus
13245 going into an infinite recursion. */
13246 if (TARGET_SIMD)
13247 {
13248 tree saved_current_target_pragma = current_target_pragma;
13249 current_target_pragma = NULL;
13250 aarch64_init_simd_builtins ();
13251 current_target_pragma = saved_current_target_pragma;
13252 }
13253 new_target = build_target_option_node (&global_options);
13254 }
13255 else
13256 new_target = NULL;
13257
13258 new_optimize = build_optimization_node (&global_options);
13259
13260 if (fndecl && ret)
13261 {
13262 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13263
13264 if (old_optimize != new_optimize)
13265 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13266 }
13267
13268 cl_target_option_restore (&global_options, &cur_target);
13269
13270 if (old_optimize != new_optimize)
13271 cl_optimization_restore (&global_options,
13272 TREE_OPTIMIZATION (old_optimize));
13273 return ret;
13274 }
13275
13276 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13277 tri-bool options (yes, no, don't care) and the default value is
13278 DEF, determine whether to reject inlining. */
13279
13280 static bool
13281 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13282 int dont_care, int def)
13283 {
13284 /* If the callee doesn't care, always allow inlining. */
13285 if (callee == dont_care)
13286 return true;
13287
13288 /* If the caller doesn't care, always allow inlining. */
13289 if (caller == dont_care)
13290 return true;
13291
13292 /* Otherwise, allow inlining if either the callee and caller values
13293 agree, or if the callee is using the default value. */
13294 return (callee == caller || callee == def);
13295 }
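/* For example (illustrative only), with DONT_CARE == 2:
   caller == 1, callee == 2             -> inline (callee doesn't care);
   caller == 2, callee == 0             -> inline (caller doesn't care);
   caller == 0, callee == 1, DEF == 1   -> inline (callee uses the default);
   caller == 0, callee == 1, DEF == 0   -> reject (explicit mismatch).  */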
13296
13297 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13298 to inline CALLEE into CALLER based on target-specific info.
13299 Make sure that the caller and callee have compatible architectural
13300 features. Then go through the other possible target attributes
13301 and see if they can block inlining. Try not to reject always_inline
13302 callees unless they are incompatible architecturally. */
13303
13304 static bool
13305 aarch64_can_inline_p (tree caller, tree callee)
13306 {
13307 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13308 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13309
13310 struct cl_target_option *caller_opts
13311 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13312 : target_option_default_node);
13313
13314 struct cl_target_option *callee_opts
13315 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13316 : target_option_default_node);
13317
13318 /* Callee's ISA flags should be a subset of the caller's. */
13319 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13320 != callee_opts->x_aarch64_isa_flags)
13321 return false;
13322
13323 /* Allow non-strict aligned functions inlining into strict
13324 aligned ones. */
13325 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13326 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13327 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13328 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13329 return false;
13330
13331 bool always_inline = lookup_attribute ("always_inline",
13332 DECL_ATTRIBUTES (callee));
13333
13334 /* If the architectural features match up and the callee is always_inline
13335 then the other attributes don't matter. */
13336 if (always_inline)
13337 return true;
13338
13339 if (caller_opts->x_aarch64_cmodel_var
13340 != callee_opts->x_aarch64_cmodel_var)
13341 return false;
13342
13343 if (caller_opts->x_aarch64_tls_dialect
13344 != callee_opts->x_aarch64_tls_dialect)
13345 return false;
13346
13347 /* Honour explicit requests to work around errata. */
13348 if (!aarch64_tribools_ok_for_inlining_p (
13349 caller_opts->x_aarch64_fix_a53_err835769,
13350 callee_opts->x_aarch64_fix_a53_err835769,
13351 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13352 return false;
13353
13354 if (!aarch64_tribools_ok_for_inlining_p (
13355 caller_opts->x_aarch64_fix_a53_err843419,
13356 callee_opts->x_aarch64_fix_a53_err843419,
13357 2, TARGET_FIX_ERR_A53_843419))
13358 return false;
13359
13360 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13361 caller and callee and they don't match up, reject inlining. */
13362 if (!aarch64_tribools_ok_for_inlining_p (
13363 caller_opts->x_flag_omit_leaf_frame_pointer,
13364 callee_opts->x_flag_omit_leaf_frame_pointer,
13365 2, 1))
13366 return false;
13367
13368 /* If the callee has specific tuning overrides, respect them. */
13369 if (callee_opts->x_aarch64_override_tune_string != NULL
13370 && caller_opts->x_aarch64_override_tune_string == NULL)
13371 return false;
13372
13373 /* If the user specified tuning override strings for the
13374 caller and callee and they don't match up, reject inlining.
13375 We just do a string compare here, we don't analyze the meaning
13376 of the string, as it would be too costly for little gain. */
13377 if (callee_opts->x_aarch64_override_tune_string
13378 && caller_opts->x_aarch64_override_tune_string
13379 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13380 caller_opts->x_aarch64_override_tune_string) != 0))
13381 return false;
13382
13383 return true;
13384 }
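/* An illustrative consequence of the subset check above (added
   commentary): a callee declared with, say,
   __attribute__ ((target ("+crc"))) can be inlined into a caller built
   with -march=armv8-a+crc (or carrying the same attribute), because the
   callee's ISA flags are then a subset of the caller's, but not into a
   caller built with plain -march=armv8-a.  */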
13385
13386 /* Return true if SYMBOL_REF X binds locally. */
13387
13388 static bool
13389 aarch64_symbol_binds_local_p (const_rtx x)
13390 {
13391 return (SYMBOL_REF_DECL (x)
13392 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13393 : SYMBOL_REF_LOCAL_P (x));
13394 }
13395
13396 /* Return true if SYMBOL_REF X is thread-local. */
13397 static bool
13398 aarch64_tls_symbol_p (rtx x)
13399 {
13400 if (! TARGET_HAVE_TLS)
13401 return false;
13402
13403 if (GET_CODE (x) != SYMBOL_REF)
13404 return false;
13405
13406 return SYMBOL_REF_TLS_MODEL (x) != 0;
13407 }
13408
13409 /* Classify a TLS symbol into one of the TLS kinds. */
13410 enum aarch64_symbol_type
13411 aarch64_classify_tls_symbol (rtx x)
13412 {
13413 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13414
13415 switch (tls_kind)
13416 {
13417 case TLS_MODEL_GLOBAL_DYNAMIC:
13418 case TLS_MODEL_LOCAL_DYNAMIC:
13419 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13420
13421 case TLS_MODEL_INITIAL_EXEC:
13422 switch (aarch64_cmodel)
13423 {
13424 case AARCH64_CMODEL_TINY:
13425 case AARCH64_CMODEL_TINY_PIC:
13426 return SYMBOL_TINY_TLSIE;
13427 default:
13428 return SYMBOL_SMALL_TLSIE;
13429 }
13430
13431 case TLS_MODEL_LOCAL_EXEC:
13432 if (aarch64_tls_size == 12)
13433 return SYMBOL_TLSLE12;
13434 else if (aarch64_tls_size == 24)
13435 return SYMBOL_TLSLE24;
13436 else if (aarch64_tls_size == 32)
13437 return SYMBOL_TLSLE32;
13438 else if (aarch64_tls_size == 48)
13439 return SYMBOL_TLSLE48;
13440 else
13441 gcc_unreachable ();
13442
13443 case TLS_MODEL_EMULATED:
13444 case TLS_MODEL_NONE:
13445 return SYMBOL_FORCE_TO_MEM;
13446
13447 default:
13448 gcc_unreachable ();
13449 }
13450 }
13451
13452 /* Return the correct method for accessing X + OFFSET, where X is either
13453 a SYMBOL_REF or LABEL_REF. */
13454
13455 enum aarch64_symbol_type
13456 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13457 {
13458 if (GET_CODE (x) == LABEL_REF)
13459 {
13460 switch (aarch64_cmodel)
13461 {
13462 case AARCH64_CMODEL_LARGE:
13463 return SYMBOL_FORCE_TO_MEM;
13464
13465 case AARCH64_CMODEL_TINY_PIC:
13466 case AARCH64_CMODEL_TINY:
13467 return SYMBOL_TINY_ABSOLUTE;
13468
13469 case AARCH64_CMODEL_SMALL_SPIC:
13470 case AARCH64_CMODEL_SMALL_PIC:
13471 case AARCH64_CMODEL_SMALL:
13472 return SYMBOL_SMALL_ABSOLUTE;
13473
13474 default:
13475 gcc_unreachable ();
13476 }
13477 }
13478
13479 if (GET_CODE (x) == SYMBOL_REF)
13480 {
13481 if (aarch64_tls_symbol_p (x))
13482 return aarch64_classify_tls_symbol (x);
13483
13484 switch (aarch64_cmodel)
13485 {
13486 case AARCH64_CMODEL_TINY:
13487 /* When we retrieve symbol + offset address, we have to make sure
13488 the offset does not cause overflow of the final address. But
13489 we have no way of knowing the address of the symbol at compile time,
13490 so we can't accurately say if the distance between the PC and
13491 symbol + offset is outside the addressable range of +/-1M in the
13492 TINY code model. So we rely on images not being greater than
13493 1M, cap the offset at 1M, and require anything beyond 1M to
13494 be loaded using an alternative mechanism. Furthermore, if the
13495 symbol is a weak reference to something that isn't known to
13496 resolve to a symbol in this module, then force to memory. */
13497 if ((SYMBOL_REF_WEAK (x)
13498 && !aarch64_symbol_binds_local_p (x))
13499 || !IN_RANGE (offset, -1048575, 1048575))
13500 return SYMBOL_FORCE_TO_MEM;
13501 return SYMBOL_TINY_ABSOLUTE;
13502
13503 case AARCH64_CMODEL_SMALL:
13504 /* Same reasoning as the tiny code model, but the offset cap here is
13505 4G. */
13506 if ((SYMBOL_REF_WEAK (x)
13507 && !aarch64_symbol_binds_local_p (x))
13508 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13509 HOST_WIDE_INT_C (4294967264)))
13510 return SYMBOL_FORCE_TO_MEM;
13511 return SYMBOL_SMALL_ABSOLUTE;
13512
13513 case AARCH64_CMODEL_TINY_PIC:
13514 if (!aarch64_symbol_binds_local_p (x))
13515 return SYMBOL_TINY_GOT;
13516 return SYMBOL_TINY_ABSOLUTE;
13517
13518 case AARCH64_CMODEL_SMALL_SPIC:
13519 case AARCH64_CMODEL_SMALL_PIC:
13520 if (!aarch64_symbol_binds_local_p (x))
13521 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13522 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13523 return SYMBOL_SMALL_ABSOLUTE;
13524
13525 case AARCH64_CMODEL_LARGE:
13526 /* This is alright even in PIC code as the constant
13527 pool reference is always PC relative and within
13528 the same translation unit. */
13529 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13530 return SYMBOL_SMALL_ABSOLUTE;
13531 else
13532 return SYMBOL_FORCE_TO_MEM;
13533
13534 default:
13535 gcc_unreachable ();
13536 }
13537 }
13538
13539 /* By default push everything into the constant pool. */
13540 return SYMBOL_FORCE_TO_MEM;
13541 }
13542
13543 bool
13544 aarch64_constant_address_p (rtx x)
13545 {
13546 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13547 }
13548
13549 bool
13550 aarch64_legitimate_pic_operand_p (rtx x)
13551 {
13552 if (GET_CODE (x) == SYMBOL_REF
13553 || (GET_CODE (x) == CONST
13554 && GET_CODE (XEXP (x, 0)) == PLUS
13555 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13556 return false;
13557
13558 return true;
13559 }
13560
13561 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13562 that should be rematerialized rather than spilled. */
13563
13564 static bool
13565 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13566 {
13567 /* Support CSE and rematerialization of common constants. */
13568 if (CONST_INT_P (x)
13569 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13570 || GET_CODE (x) == CONST_VECTOR)
13571 return true;
13572
13573 /* Do not allow vector struct mode constants for Advanced SIMD.
13574 We could support 0 and -1 easily, but they need support in
13575 aarch64-simd.md. */
13576 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13577 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13578 return false;
13579
13580 /* Only accept variable-length vector constants if they can be
13581 handled directly.
13582
13583 ??? It would be possible to handle rematerialization of other
13584 constants via secondary reloads. */
13585 if (vec_flags & VEC_ANY_SVE)
13586 return aarch64_simd_valid_immediate (x, NULL);
13587
13588 if (GET_CODE (x) == HIGH)
13589 x = XEXP (x, 0);
13590
13591 /* Accept polynomial constants that can be calculated by using the
13592 destination of a move as the sole temporary. Constants that
13593 require a second temporary cannot be rematerialized (they can't be
13594 forced to memory and also aren't legitimate constants). */
13595 poly_int64 offset;
13596 if (poly_int_rtx_p (x, &offset))
13597 return aarch64_offset_temporaries (false, offset) <= 1;
13598
13599 /* If an offset is being added to something else, we need to allow the
13600 base to be moved into the destination register, meaning that there
13601 are no free temporaries for the offset. */
13602 x = strip_offset (x, &offset);
13603 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13604 return false;
13605
13606 /* Do not allow const (plus (anchor_symbol, const_int)). */
13607 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13608 return false;
13609
13610 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13611 so spilling them is better than rematerialization. */
13612 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13613 return true;
13614
13615 /* Label references are always constant. */
13616 if (GET_CODE (x) == LABEL_REF)
13617 return true;
13618
13619 return false;
13620 }
13621
13622 rtx
13623 aarch64_load_tp (rtx target)
13624 {
13625 if (!target
13626 || GET_MODE (target) != Pmode
13627 || !register_operand (target, Pmode))
13628 target = gen_reg_rtx (Pmode);
13629
13630 /* Can return in any reg. */
13631 emit_insn (gen_aarch64_load_tp_hard (target));
13632 return target;
13633 }
13634
13635 /* On AAPCS systems, this is the "struct __va_list". */
13636 static GTY(()) tree va_list_type;
13637
13638 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13639 Return the type to use as __builtin_va_list.
13640
13641 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13642
13643 struct __va_list
13644 {
13645 void *__stack;
13646 void *__gr_top;
13647 void *__vr_top;
13648 int __gr_offs;
13649 int __vr_offs;
13650 }; */
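/* Added commentary on the field semantics, as set up by
   aarch64_expand_builtin_va_start below: __stack points to the next
   anonymous argument passed on the stack; __gr_top and __vr_top point
   just past the general-register and vector-register save areas; and
   __gr_offs/__vr_offs hold the (negative) offset from the corresponding
   top to the next unconsumed register argument, increasing towards zero
   as va_arg consumes arguments.  */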
13651
13652 static tree
13653 aarch64_build_builtin_va_list (void)
13654 {
13655 tree va_list_name;
13656 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13657
13658 /* Create the type. */
13659 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13660 /* Give it the required name. */
13661 va_list_name = build_decl (BUILTINS_LOCATION,
13662 TYPE_DECL,
13663 get_identifier ("__va_list"),
13664 va_list_type);
13665 DECL_ARTIFICIAL (va_list_name) = 1;
13666 TYPE_NAME (va_list_type) = va_list_name;
13667 TYPE_STUB_DECL (va_list_type) = va_list_name;
13668
13669 /* Create the fields. */
13670 f_stack = build_decl (BUILTINS_LOCATION,
13671 FIELD_DECL, get_identifier ("__stack"),
13672 ptr_type_node);
13673 f_grtop = build_decl (BUILTINS_LOCATION,
13674 FIELD_DECL, get_identifier ("__gr_top"),
13675 ptr_type_node);
13676 f_vrtop = build_decl (BUILTINS_LOCATION,
13677 FIELD_DECL, get_identifier ("__vr_top"),
13678 ptr_type_node);
13679 f_groff = build_decl (BUILTINS_LOCATION,
13680 FIELD_DECL, get_identifier ("__gr_offs"),
13681 integer_type_node);
13682 f_vroff = build_decl (BUILTINS_LOCATION,
13683 FIELD_DECL, get_identifier ("__vr_offs"),
13684 integer_type_node);
13685
13686 /* Tell the tree-stdarg pass about our internal offset fields.
13687 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13688 purposes, to identify whether the code is updating the va_list internal
13689 offset fields in an irregular way. */
13690 va_list_gpr_counter_field = f_groff;
13691 va_list_fpr_counter_field = f_vroff;
13692
13693 DECL_ARTIFICIAL (f_stack) = 1;
13694 DECL_ARTIFICIAL (f_grtop) = 1;
13695 DECL_ARTIFICIAL (f_vrtop) = 1;
13696 DECL_ARTIFICIAL (f_groff) = 1;
13697 DECL_ARTIFICIAL (f_vroff) = 1;
13698
13699 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13700 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13701 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13702 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13703 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13704
13705 TYPE_FIELDS (va_list_type) = f_stack;
13706 DECL_CHAIN (f_stack) = f_grtop;
13707 DECL_CHAIN (f_grtop) = f_vrtop;
13708 DECL_CHAIN (f_vrtop) = f_groff;
13709 DECL_CHAIN (f_groff) = f_vroff;
13710
13711 /* Compute its layout. */
13712 layout_type (va_list_type);
13713
13714 return va_list_type;
13715 }
13716
13717 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13718 static void
13719 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13720 {
13721 const CUMULATIVE_ARGS *cum;
13722 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13723 tree stack, grtop, vrtop, groff, vroff;
13724 tree t;
13725 int gr_save_area_size = cfun->va_list_gpr_size;
13726 int vr_save_area_size = cfun->va_list_fpr_size;
13727 int vr_offset;
13728
13729 cum = &crtl->args.info;
13730 if (cfun->va_list_gpr_size)
13731 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13732 cfun->va_list_gpr_size);
13733 if (cfun->va_list_fpr_size)
13734 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13735 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13736
13737 if (!TARGET_FLOAT)
13738 {
13739 gcc_assert (cum->aapcs_nvrn == 0);
13740 vr_save_area_size = 0;
13741 }
13742
13743 f_stack = TYPE_FIELDS (va_list_type_node);
13744 f_grtop = DECL_CHAIN (f_stack);
13745 f_vrtop = DECL_CHAIN (f_grtop);
13746 f_groff = DECL_CHAIN (f_vrtop);
13747 f_vroff = DECL_CHAIN (f_groff);
13748
13749 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13750 NULL_TREE);
13751 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13752 NULL_TREE);
13753 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13754 NULL_TREE);
13755 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13756 NULL_TREE);
13757 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13758 NULL_TREE);
13759
13760 /* Emit code to initialize STACK, which points to the next varargs stack
13761 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13762 by named arguments. STACK is 8-byte aligned. */
13763 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13764 if (cum->aapcs_stack_size > 0)
13765 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13766 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13767 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13768
13769 /* Emit code to initialize GRTOP, the top of the GR save area.
13770 virtual_incoming_args_rtx should have been 16 byte aligned. */
13771 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13772 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13773 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13774
13775 /* Emit code to initialize VRTOP, the top of the VR save area.
13776 This address is gr_save_area_bytes below GRTOP, rounded
13777 down to the next 16-byte boundary. */
13778 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13779 vr_offset = ROUND_UP (gr_save_area_size,
13780 STACK_BOUNDARY / BITS_PER_UNIT);
13781
13782 if (vr_offset)
13783 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13784 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13785 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13786
13787 /* Emit code to initialize GROFF, the offset from GRTOP of the
13788 next GPR argument. */
13789 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13790 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13791 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13792
13793 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13794 of the next VR argument. */
13795 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13796 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13797 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13798 }
13799
13800 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13801
13802 static tree
13803 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13804 gimple_seq *post_p ATTRIBUTE_UNUSED)
13805 {
13806 tree addr;
13807 bool indirect_p;
13808 bool is_ha; /* is HFA or HVA. */
13809 bool dw_align; /* double-word align. */
13810 machine_mode ag_mode = VOIDmode;
13811 int nregs;
13812 machine_mode mode;
13813
13814 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13815 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13816 HOST_WIDE_INT size, rsize, adjust, align;
13817 tree t, u, cond1, cond2;
13818
13819 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13820 if (indirect_p)
13821 type = build_pointer_type (type);
13822
13823 mode = TYPE_MODE (type);
13824
13825 f_stack = TYPE_FIELDS (va_list_type_node);
13826 f_grtop = DECL_CHAIN (f_stack);
13827 f_vrtop = DECL_CHAIN (f_grtop);
13828 f_groff = DECL_CHAIN (f_vrtop);
13829 f_vroff = DECL_CHAIN (f_groff);
13830
13831 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13832 f_stack, NULL_TREE);
13833 size = int_size_in_bytes (type);
13834
13835 bool abi_break;
13836 align
13837 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13838
13839 dw_align = false;
13840 adjust = 0;
13841 if (aarch64_vfp_is_call_or_return_candidate (mode,
13842 type,
13843 &ag_mode,
13844 &nregs,
13845 &is_ha))
13846 {
13847 /* No frontends can create types with variable-sized modes, so we
13848 shouldn't be asked to pass or return them. */
13849 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13850
13851 /* TYPE passed in fp/simd registers. */
13852 if (!TARGET_FLOAT)
13853 aarch64_err_no_fpadvsimd (mode);
13854
13855 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13856 unshare_expr (valist), f_vrtop, NULL_TREE);
13857 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13858 unshare_expr (valist), f_vroff, NULL_TREE);
13859
13860 rsize = nregs * UNITS_PER_VREG;
13861
13862 if (is_ha)
13863 {
13864 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13865 adjust = UNITS_PER_VREG - ag_size;
13866 }
13867 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13868 && size < UNITS_PER_VREG)
13869 {
13870 adjust = UNITS_PER_VREG - size;
13871 }
13872 }
13873 else
13874 {
13875 /* TYPE passed in general registers. */
13876 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13877 unshare_expr (valist), f_grtop, NULL_TREE);
13878 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13879 unshare_expr (valist), f_groff, NULL_TREE);
13880 rsize = ROUND_UP (size, UNITS_PER_WORD);
13881 nregs = rsize / UNITS_PER_WORD;
13882
13883 if (align > 8)
13884 {
13885 if (abi_break && warn_psabi)
13886 inform (input_location, "parameter passing for argument of type "
13887 "%qT changed in GCC 9.1", type);
13888 dw_align = true;
13889 }
13890
13891 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13892 && size < UNITS_PER_WORD)
13893 {
13894 adjust = UNITS_PER_WORD - size;
13895 }
13896 }
13897
13898 /* Get a local temporary for the field value. */
13899 off = get_initialized_tmp_var (f_off, pre_p, NULL);
13900
13901 /* Emit code to branch if off >= 0. */
13902 t = build2 (GE_EXPR, boolean_type_node, off,
13903 build_int_cst (TREE_TYPE (off), 0));
13904 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13905
13906 if (dw_align)
13907 {
13908 /* Emit: offs = (offs + 15) & -16. */
13909 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13910 build_int_cst (TREE_TYPE (off), 15));
13911 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13912 build_int_cst (TREE_TYPE (off), -16));
13913 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13914 }
13915 else
13916 roundup = NULL;
13917
13918 /* Update ap.__[g|v]r_offs */
13919 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13920 build_int_cst (TREE_TYPE (off), rsize));
13921 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13922
13923 /* String up. */
13924 if (roundup)
13925 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13926
13927 /* [cond2] if (ap.__[g|v]r_offs > 0) */
13928 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13929 build_int_cst (TREE_TYPE (f_off), 0));
13930 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13931
13932 /* String up: make sure the assignment happens before the use. */
13933 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13934 COND_EXPR_ELSE (cond1) = t;
13935
13936 /* Prepare the trees handling the argument that is passed on the stack;
13937 the top-level node will be stored in ON_STACK. */
13938 arg = get_initialized_tmp_var (stack, pre_p, NULL);
13939 if (align > 8)
13940 {
13941 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
13942 t = fold_build_pointer_plus_hwi (arg, 15);
13943 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13944 build_int_cst (TREE_TYPE (t), -16));
13945 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13946 }
13947 else
13948 roundup = NULL;
13949 /* Advance ap.__stack */
13950 t = fold_build_pointer_plus_hwi (arg, size + 7);
13951 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13952 build_int_cst (TREE_TYPE (t), -8));
13953 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13954 /* String up roundup and advance. */
13955 if (roundup)
13956 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13957 /* String up with arg */
13958 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13959 /* Big-endianness related address adjustment. */
13960 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13961 && size < UNITS_PER_WORD)
13962 {
13963 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13964 size_int (UNITS_PER_WORD - size));
13965 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13966 }
13967
13968 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13969 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13970
13971 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
13972 t = off;
13973 if (adjust)
13974 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13975 build_int_cst (TREE_TYPE (off), adjust));
13976
13977 t = fold_convert (sizetype, t);
13978 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13979
13980 if (is_ha)
13981 {
13982 /* type ha; // treat as "struct {ftype field[n];}"
13983 ... [computing offs]
13984 for (i = 0; i < nregs; ++i, offs += 16)
13985 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13986 return ha; */
13987 int i;
13988 tree tmp_ha, field_t, field_ptr_t;
13989
13990 /* Declare a local variable. */
13991 tmp_ha = create_tmp_var_raw (type, "ha");
13992 gimple_add_tmp_var (tmp_ha);
13993
13994 /* Establish the base type. */
13995 switch (ag_mode)
13996 {
13997 case E_SFmode:
13998 field_t = float_type_node;
13999 field_ptr_t = float_ptr_type_node;
14000 break;
14001 case E_DFmode:
14002 field_t = double_type_node;
14003 field_ptr_t = double_ptr_type_node;
14004 break;
14005 case E_TFmode:
14006 field_t = long_double_type_node;
14007 field_ptr_t = long_double_ptr_type_node;
14008 break;
14009 case E_HFmode:
14010 field_t = aarch64_fp16_type_node;
14011 field_ptr_t = aarch64_fp16_ptr_type_node;
14012 break;
14013 case E_V2SImode:
14014 case E_V4SImode:
14015 {
14016 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14017 field_t = build_vector_type_for_mode (innertype, ag_mode);
14018 field_ptr_t = build_pointer_type (field_t);
14019 }
14020 break;
14021 default:
14022 gcc_assert (0);
14023 }
14024
14025 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
14026 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14027 addr = t;
14028 t = fold_convert (field_ptr_t, addr);
14029 t = build2 (MODIFY_EXPR, field_t,
14030 build1 (INDIRECT_REF, field_t, tmp_ha),
14031 build1 (INDIRECT_REF, field_t, t));
14032
14033 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14034 for (i = 1; i < nregs; ++i)
14035 {
14036 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14037 u = fold_convert (field_ptr_t, addr);
14038 u = build2 (MODIFY_EXPR, field_t,
14039 build2 (MEM_REF, field_t, tmp_ha,
14040 build_int_cst (field_ptr_t,
14041 (i *
14042 int_size_in_bytes (field_t)))),
14043 build1 (INDIRECT_REF, field_t, u));
14044 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14045 }
14046
14047 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14048 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14049 }
14050
14051 COND_EXPR_ELSE (cond2) = t;
14052 addr = fold_convert (build_pointer_type (type), cond1);
14053 addr = build_va_arg_indirect_ref (addr);
14054
14055 if (indirect_p)
14056 addr = build_va_arg_indirect_ref (addr);
14057
14058 return addr;
14059 }
14060
14061 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14062
14063 static void
14064 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14065 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14066 int no_rtl)
14067 {
14068 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14069 CUMULATIVE_ARGS local_cum;
14070 int gr_saved = cfun->va_list_gpr_size;
14071 int vr_saved = cfun->va_list_fpr_size;
14072
14073 /* The caller has advanced CUM up to, but not beyond, the last named
14074 argument. Advance a local copy of CUM past the last "real" named
14075 argument, to find out how many registers are left over. */
14076 local_cum = *cum;
14077 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14078
14079 /* Find out how many registers we need to save.
14080 Honor the tree-stdarg analysis results. */
14081 if (cfun->va_list_gpr_size)
14082 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14083 cfun->va_list_gpr_size / UNITS_PER_WORD);
14084 if (cfun->va_list_fpr_size)
14085 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14086 cfun->va_list_fpr_size / UNITS_PER_VREG);
14087
14088 if (!TARGET_FLOAT)
14089 {
14090 gcc_assert (local_cum.aapcs_nvrn == 0);
14091 vr_saved = 0;
14092 }
14093
14094 if (!no_rtl)
14095 {
14096 if (gr_saved > 0)
14097 {
14098 rtx ptr, mem;
14099
14100 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14101 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14102 - gr_saved * UNITS_PER_WORD);
14103 mem = gen_frame_mem (BLKmode, ptr);
14104 set_mem_alias_set (mem, get_varargs_alias_set ());
14105
14106 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14107 mem, gr_saved);
14108 }
14109 if (vr_saved > 0)
14110 {
14111 /* We can't use move_block_from_reg, because it will use
14112 the wrong mode, storing D regs only. */
14113 machine_mode mode = TImode;
14114 int off, i, vr_start;
14115
14116 /* Set OFF to the offset from virtual_incoming_args_rtx of
14117 the first vector register. The VR save area lies below
14118 the GR one, and is aligned to 16 bytes. */
14119 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14120 STACK_BOUNDARY / BITS_PER_UNIT);
14121 off -= vr_saved * UNITS_PER_VREG;
14122
14123 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14124 for (i = 0; i < vr_saved; ++i)
14125 {
14126 rtx ptr, mem;
14127
14128 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14129 mem = gen_frame_mem (mode, ptr);
14130 set_mem_alias_set (mem, get_varargs_alias_set ());
14131 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14132 off += UNITS_PER_VREG;
14133 }
14134 }
14135 }
14136
14137 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14138 any complication of having crtl->args.pretend_args_size changed. */
14139 cfun->machine->frame.saved_varargs_size
14140 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14141 STACK_BOUNDARY / BITS_PER_UNIT)
14142 + vr_saved * UNITS_PER_VREG);
14143 }
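/* A worked example of the size computation above (illustrative only),
   using the usual AArch64 values UNITS_PER_WORD == 8, UNITS_PER_VREG == 16
   and a 16-byte STACK_BOUNDARY: with gr_saved == 3 and vr_saved == 2, the
   GR area takes ROUND_UP (3 * 8, 16) == 32 bytes and the VR area takes
   2 * 16 == 32 bytes, so saved_varargs_size is 64.  */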
14144
14145 static void
14146 aarch64_conditional_register_usage (void)
14147 {
14148 int i;
14149 if (!TARGET_FLOAT)
14150 {
14151 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14152 {
14153 fixed_regs[i] = 1;
14154 call_used_regs[i] = 1;
14155 }
14156 }
14157 if (!TARGET_SVE)
14158 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14159 {
14160 fixed_regs[i] = 1;
14161 call_used_regs[i] = 1;
14162 }
14163
14164 /* When tracking speculation, we need a couple of call-clobbered registers
14165 to track the speculation state. It would be nice to just use
14166 IP0 and IP1, but currently there are numerous places that just
14167 assume these registers are free for other uses (e.g. pointer
14168 authentication). */
14169 if (aarch64_track_speculation)
14170 {
14171 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14172 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14173 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14174 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14175 }
14176 }
14177
14178 /* Walk down the type tree of TYPE counting consecutive base elements.
14179 If *MODEP is VOIDmode, then set it to the first valid floating point
14180 type. If a non-floating point type is found, or if a floating point
14181 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14182 otherwise return the count in the sub-tree. */
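/* For example (illustrative only), for

     struct hfa { float a; float b[2]; };

   the walk below finds three consecutive SFmode elements with no padding,
   so it returns 3 and the struct qualifies as a homogeneous floating-point
   aggregate for the AAPCS64 purposes handled here.  */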
14183 static int
14184 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14185 {
14186 machine_mode mode;
14187 HOST_WIDE_INT size;
14188
14189 switch (TREE_CODE (type))
14190 {
14191 case REAL_TYPE:
14192 mode = TYPE_MODE (type);
14193 if (mode != DFmode && mode != SFmode
14194 && mode != TFmode && mode != HFmode)
14195 return -1;
14196
14197 if (*modep == VOIDmode)
14198 *modep = mode;
14199
14200 if (*modep == mode)
14201 return 1;
14202
14203 break;
14204
14205 case COMPLEX_TYPE:
14206 mode = TYPE_MODE (TREE_TYPE (type));
14207 if (mode != DFmode && mode != SFmode
14208 && mode != TFmode && mode != HFmode)
14209 return -1;
14210
14211 if (*modep == VOIDmode)
14212 *modep = mode;
14213
14214 if (*modep == mode)
14215 return 2;
14216
14217 break;
14218
14219 case VECTOR_TYPE:
14220 /* Use V2SImode and V4SImode as representatives of all 64-bit
14221 and 128-bit vector types. */
14222 size = int_size_in_bytes (type);
14223 switch (size)
14224 {
14225 case 8:
14226 mode = V2SImode;
14227 break;
14228 case 16:
14229 mode = V4SImode;
14230 break;
14231 default:
14232 return -1;
14233 }
14234
14235 if (*modep == VOIDmode)
14236 *modep = mode;
14237
14238 /* Vector modes are considered to be opaque: two vectors are
14239 equivalent for the purposes of being homogeneous aggregates
14240 if they are the same size. */
14241 if (*modep == mode)
14242 return 1;
14243
14244 break;
14245
14246 case ARRAY_TYPE:
14247 {
14248 int count;
14249 tree index = TYPE_DOMAIN (type);
14250
14251 /* Can't handle incomplete types nor sizes that are not
14252 fixed. */
14253 if (!COMPLETE_TYPE_P (type)
14254 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14255 return -1;
14256
14257 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14258 if (count == -1
14259 || !index
14260 || !TYPE_MAX_VALUE (index)
14261 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14262 || !TYPE_MIN_VALUE (index)
14263 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14264 || count < 0)
14265 return -1;
14266
14267 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14268 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14269
14270 /* There must be no padding. */
14271 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14272 count * GET_MODE_BITSIZE (*modep)))
14273 return -1;
14274
14275 return count;
14276 }
14277
14278 case RECORD_TYPE:
14279 {
14280 int count = 0;
14281 int sub_count;
14282 tree field;
14283
14284 /* Can't handle incomplete types nor sizes that are not
14285 fixed. */
14286 if (!COMPLETE_TYPE_P (type)
14287 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14288 return -1;
14289
14290 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14291 {
14292 if (TREE_CODE (field) != FIELD_DECL)
14293 continue;
14294
14295 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14296 if (sub_count < 0)
14297 return -1;
14298 count += sub_count;
14299 }
14300
14301 /* There must be no padding. */
14302 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14303 count * GET_MODE_BITSIZE (*modep)))
14304 return -1;
14305
14306 return count;
14307 }
14308
14309 case UNION_TYPE:
14310 case QUAL_UNION_TYPE:
14311 {
14312 /* These aren't very interesting except in a degenerate case. */
14313 int count = 0;
14314 int sub_count;
14315 tree field;
14316
14317 /* Can't handle incomplete types nor sizes that are not
14318 fixed. */
14319 if (!COMPLETE_TYPE_P (type)
14320 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14321 return -1;
14322
14323 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14324 {
14325 if (TREE_CODE (field) != FIELD_DECL)
14326 continue;
14327
14328 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14329 if (sub_count < 0)
14330 return -1;
14331 count = count > sub_count ? count : sub_count;
14332 }
14333
14334 /* There must be no padding. */
14335 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14336 count * GET_MODE_BITSIZE (*modep)))
14337 return -1;
14338
14339 return count;
14340 }
14341
14342 default:
14343 break;
14344 }
14345
14346 return -1;
14347 }
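/* Worked examples for the function above: a struct with four float fields
   is counted as four SFmode elements (*MODEP == SFmode, return value 4),
   so it is a candidate homogeneous floating-point aggregate; a struct
   containing a float and a double fails the *MODEP check on the second
   field and returns -1; and "float a[3]" counts as three SFmode elements
   through the ARRAY_TYPE case.  */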
14348
14349 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14350 type as described in AAPCS64 \S 4.1.2.
14351
14352 See the comment above aarch64_composite_type_p for the notes on MODE. */
14353
14354 static bool
14355 aarch64_short_vector_p (const_tree type,
14356 machine_mode mode)
14357 {
14358 poly_int64 size = -1;
14359
14360 if (type && TREE_CODE (type) == VECTOR_TYPE)
14361 size = int_size_in_bytes (type);
14362 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14363 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14364 size = GET_MODE_SIZE (mode);
14365
14366 return known_eq (size, 8) || known_eq (size, 16);
14367 }
14368
14369 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14370 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14371 array types. The C99 floating-point complex types are also considered
14372 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14373 types, which are GCC extensions and out of the scope of AAPCS64, are
14374 treated as composite types here as well.
14375
14376 Note that MODE itself is not sufficient in determining whether a type
14377 is such a composite type or not. This is because
14378 stor-layout.c:compute_record_mode may have already changed the MODE
14379 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14380 structure with only one field may have its MODE set to the mode of the
14381 field. Also an integer mode whose size matches the size of the
14382 RECORD_TYPE may be substituted for the original mode
14383 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14384 solely relied on. */
14385
14386 static bool
14387 aarch64_composite_type_p (const_tree type,
14388 machine_mode mode)
14389 {
14390 if (aarch64_short_vector_p (type, mode))
14391 return false;
14392
14393 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14394 return true;
14395
14396 if (mode == BLKmode
14397 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14398 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14399 return true;
14400
14401 return false;
14402 }
14403
14404 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14405 shall be passed or returned in simd/fp register(s) (providing these
14406 parameter passing registers are available).
14407
14408 Upon successful return, *COUNT returns the number of needed registers,
14409 *BASE_MODE returns the mode of the individual register and when IS_HAF
14410 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14411 floating-point aggregate or a homogeneous short-vector aggregate. */
14412
14413 static bool
14414 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14415 const_tree type,
14416 machine_mode *base_mode,
14417 int *count,
14418 bool *is_ha)
14419 {
14420 machine_mode new_mode = VOIDmode;
14421 bool composite_p = aarch64_composite_type_p (type, mode);
14422
14423 if (is_ha != NULL) *is_ha = false;
14424
14425 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14426 || aarch64_short_vector_p (type, mode))
14427 {
14428 *count = 1;
14429 new_mode = mode;
14430 }
14431 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14432 {
14433 if (is_ha != NULL) *is_ha = true;
14434 *count = 2;
14435 new_mode = GET_MODE_INNER (mode);
14436 }
14437 else if (type && composite_p)
14438 {
14439 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14440
14441 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14442 {
14443 if (is_ha != NULL) *is_ha = true;
14444 *count = ag_count;
14445 }
14446 else
14447 return false;
14448 }
14449 else
14450 return false;
14451
14452 *base_mode = new_mode;
14453 return true;
14454 }
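/* Illustrative outcomes: a _Complex double argument takes the
   MODE_COMPLEX_FLOAT path, giving *COUNT == 2, *BASE_MODE == DFmode and
   *IS_HA set; a struct of four floats is a homogeneous aggregate with
   *COUNT == 4 and *BASE_MODE == SFmode (HA_MAX_NUM_FLDS is 4, so a fifth
   field would disqualify it); and a plain double hits the scalar case
   with *COUNT == 1 and *IS_HA left false.  */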
14455
14456 /* Implement TARGET_STRUCT_VALUE_RTX. */
14457
14458 static rtx
14459 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14460 int incoming ATTRIBUTE_UNUSED)
14461 {
14462 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14463 }
14464
14465 /* Implements target hook vector_mode_supported_p. */
14466 static bool
14467 aarch64_vector_mode_supported_p (machine_mode mode)
14468 {
14469 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14470 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14471 }
14472
14473 /* Return the full-width SVE vector mode for element mode MODE, if one
14474 exists. */
14475 opt_machine_mode
14476 aarch64_full_sve_mode (scalar_mode mode)
14477 {
14478 switch (mode)
14479 {
14480 case E_DFmode:
14481 return VNx2DFmode;
14482 case E_SFmode:
14483 return VNx4SFmode;
14484 case E_HFmode:
14485 return VNx8HFmode;
14486 case E_DImode:
14487 return VNx2DImode;
14488 case E_SImode:
14489 return VNx4SImode;
14490 case E_HImode:
14491 return VNx8HImode;
14492 case E_QImode:
14493 return VNx16QImode;
14494 default:
14495 return opt_machine_mode ();
14496 }
14497 }
14498
14499 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14500 if it exists. */
14501 opt_machine_mode
14502 aarch64_vq_mode (scalar_mode mode)
14503 {
14504 switch (mode)
14505 {
14506 case E_DFmode:
14507 return V2DFmode;
14508 case E_SFmode:
14509 return V4SFmode;
14510 case E_HFmode:
14511 return V8HFmode;
14512 case E_SImode:
14513 return V4SImode;
14514 case E_HImode:
14515 return V8HImode;
14516 case E_QImode:
14517 return V16QImode;
14518 case E_DImode:
14519 return V2DImode;
14520 default:
14521 return opt_machine_mode ();
14522 }
14523 }
14524
14525 /* Return appropriate SIMD container
14526 for MODE within a vector of WIDTH bits. */
14527 static machine_mode
14528 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14529 {
14530 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14531 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14532
14533 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14534 if (TARGET_SIMD)
14535 {
14536 if (known_eq (width, 128))
14537 return aarch64_vq_mode (mode).else_mode (word_mode);
14538 else
14539 switch (mode)
14540 {
14541 case E_SFmode:
14542 return V2SFmode;
14543 case E_HFmode:
14544 return V4HFmode;
14545 case E_SImode:
14546 return V2SImode;
14547 case E_HImode:
14548 return V4HImode;
14549 case E_QImode:
14550 return V8QImode;
14551 default:
14552 break;
14553 }
14554 }
14555 return word_mode;
14556 }
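/* Example mappings: (SImode, 128) -> V4SImode and (SImode, 64) -> V2SImode
   when TARGET_SIMD is enabled, (HFmode, 64) -> V4HFmode, and
   (SImode, BITS_PER_SVE_VECTOR) -> VNx4SImode when TARGET_SVE is enabled.
   Without TARGET_SIMD the function falls back to word_mode.  */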
14557
14558 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14559 static machine_mode
14560 aarch64_preferred_simd_mode (scalar_mode mode)
14561 {
14562 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14563 return aarch64_simd_container_mode (mode, bits);
14564 }
14565
14566 /* Return a list of possible vector sizes for the vectorizer
14567 to iterate over. */
14568 static void
14569 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14570 {
14571 if (TARGET_SVE)
14572 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14573 sizes->safe_push (16);
14574 sizes->safe_push (8);
14575 }
14576
14577 /* Implement TARGET_MANGLE_TYPE. */
14578
14579 static const char *
14580 aarch64_mangle_type (const_tree type)
14581 {
14582 /* The AArch64 ABI documents say that "__va_list" has to be
14583 mangled as if it is in the "std" namespace. */
14584 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14585 return "St9__va_list";
14586
14587 /* Half-precision float. */
14588 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14589 return "Dh";
14590
14591 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14592 builtin types. */
14593 if (TYPE_NAME (type) != NULL)
14594 return aarch64_mangle_builtin_type (type);
14595
14596 /* Use the default mangling. */
14597 return NULL;
14598 }
14599
14600 /* Find the first rtx_insn before insn that will generate an assembly
14601 instruction. */
14602
14603 static rtx_insn *
14604 aarch64_prev_real_insn (rtx_insn *insn)
14605 {
14606 if (!insn)
14607 return NULL;
14608
14609 do
14610 {
14611 insn = prev_real_insn (insn);
14612 }
14613 while (insn && recog_memoized (insn) < 0);
14614
14615 return insn;
14616 }
14617
14618 static bool
14619 is_madd_op (enum attr_type t1)
14620 {
14621 unsigned int i;
14622 /* A number of these may be AArch32 only. */
14623 enum attr_type mlatypes[] = {
14624 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14625 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14626 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14627 };
14628
14629 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14630 {
14631 if (t1 == mlatypes[i])
14632 return true;
14633 }
14634
14635 return false;
14636 }
14637
14638 /* Check if there is a register dependency between a load and the insn
14639 for which we hold recog_data. */
14640
14641 static bool
14642 dep_between_memop_and_curr (rtx memop)
14643 {
14644 rtx load_reg;
14645 int opno;
14646
14647 gcc_assert (GET_CODE (memop) == SET);
14648
14649 if (!REG_P (SET_DEST (memop)))
14650 return false;
14651
14652 load_reg = SET_DEST (memop);
14653 for (opno = 1; opno < recog_data.n_operands; opno++)
14654 {
14655 rtx operand = recog_data.operand[opno];
14656 if (REG_P (operand)
14657 && reg_overlap_mentioned_p (load_reg, operand))
14658 return true;
14659
14660 }
14661 return false;
14662 }
14663
14664
14665 /* When working around the Cortex-A53 erratum 835769,
14666 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14667 instruction and has a preceding memory instruction such that a NOP
14668 should be inserted between them. */
14669
14670 bool
14671 aarch64_madd_needs_nop (rtx_insn* insn)
14672 {
14673 enum attr_type attr_type;
14674 rtx_insn *prev;
14675 rtx body;
14676
14677 if (!TARGET_FIX_ERR_A53_835769)
14678 return false;
14679
14680 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14681 return false;
14682
14683 attr_type = get_attr_type (insn);
14684 if (!is_madd_op (attr_type))
14685 return false;
14686
14687 prev = aarch64_prev_real_insn (insn);
14688 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14689 Restore recog state to INSN to avoid state corruption. */
14690 extract_constrain_insn_cached (insn);
14691
14692 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14693 return false;
14694
14695 body = single_set (prev);
14696
14697 /* If the previous insn is a memory op and there is no dependency between
14698 it and the DImode madd, emit a NOP between them. If body is NULL then we
14699 have a complex memory operation, probably a load/store pair.
14700 Be conservative for now and emit a NOP. */
14701 if (GET_MODE (recog_data.operand[0]) == DImode
14702 && (!body || !dep_between_memop_and_curr (body)))
14703 return true;
14704
14705 return false;
14706
14707 }
14708
14709
14710 /* Implement FINAL_PRESCAN_INSN. */
14711
14712 void
14713 aarch64_final_prescan_insn (rtx_insn *insn)
14714 {
14715 if (aarch64_madd_needs_nop (insn))
14716 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14717 }
14718
14719
14720 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14721 instruction. */
14722
14723 bool
14724 aarch64_sve_index_immediate_p (rtx base_or_step)
14725 {
14726 return (CONST_INT_P (base_or_step)
14727 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14728 }
14729
14730 /* Return true if X is a valid immediate for the SVE ADD and SUB
14731 instructions. Negate X first if NEGATE_P is true. */
14732
14733 bool
14734 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14735 {
14736 rtx elt;
14737
14738 if (!const_vec_duplicate_p (x, &elt)
14739 || !CONST_INT_P (elt))
14740 return false;
14741
14742 HOST_WIDE_INT val = INTVAL (elt);
14743 if (negate_p)
14744 val = -val;
14745 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14746
14747 if (val & 0xff)
14748 return IN_RANGE (val, 0, 0xff);
14749 return IN_RANGE (val, 0, 0xff00);
14750 }
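/* In other words, the (possibly negated) duplicated value must be an
   unsigned 8-bit immediate, optionally shifted left by 8.  For example,
   a duplicated 37 or 0x1200 is accepted, whereas a duplicated 257
   (0x101: low byte nonzero but wider than 8 bits) is rejected.  */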
14751
14752 /* Return true if X is a valid immediate operand for an SVE logical
14753 instruction such as AND. */
14754
14755 bool
14756 aarch64_sve_bitmask_immediate_p (rtx x)
14757 {
14758 rtx elt;
14759
14760 return (const_vec_duplicate_p (x, &elt)
14761 && CONST_INT_P (elt)
14762 && aarch64_bitmask_imm (INTVAL (elt),
14763 GET_MODE_INNER (GET_MODE (x))));
14764 }
14765
14766 /* Return true if X is a valid immediate for the SVE DUP and CPY
14767 instructions. */
14768
14769 bool
14770 aarch64_sve_dup_immediate_p (rtx x)
14771 {
14772 rtx elt;
14773
14774 if (!const_vec_duplicate_p (x, &elt)
14775 || !CONST_INT_P (elt))
14776 return false;
14777
14778 HOST_WIDE_INT val = INTVAL (elt);
14779 if (val & 0xff)
14780 return IN_RANGE (val, -0x80, 0x7f);
14781 return IN_RANGE (val, -0x8000, 0x7f00);
14782 }
14783
14784 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14785 SIGNED_P says whether the operand is signed rather than unsigned. */
14786
14787 bool
14788 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14789 {
14790 rtx elt;
14791
14792 return (const_vec_duplicate_p (x, &elt)
14793 && CONST_INT_P (elt)
14794 && (signed_p
14795 ? IN_RANGE (INTVAL (elt), -16, 15)
14796 : IN_RANGE (INTVAL (elt), 0, 127)));
14797 }
14798
14799 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14800 instruction. Negate X first if NEGATE_P is true. */
14801
14802 bool
14803 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14804 {
14805 rtx elt;
14806 REAL_VALUE_TYPE r;
14807
14808 if (!const_vec_duplicate_p (x, &elt)
14809 || GET_CODE (elt) != CONST_DOUBLE)
14810 return false;
14811
14812 r = *CONST_DOUBLE_REAL_VALUE (elt);
14813
14814 if (negate_p)
14815 r = real_value_negate (&r);
14816
14817 if (real_equal (&r, &dconst1))
14818 return true;
14819 if (real_equal (&r, &dconsthalf))
14820 return true;
14821 return false;
14822 }
14823
14824 /* Return true if X is a valid immediate operand for an SVE FMUL
14825 instruction. */
14826
14827 bool
14828 aarch64_sve_float_mul_immediate_p (rtx x)
14829 {
14830 rtx elt;
14831
14832 /* GCC will never generate a multiply with an immediate of 2, so there is no
14833 point testing for it (even though it is a valid constant). */
14834 return (const_vec_duplicate_p (x, &elt)
14835 && GET_CODE (elt) == CONST_DOUBLE
14836 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14837 }
14838
14839 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14840 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14841 is nonnull, use it to describe valid immediates. */
14842 static bool
14843 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14844 simd_immediate_info *info,
14845 enum simd_immediate_check which,
14846 simd_immediate_info::insn_type insn)
14847 {
14848 /* Try a 4-byte immediate with LSL. */
14849 for (unsigned int shift = 0; shift < 32; shift += 8)
14850 if ((val32 & (0xff << shift)) == val32)
14851 {
14852 if (info)
14853 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14854 simd_immediate_info::LSL, shift);
14855 return true;
14856 }
14857
14858 /* Try a 2-byte immediate with LSL. */
14859 unsigned int imm16 = val32 & 0xffff;
14860 if (imm16 == (val32 >> 16))
14861 for (unsigned int shift = 0; shift < 16; shift += 8)
14862 if ((imm16 & (0xff << shift)) == imm16)
14863 {
14864 if (info)
14865 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14866 simd_immediate_info::LSL, shift);
14867 return true;
14868 }
14869
14870 /* Try a 4-byte immediate with MSL, except for cases that MVN
14871 can handle. */
14872 if (which == AARCH64_CHECK_MOV)
14873 for (unsigned int shift = 8; shift < 24; shift += 8)
14874 {
14875 unsigned int low = (1 << shift) - 1;
14876 if (((val32 & (0xff << shift)) | low) == val32)
14877 {
14878 if (info)
14879 *info = simd_immediate_info (SImode, val32 >> shift, insn,
14880 simd_immediate_info::MSL, shift);
14881 return true;
14882 }
14883 }
14884
14885 return false;
14886 }
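/* Some examples for AARCH64_CHECK_MOV: VAL32 == 0x00ab0000 matches the
   4-byte form (value 0xab, LSL #16); VAL32 == 0x00cc00cc matches the
   2-byte form (value 0xcc, LSL #0); and VAL32 == 0x0000abff matches the
   MSL form (value 0xab, MSL #8), where the shifted-in low bits are
   ones.  */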
14887
14888 /* Return true if replicating VAL64 is a valid immediate for the
14889 Advanced SIMD operation described by WHICH. If INFO is nonnull,
14890 use it to describe valid immediates. */
14891 static bool
14892 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14893 simd_immediate_info *info,
14894 enum simd_immediate_check which)
14895 {
14896 unsigned int val32 = val64 & 0xffffffff;
14897 unsigned int val16 = val64 & 0xffff;
14898 unsigned int val8 = val64 & 0xff;
14899
14900 if (val32 == (val64 >> 32))
14901 {
14902 if ((which & AARCH64_CHECK_ORR) != 0
14903 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14904 simd_immediate_info::MOV))
14905 return true;
14906
14907 if ((which & AARCH64_CHECK_BIC) != 0
14908 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14909 simd_immediate_info::MVN))
14910 return true;
14911
14912 /* Try using a replicated byte. */
14913 if (which == AARCH64_CHECK_MOV
14914 && val16 == (val32 >> 16)
14915 && val8 == (val16 >> 8))
14916 {
14917 if (info)
14918 *info = simd_immediate_info (QImode, val8);
14919 return true;
14920 }
14921 }
14922
14923 /* Try using a bit-to-bytemask. */
14924 if (which == AARCH64_CHECK_MOV)
14925 {
14926 unsigned int i;
14927 for (i = 0; i < 64; i += 8)
14928 {
14929 unsigned char byte = (val64 >> i) & 0xff;
14930 if (byte != 0 && byte != 0xff)
14931 break;
14932 }
14933 if (i == 64)
14934 {
14935 if (info)
14936 *info = simd_immediate_info (DImode, val64);
14937 return true;
14938 }
14939 }
14940 return false;
14941 }
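/* For instance, a replicated byte such as 0x4242424242424242 is accepted
   as a QImode immediate when checking a full MOV, and a value such as
   0x00ff00ffff0000ff, in which every byte is either 0x00 or 0xff, is
   accepted by the final bit-to-bytemask test as a 64-bit immediate.  */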
14942
14943 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14944 instruction. If INFO is nonnull, use it to describe valid immediates. */
14945
14946 static bool
14947 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14948 simd_immediate_info *info)
14949 {
14950 scalar_int_mode mode = DImode;
14951 unsigned int val32 = val64 & 0xffffffff;
14952 if (val32 == (val64 >> 32))
14953 {
14954 mode = SImode;
14955 unsigned int val16 = val32 & 0xffff;
14956 if (val16 == (val32 >> 16))
14957 {
14958 mode = HImode;
14959 unsigned int val8 = val16 & 0xff;
14960 if (val8 == (val16 >> 8))
14961 mode = QImode;
14962 }
14963 }
14964 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14965 if (IN_RANGE (val, -0x80, 0x7f))
14966 {
14967 /* DUP with no shift. */
14968 if (info)
14969 *info = simd_immediate_info (mode, val);
14970 return true;
14971 }
14972 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14973 {
14974 /* DUP with LSL #8. */
14975 if (info)
14976 *info = simd_immediate_info (mode, val);
14977 return true;
14978 }
14979 if (aarch64_bitmask_imm (val64, mode))
14980 {
14981 /* DUPM. */
14982 if (info)
14983 *info = simd_immediate_info (mode, val);
14984 return true;
14985 }
14986 return false;
14987 }
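/* Examples: replicating 0x25 narrows to QImode and gives a DUP of #0x25;
   replicating 0x1200 narrows to HImode and is accepted as a DUP with
   LSL #8; values that fail both DUP forms can still be accepted as DUPM
   bitmask immediates by the final test.  */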
14988
14989 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
14990 it to describe valid immediates. */
14991
14992 static bool
14993 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
14994 {
14995 if (x == CONST0_RTX (GET_MODE (x)))
14996 {
14997 if (info)
14998 *info = simd_immediate_info (DImode, 0);
14999 return true;
15000 }
15001
15002 /* Analyze the value as a VNx16BImode. This should be relatively
15003 efficient, since rtx_vector_builder has enough built-in capacity
15004 to store all VLA predicate constants without needing the heap. */
15005 rtx_vector_builder builder;
15006 if (!aarch64_get_sve_pred_bits (builder, x))
15007 return false;
15008
15009 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15010 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15011 {
15012 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15013 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15014 if (pattern != AARCH64_NUM_SVPATTERNS)
15015 {
15016 if (info)
15017 {
15018 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15019 *info = simd_immediate_info (int_mode, pattern);
15020 }
15021 return true;
15022 }
15023 }
15024 return false;
15025 }
15026
15027 /* Return true if OP is a valid SIMD immediate for the operation
15028 described by WHICH. If INFO is nonnull, use it to describe valid
15029 immediates. */
15030 bool
15031 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15032 enum simd_immediate_check which)
15033 {
15034 machine_mode mode = GET_MODE (op);
15035 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15036 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15037 return false;
15038
15039 if (vec_flags & VEC_SVE_PRED)
15040 return aarch64_sve_pred_valid_immediate (op, info);
15041
15042 scalar_mode elt_mode = GET_MODE_INNER (mode);
15043 rtx base, step;
15044 unsigned int n_elts;
15045 if (GET_CODE (op) == CONST_VECTOR
15046 && CONST_VECTOR_DUPLICATE_P (op))
15047 n_elts = CONST_VECTOR_NPATTERNS (op);
15048 else if ((vec_flags & VEC_SVE_DATA)
15049 && const_vec_series_p (op, &base, &step))
15050 {
15051 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15052 if (!aarch64_sve_index_immediate_p (base)
15053 || !aarch64_sve_index_immediate_p (step))
15054 return false;
15055
15056 if (info)
15057 *info = simd_immediate_info (elt_mode, base, step);
15058 return true;
15059 }
15060 else if (GET_CODE (op) == CONST_VECTOR
15061 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15062 /* N_ELTS set above. */;
15063 else
15064 return false;
15065
15066 scalar_float_mode elt_float_mode;
15067 if (n_elts == 1
15068 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15069 {
15070 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15071 if (aarch64_float_const_zero_rtx_p (elt)
15072 || aarch64_float_const_representable_p (elt))
15073 {
15074 if (info)
15075 *info = simd_immediate_info (elt_float_mode, elt);
15076 return true;
15077 }
15078 }
15079
15080 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15081 if (elt_size > 8)
15082 return false;
15083
15084 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15085
15086 /* Expand the vector constant out into a byte vector, with the least
15087 significant byte of the register first. */
15088 auto_vec<unsigned char, 16> bytes;
15089 bytes.reserve (n_elts * elt_size);
15090 for (unsigned int i = 0; i < n_elts; i++)
15091 {
15092 /* The vector is provided in gcc endian-neutral fashion.
15093 For aarch64_be Advanced SIMD, it must be laid out in the vector
15094 register in reverse order. */
15095 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15096 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15097
15098 if (elt_mode != elt_int_mode)
15099 elt = gen_lowpart (elt_int_mode, elt);
15100
15101 if (!CONST_INT_P (elt))
15102 return false;
15103
15104 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15105 for (unsigned int byte = 0; byte < elt_size; byte++)
15106 {
15107 bytes.quick_push (elt_val & 0xff);
15108 elt_val >>= BITS_PER_UNIT;
15109 }
15110 }
15111
15112 /* The immediate must repeat every eight bytes. */
15113 unsigned int nbytes = bytes.length ();
15114 for (unsigned i = 8; i < nbytes; ++i)
15115 if (bytes[i] != bytes[i - 8])
15116 return false;
15117
15118 /* Get the repeating 8-byte value as an integer. No endian correction
15119 is needed here because bytes is already in lsb-first order. */
15120 unsigned HOST_WIDE_INT val64 = 0;
15121 for (unsigned int i = 0; i < 8; i++)
15122 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15123 << (i * BITS_PER_UNIT));
15124
15125 if (vec_flags & VEC_SVE_DATA)
15126 return aarch64_sve_valid_immediate (val64, info);
15127 else
15128 return aarch64_advsimd_valid_immediate (val64, info, which);
15129 }
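/* As an example of the byte expansion above: a V8HImode constant with
   every element equal to 0x1234 serializes (least significant byte first)
   to the byte pattern 34 12 34 12 ..., giving VAL64 == 0x1234123412341234,
   which is then tested against the SVE or Advanced SIMD rules depending
   on the vector mode.  */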
15130
15131 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15132 has a step in the range of INDEX. Return the index expression if so,
15133 otherwise return null. */
15134 rtx
15135 aarch64_check_zero_based_sve_index_immediate (rtx x)
15136 {
15137 rtx base, step;
15138 if (const_vec_series_p (x, &base, &step)
15139 && base == const0_rtx
15140 && aarch64_sve_index_immediate_p (step))
15141 return step;
15142 return NULL_RTX;
15143 }
15144
15145 /* Check if immediate shift constants are within range. */
15146 bool
15147 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15148 {
15149 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15150 if (left)
15151 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15152 else
15153 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15154 }
15155
15156 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15157 operation of width WIDTH at bit position POS. */
15158
15159 rtx
15160 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15161 {
15162 gcc_assert (CONST_INT_P (width));
15163 gcc_assert (CONST_INT_P (pos));
15164
15165 unsigned HOST_WIDE_INT mask
15166 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15167 return GEN_INT (mask << UINTVAL (pos));
15168 }
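/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000.  */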
15169
15170 bool
15171 aarch64_mov_operand_p (rtx x, machine_mode mode)
15172 {
15173 if (GET_CODE (x) == HIGH
15174 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15175 return true;
15176
15177 if (CONST_INT_P (x))
15178 return true;
15179
15180 if (VECTOR_MODE_P (GET_MODE (x)))
15181 return aarch64_simd_valid_immediate (x, NULL);
15182
15183 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15184 return true;
15185
15186 if (aarch64_sve_cnt_immediate_p (x))
15187 return true;
15188
15189 return aarch64_classify_symbolic_expression (x)
15190 == SYMBOL_TINY_ABSOLUTE;
15191 }
15192
15193 /* Return a const_int vector of VAL. */
15194 rtx
15195 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15196 {
15197 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15198 return gen_const_vec_duplicate (mode, c);
15199 }
15200
15201 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15202
15203 bool
15204 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15205 {
15206 machine_mode vmode;
15207
15208 vmode = aarch64_simd_container_mode (mode, 64);
15209 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15210 return aarch64_simd_valid_immediate (op_v, NULL);
15211 }
15212
15213 /* Construct and return a PARALLEL RTX vector with elements numbering the
15214 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15215 the vector - from the perspective of the architecture. This does not
15216 line up with GCC's perspective on lane numbers, so we end up with
15217 different masks depending on our target endian-ness. The diagram
15218 below may help. We must draw the distinction when building masks
15219 which select one half of the vector. An instruction selecting
15220 architectural low-lanes for a big-endian target must be described using
15221 a mask selecting GCC high-lanes.
15222
15223 Big-Endian Little-Endian
15224
15225 GCC 0 1 2 3 3 2 1 0
15226 | x | x | x | x | | x | x | x | x |
15227 Architecture 3 2 1 0 3 2 1 0
15228
15229 Low Mask: { 2, 3 } { 0, 1 }
15230 High Mask: { 0, 1 } { 2, 3 }
15231
15232 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15233
15234 rtx
15235 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15236 {
15237 rtvec v = rtvec_alloc (nunits / 2);
15238 int high_base = nunits / 2;
15239 int low_base = 0;
15240 int base;
15241 rtx t1;
15242 int i;
15243
15244 if (BYTES_BIG_ENDIAN)
15245 base = high ? low_base : high_base;
15246 else
15247 base = high ? high_base : low_base;
15248
15249 for (i = 0; i < nunits / 2; i++)
15250 RTVEC_ELT (v, i) = GEN_INT (base + i);
15251
15252 t1 = gen_rtx_PARALLEL (mode, v);
15253 return t1;
15254 }
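/* For V4SImode (NUNITS == 4) this returns (parallel [0 1]) for the low
   half and (parallel [2 3]) for the high half on little-endian, with the
   two results swapped on big-endian, matching the table above.  */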
15255
15256 /* Check OP for validity as a PARALLEL RTX vector with elements
15257 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15258 from the perspective of the architecture. See the diagram above
15259 aarch64_simd_vect_par_cnst_half for more details. */
15260
15261 bool
15262 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15263 bool high)
15264 {
15265 int nelts;
15266 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15267 return false;
15268
15269 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15270 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15271 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15272 int i = 0;
15273
15274 if (count_op != count_ideal)
15275 return false;
15276
15277 for (i = 0; i < count_ideal; i++)
15278 {
15279 rtx elt_op = XVECEXP (op, 0, i);
15280 rtx elt_ideal = XVECEXP (ideal, 0, i);
15281
15282 if (!CONST_INT_P (elt_op)
15283 || INTVAL (elt_ideal) != INTVAL (elt_op))
15284 return false;
15285 }
15286 return true;
15287 }
15288
15289 /* Return a PARALLEL containing NELTS elements, with element I equal
15290 to BASE + I * STEP. */
15291
15292 rtx
15293 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15294 {
15295 rtvec vec = rtvec_alloc (nelts);
15296 for (unsigned int i = 0; i < nelts; ++i)
15297 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15298 return gen_rtx_PARALLEL (VOIDmode, vec);
15299 }
15300
15301 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15302 series with step STEP. */
15303
15304 bool
15305 aarch64_stepped_int_parallel_p (rtx op, int step)
15306 {
15307 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15308 return false;
15309
15310 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15311 for (int i = 1; i < XVECLEN (op, 0); ++i)
15312 if (!CONST_INT_P (XVECEXP (op, 0, i))
15313 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15314 return false;
15315
15316 return true;
15317 }
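/* For example, aarch64_gen_stepped_int_parallel (4, 1, 2) produces
   (parallel [1 3 5 7]), and aarch64_stepped_int_parallel_p accepts that
   rtx for STEP == 2 but rejects it for any other step.  */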
15318
15319 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15320 HIGH (exclusive). */
15321 void
15322 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15323 const_tree exp)
15324 {
15325 HOST_WIDE_INT lane;
15326 gcc_assert (CONST_INT_P (operand));
15327 lane = INTVAL (operand);
15328
15329 if (lane < low || lane >= high)
15330 {
15331 if (exp)
15332 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15333 else
15334 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15335 }
15336 }
15337
15338 /* Perform endian correction on lane number N, which indexes a vector
15339 of mode MODE, and return the result as an SImode rtx. */
15340
15341 rtx
15342 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15343 {
15344 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15345 }
15346
15347 /* Return TRUE if OP is a valid vector addressing mode. */
15348
15349 bool
15350 aarch64_simd_mem_operand_p (rtx op)
15351 {
15352 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15353 || REG_P (XEXP (op, 0)));
15354 }
15355
15356 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15357
15358 bool
15359 aarch64_sve_ld1r_operand_p (rtx op)
15360 {
15361 struct aarch64_address_info addr;
15362 scalar_mode mode;
15363
15364 return (MEM_P (op)
15365 && is_a <scalar_mode> (GET_MODE (op), &mode)
15366 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15367 && addr.type == ADDRESS_REG_IMM
15368 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15369 }
15370
15371 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15372 bool
15373 aarch64_sve_ld1rq_operand_p (rtx op)
15374 {
15375 struct aarch64_address_info addr;
15376 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15377 if (!MEM_P (op)
15378 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15379 return false;
15380
15381 if (addr.type == ADDRESS_REG_IMM)
15382 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15383
15384 if (addr.type == ADDRESS_REG_REG)
15385 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15386
15387 return false;
15388 }
15389
15390 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15391 The conditions for STR are the same. */
15392 bool
15393 aarch64_sve_ldr_operand_p (rtx op)
15394 {
15395 struct aarch64_address_info addr;
15396
15397 return (MEM_P (op)
15398 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15399 false, ADDR_QUERY_ANY)
15400 && addr.type == ADDRESS_REG_IMM);
15401 }
15402
15403 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15404 We need to be able to access the individual pieces, so the range
15405 is different from LD[234] and ST[234]. */
15406 bool
15407 aarch64_sve_struct_memory_operand_p (rtx op)
15408 {
15409 if (!MEM_P (op))
15410 return false;
15411
15412 machine_mode mode = GET_MODE (op);
15413 struct aarch64_address_info addr;
15414 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15415 ADDR_QUERY_ANY)
15416 || addr.type != ADDRESS_REG_IMM)
15417 return false;
15418
15419 poly_int64 first = addr.const_offset;
15420 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15421 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15422 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15423 }
15424
15425 /* Emit a register copy from operand to operand, taking care not to
15426 early-clobber source registers in the process.
15427
15428 COUNT is the number of components into which the copy needs to be
15429 decomposed. */
15430 void
15431 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15432 unsigned int count)
15433 {
15434 unsigned int i;
15435 int rdest = REGNO (operands[0]);
15436 int rsrc = REGNO (operands[1]);
15437
15438 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15439 || rdest < rsrc)
15440 for (i = 0; i < count; i++)
15441 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15442 gen_rtx_REG (mode, rsrc + i));
15443 else
15444 for (i = 0; i < count; i++)
15445 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15446 gen_rtx_REG (mode, rsrc + count - i - 1));
15447 }
15448
15449 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15450 one of the VSTRUCT modes: OI, CI, or XI. */
15451 int
15452 aarch64_simd_attr_length_rglist (machine_mode mode)
15453 {
15454 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15455 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15456 }
15457
15458 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15459 alignment of a vector to 128 bits. SVE predicates have an alignment of
15460 16 bits. */
15461 static HOST_WIDE_INT
15462 aarch64_simd_vector_alignment (const_tree type)
15463 {
15464 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15465 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15466 be set for non-predicate vectors of booleans. Modes are the most
15467 direct way we have of identifying real SVE predicate types. */
15468 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15469 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15470 }
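/* So, for example, int32x2_t is aligned to 64 bits, int32x4_t and any
   wider fixed-length GNU vector to 128 bits, a variable-length SVE data
   vector to 128 bits, and an SVE predicate to 16 bits.  */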
15471
15472 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15473 static poly_uint64
15474 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15475 {
15476 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15477 {
15478 /* If the length of the vector is fixed, try to align to that length,
15479 otherwise don't try to align at all. */
15480 HOST_WIDE_INT result;
15481 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15482 result = TYPE_ALIGN (TREE_TYPE (type));
15483 return result;
15484 }
15485 return TYPE_ALIGN (type);
15486 }
15487
15488 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15489 static bool
15490 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15491 {
15492 if (is_packed)
15493 return false;
15494
15495 /* For fixed-length vectors, check that the vectorizer will aim for
15496 full-vector alignment. This isn't true for generic GCC vectors
15497 that are wider than the ABI maximum of 128 bits. */
15498 poly_uint64 preferred_alignment =
15499 aarch64_vectorize_preferred_vector_alignment (type);
15500 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15501 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15502 preferred_alignment))
15503 return false;
15504
15505 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15506 return true;
15507 }
15508
15509 /* Return true if the vector misalignment factor is supported by the
15510 target. */
15511 static bool
15512 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15513 const_tree type, int misalignment,
15514 bool is_packed)
15515 {
15516 if (TARGET_SIMD && STRICT_ALIGNMENT)
15517 {
15518 /* Return if movmisalign pattern is not supported for this mode. */
15519 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15520 return false;
15521
15522 /* Misalignment factor is unknown at compile time. */
15523 if (misalignment == -1)
15524 return false;
15525 }
15526 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15527 is_packed);
15528 }
15529
15530 /* If VALS is a vector constant that can be loaded into a register
15531 using DUP, generate instructions to do so and return an RTX to
15532 assign to the register. Otherwise return NULL_RTX. */
15533 static rtx
15534 aarch64_simd_dup_constant (rtx vals)
15535 {
15536 machine_mode mode = GET_MODE (vals);
15537 machine_mode inner_mode = GET_MODE_INNER (mode);
15538 rtx x;
15539
15540 if (!const_vec_duplicate_p (vals, &x))
15541 return NULL_RTX;
15542
15543 /* We can load this constant by using DUP and a constant in a
15544 single general-purpose register. This will be cheaper than a vector
15545 load. */
15546 x = copy_to_mode_reg (inner_mode, x);
15547 return gen_vec_duplicate (mode, x);
15548 }
15549
15550
15551 /* Generate code to load VALS, which is a PARALLEL containing only
15552 constants (for vec_init) or CONST_VECTOR, efficiently into a
15553 register. Returns an RTX to copy into the register, or NULL_RTX
15554 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15555 static rtx
15556 aarch64_simd_make_constant (rtx vals)
15557 {
15558 machine_mode mode = GET_MODE (vals);
15559 rtx const_dup;
15560 rtx const_vec = NULL_RTX;
15561 int n_const = 0;
15562 int i;
15563
15564 if (GET_CODE (vals) == CONST_VECTOR)
15565 const_vec = vals;
15566 else if (GET_CODE (vals) == PARALLEL)
15567 {
15568 /* A CONST_VECTOR must contain only CONST_INTs and
15569 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15570 Only store valid constants in a CONST_VECTOR. */
15571 int n_elts = XVECLEN (vals, 0);
15572 for (i = 0; i < n_elts; ++i)
15573 {
15574 rtx x = XVECEXP (vals, 0, i);
15575 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15576 n_const++;
15577 }
15578 if (n_const == n_elts)
15579 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15580 }
15581 else
15582 gcc_unreachable ();
15583
15584 if (const_vec != NULL_RTX
15585 && aarch64_simd_valid_immediate (const_vec, NULL))
15586 /* Load using MOVI/MVNI. */
15587 return const_vec;
15588 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15589 /* Loaded using DUP. */
15590 return const_dup;
15591 else if (const_vec != NULL_RTX)
15592 /* Load from constant pool. We cannot take advantage of single-cycle
15593 LD1 because we need a PC-relative addressing mode. */
15594 return const_vec;
15595 else
15596 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15597 We cannot construct an initializer. */
15598 return NULL_RTX;
15599 }
15600
15601 /* Expand a vector initialisation sequence, such that TARGET is
15602 initialised to contain VALS. */
15603
15604 void
15605 aarch64_expand_vector_init (rtx target, rtx vals)
15606 {
15607 machine_mode mode = GET_MODE (target);
15608 scalar_mode inner_mode = GET_MODE_INNER (mode);
15609 /* The number of vector elements. */
15610 int n_elts = XVECLEN (vals, 0);
15611 /* The number of vector elements which are not constant. */
15612 int n_var = 0;
15613 rtx any_const = NULL_RTX;
15614 /* The first element of vals. */
15615 rtx v0 = XVECEXP (vals, 0, 0);
15616 bool all_same = true;
15617
15618 /* This is a special vec_init<M><N> where N is not an element mode but a
15619 vector mode with half the elements of M. We expect to find two entries
15620 of mode N in VALS and we must put their concatenation into TARGET. */
15621 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15622 {
15623 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15624 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15625 rtx lo = XVECEXP (vals, 0, 0);
15626 rtx hi = XVECEXP (vals, 0, 1);
15627 machine_mode narrow_mode = GET_MODE (lo);
15628 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15629 gcc_assert (narrow_mode == GET_MODE (hi));
15630
15631 /* When we want to concatenate a half-width vector with zeroes we can
15632 use the aarch64_combinez[_be] patterns. Just make sure that the
15633 zeroes are in the right half. */
15634 if (BYTES_BIG_ENDIAN
15635 && aarch64_simd_imm_zero (lo, narrow_mode)
15636 && general_operand (hi, narrow_mode))
15637 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15638 else if (!BYTES_BIG_ENDIAN
15639 && aarch64_simd_imm_zero (hi, narrow_mode)
15640 && general_operand (lo, narrow_mode))
15641 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15642 else
15643 {
15644 /* Else create the two half-width registers and combine them. */
15645 if (!REG_P (lo))
15646 lo = force_reg (GET_MODE (lo), lo);
15647 if (!REG_P (hi))
15648 hi = force_reg (GET_MODE (hi), hi);
15649
15650 if (BYTES_BIG_ENDIAN)
15651 std::swap (lo, hi);
15652 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15653 }
15654 return;
15655 }
15656
15657 /* Count the number of variable elements to initialise. */
15658 for (int i = 0; i < n_elts; ++i)
15659 {
15660 rtx x = XVECEXP (vals, 0, i);
15661 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15662 ++n_var;
15663 else
15664 any_const = x;
15665
15666 all_same &= rtx_equal_p (x, v0);
15667 }
15668
15669 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15670 how best to handle this. */
15671 if (n_var == 0)
15672 {
15673 rtx constant = aarch64_simd_make_constant (vals);
15674 if (constant != NULL_RTX)
15675 {
15676 emit_move_insn (target, constant);
15677 return;
15678 }
15679 }
15680
15681 /* Splat a single non-constant element if we can. */
15682 if (all_same)
15683 {
15684 rtx x = copy_to_mode_reg (inner_mode, v0);
15685 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15686 return;
15687 }
15688
15689 enum insn_code icode = optab_handler (vec_set_optab, mode);
15690 gcc_assert (icode != CODE_FOR_nothing);
15691
15692 /* If there are only variable elements, try to optimize
15693 the insertion using dup for the most common element
15694 followed by insertions. */
15695
15696 /* The algorithm will fill matches[*][0] with the earliest matching element,
15697 and matches[X][1] with the count of duplicate elements (if X is the
15698 earliest element which has duplicates). */
15699
15700 if (n_var == n_elts && n_elts <= 16)
15701 {
15702 int matches[16][2] = {0};
15703 for (int i = 0; i < n_elts; i++)
15704 {
15705 for (int j = 0; j <= i; j++)
15706 {
15707 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15708 {
15709 matches[i][0] = j;
15710 matches[j][1]++;
15711 break;
15712 }
15713 }
15714 }
15715 int maxelement = 0;
15716 int maxv = 0;
15717 for (int i = 0; i < n_elts; i++)
15718 if (matches[i][1] > maxv)
15719 {
15720 maxelement = i;
15721 maxv = matches[i][1];
15722 }
15723
15724 /* Create a duplicate of the most common element, unless all elements
15725 are equally useless to us, in which case just immediately set the
15726 vector register using the first element. */
15727
15728 if (maxv == 1)
15729 {
15730 /* For vectors of two 64-bit elements, we can do even better. */
15731 if (n_elts == 2
15732 && (inner_mode == E_DImode
15733 || inner_mode == E_DFmode))
15734
15735 {
15736 rtx x0 = XVECEXP (vals, 0, 0);
15737 rtx x1 = XVECEXP (vals, 0, 1);
15738 /* Combine can pick up this case, but handling it directly
15739 here leaves clearer RTL.
15740
15741 This is load_pair_lanes<mode>, and also gives us a clean-up
15742 for store_pair_lanes<mode>. */
15743 if (memory_operand (x0, inner_mode)
15744 && memory_operand (x1, inner_mode)
15745 && !STRICT_ALIGNMENT
15746 && rtx_equal_p (XEXP (x1, 0),
15747 plus_constant (Pmode,
15748 XEXP (x0, 0),
15749 GET_MODE_SIZE (inner_mode))))
15750 {
15751 rtx t;
15752 if (inner_mode == DFmode)
15753 t = gen_load_pair_lanesdf (target, x0, x1);
15754 else
15755 t = gen_load_pair_lanesdi (target, x0, x1);
15756 emit_insn (t);
15757 return;
15758 }
15759 }
15760 /* The subreg-move sequence below will move into lane zero of the
15761 vector register. For big-endian we want that position to hold
15762 the last element of VALS. */
15763 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15764 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15765 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15766 }
15767 else
15768 {
15769 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15770 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15771 }
15772
15773 /* Insert the rest. */
15774 for (int i = 0; i < n_elts; i++)
15775 {
15776 rtx x = XVECEXP (vals, 0, i);
15777 if (matches[i][0] == maxelement)
15778 continue;
15779 x = copy_to_mode_reg (inner_mode, x);
15780 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15781 }
15782 return;
15783 }
15784
15785 /* Initialise a vector which is part-variable. We want to first try
15786 to build those lanes which are constant in the most efficient way we
15787 can. */
15788 if (n_var != n_elts)
15789 {
15790 rtx copy = copy_rtx (vals);
15791
15792 /* Load constant part of vector. We really don't care what goes into the
15793 parts we will overwrite, but we're more likely to be able to load the
15794 constant efficiently if it has fewer, larger, repeating parts
15795 (see aarch64_simd_valid_immediate). */
15796 for (int i = 0; i < n_elts; i++)
15797 {
15798 rtx x = XVECEXP (vals, 0, i);
15799 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15800 continue;
15801 rtx subst = any_const;
15802 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15803 {
15804 /* Look in the copied vector, as more elements are const. */
15805 rtx test = XVECEXP (copy, 0, i ^ bit);
15806 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15807 {
15808 subst = test;
15809 break;
15810 }
15811 }
15812 XVECEXP (copy, 0, i) = subst;
15813 }
15814 aarch64_expand_vector_init (target, copy);
15815 }
15816
15817 /* Insert the variable lanes directly. */
15818 for (int i = 0; i < n_elts; i++)
15819 {
15820 rtx x = XVECEXP (vals, 0, i);
15821 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15822 continue;
15823 x = copy_to_mode_reg (inner_mode, x);
15824 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15825 }
15826 }
15827
15828 /* Emit RTL corresponding to:
15829 insr TARGET, ELEM. */
15830
15831 static void
15832 emit_insr (rtx target, rtx elem)
15833 {
15834 machine_mode mode = GET_MODE (target);
15835 scalar_mode elem_mode = GET_MODE_INNER (mode);
15836 elem = force_reg (elem_mode, elem);
15837
15838 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15839 gcc_assert (icode != CODE_FOR_nothing);
15840 emit_insn (GEN_FCN (icode) (target, target, elem));
15841 }
15842
15843 /* Subroutine of aarch64_sve_expand_vector_init for handling
15844 trailing constants.
15845 This function works as follows:
15846 (a) Create a new vector consisting of trailing constants.
15847 (b) Initialize TARGET with the constant vector using emit_move_insn.
15848 (c) Insert remaining elements in TARGET using insr.
15849 NELTS is the total number of elements in the original vector, while
15850 NELTS_REQD is the number of elements that are actually
15851 significant.
15852
15853 ??? The heuristic used is to do the above only if the number of constants
15854 is at least half the total number of elements. May need fine-tuning. */
15855
15856 static bool
15857 aarch64_sve_expand_vector_init_handle_trailing_constants
15858 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
15859 {
15860 machine_mode mode = GET_MODE (target);
15861 scalar_mode elem_mode = GET_MODE_INNER (mode);
15862 int n_trailing_constants = 0;
15863
15864 for (int i = nelts_reqd - 1;
15865 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
15866 i--)
15867 n_trailing_constants++;
15868
15869 if (n_trailing_constants >= nelts_reqd / 2)
15870 {
15871 rtx_vector_builder v (mode, 1, nelts);
15872 for (int i = 0; i < nelts; i++)
15873 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
15874 rtx const_vec = v.build ();
15875 emit_move_insn (target, const_vec);
15876
15877 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
15878 emit_insr (target, builder.elt (i));
15879
15880 return true;
15881 }
15882
15883 return false;
15884 }
15885
15886 /* Subroutine of aarch64_sve_expand_vector_init.
15887 Works as follows:
15888 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
15889 (b) Skip trailing elements from BUILDER, which are the same as
15890 element NELTS_REQD - 1.
15891 (c) Insert earlier elements in reverse order in TARGET using insr. */
15892
15893 static void
15894 aarch64_sve_expand_vector_init_insert_elems (rtx target,
15895 const rtx_vector_builder &builder,
15896 int nelts_reqd)
15897 {
15898 machine_mode mode = GET_MODE (target);
15899 scalar_mode elem_mode = GET_MODE_INNER (mode);
15900
15901 struct expand_operand ops[2];
15902 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
15903 gcc_assert (icode != CODE_FOR_nothing);
15904
15905 create_output_operand (&ops[0], target, mode);
15906 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
15907 expand_insn (icode, 2, ops);
15908
15909 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15910 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
15911 emit_insr (target, builder.elt (i));
15912 }
15913
15914 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
15915 when all trailing elements of BUILDER are the same.
15916 This works as follows:
15917 (a) Use expand_insn interface to broadcast last vector element in TARGET.
15918 (b) Insert remaining elements in TARGET using insr.
15919
15920 ??? The heuristic used is to do the above if the number of identical trailing
15921 elements is at least 3/4 of the total number of elements, loosely based on
15922 the heuristic from mostly_zeros_p. May need fine-tuning. */
15923
15924 static bool
15925 aarch64_sve_expand_vector_init_handle_trailing_same_elem
15926 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
15927 {
15928 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
15929 if (ndups >= (3 * nelts_reqd) / 4)
15930 {
15931 aarch64_sve_expand_vector_init_insert_elems (target, builder,
15932 nelts_reqd - ndups + 1);
15933 return true;
15934 }
15935
15936 return false;
15937 }
15938
15939 /* Initialize register TARGET from BUILDER. NELTS is the constant number
15940 of elements in BUILDER.
15941
15942 The function tries to initialize TARGET from BUILDER if it fits one
15943 of the special cases outlined below.
15944
15945 Failing that, the function divides BUILDER into two sub-vectors:
15946 v_even = even elements of BUILDER;
15947 v_odd = odd elements of BUILDER;
15948
15949 and recursively calls itself with v_even and v_odd.
15950
15951 if (recursive call succeeded for v_even or v_odd)
15952 TARGET = zip (v_even, v_odd)
15953
15954 The function returns true if it managed to build TARGET from BUILDER
15955 with one of the special cases, false otherwise.
15956
15957 Example: {a, 1, b, 2, c, 3, d, 4}
15958
15959 The vector gets divided into:
15960 v_even = {a, b, c, d}
15961 v_odd = {1, 2, 3, 4}
15962
15963 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
15964 initializes tmp2 from the constant vector v_odd using emit_move_insn.
15965
15966 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
15967 4 elements, so we construct tmp1 from v_even using insr:
15968 tmp1 = dup(d)
15969 insr tmp1, c
15970 insr tmp1, b
15971 insr tmp1, a
15972
15973 And finally:
15974 TARGET = zip (tmp1, tmp2)
15975 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
15976
15977 static bool
15978 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
15979 int nelts, int nelts_reqd)
15980 {
15981 machine_mode mode = GET_MODE (target);
15982
15983 /* Case 1: Vector contains trailing constants. */
15984
15985 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15986 (target, builder, nelts, nelts_reqd))
15987 return true;
15988
15989 /* Case 2: Vector contains leading constants. */
15990
15991 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
15992 for (int i = 0; i < nelts_reqd; i++)
15993 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
15994 rev_builder.finalize ();
15995
15996 if (aarch64_sve_expand_vector_init_handle_trailing_constants
15997 (target, rev_builder, nelts, nelts_reqd))
15998 {
15999 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16000 return true;
16001 }
16002
16003 /* Case 3: Vector contains trailing same element. */
16004
16005 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16006 (target, builder, nelts_reqd))
16007 return true;
16008
16009 /* Case 4: Vector contains leading same element. */
16010
16011 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16012 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16013 {
16014 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16015 return true;
16016 }
16017
16018 /* Avoid recursing below 4 elements.
16019 ??? The threshold of 4 may need fine-tuning. */
16020
16021 if (nelts_reqd <= 4)
16022 return false;
16023
16024 rtx_vector_builder v_even (mode, 1, nelts);
16025 rtx_vector_builder v_odd (mode, 1, nelts);
16026
16027 for (int i = 0; i < nelts * 2; i += 2)
16028 {
16029 v_even.quick_push (builder.elt (i));
16030 v_odd.quick_push (builder.elt (i + 1));
16031 }
16032
16033 v_even.finalize ();
16034 v_odd.finalize ();
16035
16036 rtx tmp1 = gen_reg_rtx (mode);
16037 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16038 nelts, nelts_reqd / 2);
16039
16040 rtx tmp2 = gen_reg_rtx (mode);
16041 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16042 nelts, nelts_reqd / 2);
16043
16044 if (!did_even_p && !did_odd_p)
16045 return false;
16046
16047 /* For whichever half did not match any of the special cases, initialize
16048 the corresponding temporary using INSR, then zip the two halves. */
16049
16050 if (!did_even_p)
16051 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16052
16053 if (!did_odd_p)
16054 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16055
16056 rtvec v = gen_rtvec (2, tmp1, tmp2);
16057 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16058 return true;
16059 }
16060
16061 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16062
16063 void
16064 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16065 {
16066 machine_mode mode = GET_MODE (target);
16067 int nelts = XVECLEN (vals, 0);
16068
16069 rtx_vector_builder v (mode, 1, nelts);
16070 for (int i = 0; i < nelts; i++)
16071 v.quick_push (XVECEXP (vals, 0, i));
16072 v.finalize ();
16073
16074 /* If neither sub-vector of V could be initialized specially,
16075 then use INSR to insert all elements from V into TARGET.
16076 ??? This might not be optimal for vectors with large
16077 initializers of 16 elements or more.
16078 For nelts < 4, it probably isn't worth handling specially. */
16079
16080 if (nelts < 4
16081 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16082 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16083 }
16084
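/* Implement TARGET_SHIFT_TRUNCATION_MASK.  */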
16085 static unsigned HOST_WIDE_INT
16086 aarch64_shift_truncation_mask (machine_mode mode)
16087 {
16088 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16089 return 0;
16090 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16091 }
16092
16093 /* Select a format to encode pointers in exception handling data. */
16094 int
16095 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16096 {
16097 int type;
16098 switch (aarch64_cmodel)
16099 {
16100 case AARCH64_CMODEL_TINY:
16101 case AARCH64_CMODEL_TINY_PIC:
16102 case AARCH64_CMODEL_SMALL:
16103 case AARCH64_CMODEL_SMALL_PIC:
16104 case AARCH64_CMODEL_SMALL_SPIC:
16105 /* text+got+data < 4GB.  4-byte signed relocs are sufficient
16106 for everything. */
16107 type = DW_EH_PE_sdata4;
16108 break;
16109 default:
16110 /* No assumptions here. 8-byte relocs required. */
16111 type = DW_EH_PE_sdata8;
16112 break;
16113 }
16114 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16115 }
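
/* As a worked example (encodings as defined in include/dwarf2.h): for the
   small code model with GLOBAL set, the result above is
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4
   == 0x80 | 0x10 | 0x0b == 0x9b.  */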
16116
16117 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16118
16119 static void
16120 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16121 {
16122 if (aarch64_simd_decl_p (decl))
16123 {
16124 fprintf (stream, "\t.variant_pcs\t");
16125 assemble_name (stream, name);
16126 fprintf (stream, "\n");
16127 }
16128 }
16129
16130 /* The last .arch and .tune assembly strings that we printed. */
16131 static std::string aarch64_last_printed_arch_string;
16132 static std::string aarch64_last_printed_tune_string;
16133
16134 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16135 by the function fndecl. */
16136
16137 void
16138 aarch64_declare_function_name (FILE *stream, const char* name,
16139 tree fndecl)
16140 {
16141 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16142
16143 struct cl_target_option *targ_options;
16144 if (target_parts)
16145 targ_options = TREE_TARGET_OPTION (target_parts);
16146 else
16147 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16148 gcc_assert (targ_options);
16149
16150 const struct processor *this_arch
16151 = aarch64_get_arch (targ_options->x_explicit_arch);
16152
16153 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16154 std::string extension
16155 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16156 this_arch->flags);
16157 /* Only update the assembler .arch string if it is distinct from the last
16158 such string we printed. */
16159 std::string to_print = this_arch->name + extension;
16160 if (to_print != aarch64_last_printed_arch_string)
16161 {
16162 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16163 aarch64_last_printed_arch_string = to_print;
16164 }
16165
16166 /* Print the cpu name we're tuning for in a comment; it might be
16167 useful to readers of the generated asm.  Do it only when it changes
16168 from function to function and verbose assembly is requested. */
16169 const struct processor *this_tune
16170 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16171
16172 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16173 {
16174 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16175 this_tune->name);
16176 aarch64_last_printed_tune_string = this_tune->name;
16177 }
16178
16179 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16180
16181 /* Don't forget the type directive for ELF. */
16182 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16183 ASM_OUTPUT_LABEL (stream, name);
16184 }
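
/* For illustration only, the directives emitted above look roughly like
   this for a function foo (the exact .arch and .tune strings depend on the
   selected architecture, extensions and tuning, and the .tune comment is
   only printed under -dA):

	.arch	armv8.2-a+sve
	//.tune cortex-a75
	.variant_pcs	foo		// only for aarch64_vector_pcs functions
	.type	foo, %function
   foo:  */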
16185
16186 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16187
16188 void
16189 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16190 {
16191 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16192 const char *value = IDENTIFIER_POINTER (target);
16193 aarch64_asm_output_variant_pcs (stream, decl, name);
16194 ASM_OUTPUT_DEF (stream, name, value);
16195 }
16196
16197 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16198 function symbol references. */
16199
16200 void
16201 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16202 {
16203 default_elf_asm_output_external (stream, decl, name);
16204 aarch64_asm_output_variant_pcs (stream, decl, name);
16205 }
16206
16207 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16208 Used to output the .cfi_b_key_frame directive when signing the current
16209 function with the B key. */
16210
16211 void
16212 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16213 {
16214 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16215 && aarch64_ra_sign_key == AARCH64_KEY_B)
16216 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16217 }
16218
16219 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16220
16221 static void
16222 aarch64_start_file (void)
16223 {
16224 struct cl_target_option *default_options
16225 = TREE_TARGET_OPTION (target_option_default_node);
16226
16227 const struct processor *default_arch
16228 = aarch64_get_arch (default_options->x_explicit_arch);
16229 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16230 std::string extension
16231 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16232 default_arch->flags);
16233
16234 aarch64_last_printed_arch_string = default_arch->name + extension;
16235 aarch64_last_printed_tune_string = "";
16236 asm_fprintf (asm_out_file, "\t.arch %s\n",
16237 aarch64_last_printed_arch_string.c_str ());
16238
16239 default_file_start ();
16240 }
16241
16242 /* Emit load exclusive. */
16243
16244 static void
16245 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16246 rtx mem, rtx model_rtx)
16247 {
16248 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16249 }
16250
16251 /* Emit store exclusive. */
16252
16253 static void
16254 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16255 rtx rval, rtx mem, rtx model_rtx)
16256 {
16257 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16258 }
16259
16260 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
16261
16262 static void
16263 aarch64_emit_unlikely_jump (rtx insn)
16264 {
16265 rtx_insn *jump = emit_jump_insn (insn);
16266 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16267 }
16268
16269 /* Expand a compare and swap pattern. */
16270
16271 void
16272 aarch64_expand_compare_and_swap (rtx operands[])
16273 {
16274 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16275 machine_mode mode, r_mode;
16276
16277 bval = operands[0];
16278 rval = operands[1];
16279 mem = operands[2];
16280 oldval = operands[3];
16281 newval = operands[4];
16282 is_weak = operands[5];
16283 mod_s = operands[6];
16284 mod_f = operands[7];
16285 mode = GET_MODE (mem);
16286
16287 /* Normally the succ memory model must be stronger than fail, but in the
16288 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16289 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16290 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16291 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16292 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16293
16294 r_mode = mode;
16295 if (mode == QImode || mode == HImode)
16296 {
16297 r_mode = SImode;
16298 rval = gen_reg_rtx (r_mode);
16299 }
16300
16301 if (TARGET_LSE)
16302 {
16303 /* The CAS insn requires oldval and rval to overlap, but we need to
16304 have a copy of oldval saved across the operation to tell if
16305 the operation was successful. */
16306 if (reg_overlap_mentioned_p (rval, oldval))
16307 rval = copy_to_mode_reg (r_mode, oldval);
16308 else
16309 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16310
16311 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16312 newval, mod_s));
16313 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16314 }
16315 else
16316 {
16317 /* The oldval predicate varies by mode. Test it and force to reg. */
16318 insn_code code = code_for_aarch64_compare_and_swap (mode);
16319 if (!insn_data[code].operand[2].predicate (oldval, mode))
16320 oldval = force_reg (mode, oldval);
16321
16322 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16323 is_weak, mod_s, mod_f));
16324 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16325 }
16326
16327 if (r_mode != mode)
16328 rval = gen_lowpart (mode, rval);
16329 emit_move_insn (operands[1], rval);
16330
16331 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16332 emit_insn (gen_rtx_SET (bval, x));
16333 }
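
/* A rough sketch of the TARGET_LSE path above for a 32-bit SEQ_CST
   __atomic_compare_exchange (register numbers are illustrative only):

	mov	w3, w1			// copy oldval into rval
	casal	w3, w2, [x0]		// atomic compare-and-swap
	cmp	w3, w1			// did memory contain oldval?
	cset	w4, eq			// boolean success result

   Without LSE, the expansion defers to the exclusive-access loop that
   aarch64_split_compare_and_swap produces below.  */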
16334
16335 /* Emit a barrier appropriate for memory model MODEL at the end of a
16336 sequence implementing an atomic operation. */
16337
16338 static void
16339 aarch64_emit_post_barrier (enum memmodel model)
16340 {
16341 const enum memmodel base_model = memmodel_base (model);
16342
16343 if (is_mm_sync (model)
16344 && (base_model == MEMMODEL_ACQUIRE
16345 || base_model == MEMMODEL_ACQ_REL
16346 || base_model == MEMMODEL_SEQ_CST))
16347 {
16348 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16349 }
16350 }
16351
16352 /* Split a compare and swap pattern. */
16353
16354 void
16355 aarch64_split_compare_and_swap (rtx operands[])
16356 {
16357 rtx rval, mem, oldval, newval, scratch;
16358 machine_mode mode;
16359 bool is_weak;
16360 rtx_code_label *label1, *label2;
16361 rtx x, cond;
16362 enum memmodel model;
16363 rtx model_rtx;
16364
16365 rval = operands[0];
16366 mem = operands[1];
16367 oldval = operands[2];
16368 newval = operands[3];
16369 is_weak = (operands[4] != const0_rtx);
16370 model_rtx = operands[5];
16371 scratch = operands[7];
16372 mode = GET_MODE (mem);
16373 model = memmodel_from_int (INTVAL (model_rtx));
16374
16375 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16376 loop:
16377 .label1:
16378 LD[A]XR rval, [mem]
16379 CBNZ rval, .label2
16380 ST[L]XR scratch, newval, [mem]
16381 CBNZ scratch, .label1
16382 .label2:
16383 CMP rval, 0. */
16384 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16385
16386 label1 = NULL;
16387 if (!is_weak)
16388 {
16389 label1 = gen_label_rtx ();
16390 emit_label (label1);
16391 }
16392 label2 = gen_label_rtx ();
16393
16394 /* The initial load can be relaxed for a __sync operation since a final
16395 barrier will be emitted to stop code hoisting. */
16396 if (is_mm_sync (model))
16397 aarch64_emit_load_exclusive (mode, rval, mem,
16398 GEN_INT (MEMMODEL_RELAXED));
16399 else
16400 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16401
16402 if (strong_zero_p)
16403 {
16404 if (aarch64_track_speculation)
16405 {
16406 /* Emit an explicit compare instruction, so that we can correctly
16407 track the condition codes. */
16408 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16409 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16410 }
16411 else
16412 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16413
16414 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16415 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16416 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16417 }
16418 else
16419 {
16420 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16421 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16422 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16423 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16424 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16425 }
16426
16427 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16428
16429 if (!is_weak)
16430 {
16431 if (aarch64_track_speculation)
16432 {
16433 /* Emit an explicit compare instruction, so that we can correctly
16434 track the condition codes. */
16435 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16436 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16437 }
16438 else
16439 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16440
16441 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16442 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16443 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16444 }
16445 else
16446 {
16447 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16448 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16449 emit_insn (gen_rtx_SET (cond, x));
16450 }
16451
16452 emit_label (label2);
16453 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16454 to set the condition flags.  If the result is not used it will be removed
16455 by later passes. */
16456 if (strong_zero_p)
16457 {
16458 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16459 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16460 emit_insn (gen_rtx_SET (cond, x));
16461 }
16462 /* Emit any final barrier needed for a __sync operation. */
16463 if (is_mm_sync (model))
16464 aarch64_emit_post_barrier (model);
16465 }
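
/* For reference, when OLDVAL is not known to be zero the loop split above
   has approximately this shape (same notation as the comment earlier in
   this function):

	.label1:
	LD[A]XR	rval, [mem]
	CMP	rval, oldval
	B.NE	.label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1	// strong version only
	.label2:

   followed by a final barrier for __sync operations.  */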
16466
16467 /* Split an atomic operation. */
16468
16469 void
16470 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16471 rtx value, rtx model_rtx, rtx cond)
16472 {
16473 machine_mode mode = GET_MODE (mem);
16474 machine_mode wmode = (mode == DImode ? DImode : SImode);
16475 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16476 const bool is_sync = is_mm_sync (model);
16477 rtx_code_label *label;
16478 rtx x;
16479
16480 /* Split the atomic operation into a sequence. */
16481 label = gen_label_rtx ();
16482 emit_label (label);
16483
16484 if (new_out)
16485 new_out = gen_lowpart (wmode, new_out);
16486 if (old_out)
16487 old_out = gen_lowpart (wmode, old_out);
16488 else
16489 old_out = new_out;
16490 value = simplify_gen_subreg (wmode, value, mode, 0);
16491
16492 /* The initial load can be relaxed for a __sync operation since a final
16493 barrier will be emitted to stop code hoisting. */
16494 if (is_sync)
16495 aarch64_emit_load_exclusive (mode, old_out, mem,
16496 GEN_INT (MEMMODEL_RELAXED));
16497 else
16498 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16499
16500 switch (code)
16501 {
16502 case SET:
16503 new_out = value;
16504 break;
16505
16506 case NOT:
16507 x = gen_rtx_AND (wmode, old_out, value);
16508 emit_insn (gen_rtx_SET (new_out, x));
16509 x = gen_rtx_NOT (wmode, new_out);
16510 emit_insn (gen_rtx_SET (new_out, x));
16511 break;
16512
16513 case MINUS:
16514 if (CONST_INT_P (value))
16515 {
16516 value = GEN_INT (-INTVAL (value));
16517 code = PLUS;
16518 }
16519 /* Fall through. */
16520
16521 default:
16522 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16523 emit_insn (gen_rtx_SET (new_out, x));
16524 break;
16525 }
16526
16527 aarch64_emit_store_exclusive (mode, cond, mem,
16528 gen_lowpart (mode, new_out), model_rtx);
16529
16530 if (aarch64_track_speculation)
16531 {
16532 /* Emit an explicit compare instruction, so that we can correctly
16533 track the condition codes. */
16534 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16535 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16536 }
16537 else
16538 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16539
16540 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16541 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16542 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16543
16544 /* Emit any final barrier needed for a __sync operation. */
16545 if (is_sync)
16546 aarch64_emit_post_barrier (model);
16547 }
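
/* As an illustration, a 64-bit SEQ_CST __atomic_fetch_add is split into
   roughly the following loop (registers are illustrative only):

	.loop:
	ldaxr	x1, [x0]		// old_out
	add	x2, x1, x3		// new_out = old_out + value
	stlxr	w4, x2, [x0]		// cond = store-exclusive status
	cbnz	w4, .loop  */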
16548
16549 static void
16550 aarch64_init_libfuncs (void)
16551 {
16552 /* Half-precision float operations. The compiler handles all operations
16553 with NULL libfuncs by converting to SFmode. */
16554
16555 /* Conversions. */
16556 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16557 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16558
16559 /* Arithmetic. */
16560 set_optab_libfunc (add_optab, HFmode, NULL);
16561 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16562 set_optab_libfunc (smul_optab, HFmode, NULL);
16563 set_optab_libfunc (neg_optab, HFmode, NULL);
16564 set_optab_libfunc (sub_optab, HFmode, NULL);
16565
16566 /* Comparisons. */
16567 set_optab_libfunc (eq_optab, HFmode, NULL);
16568 set_optab_libfunc (ne_optab, HFmode, NULL);
16569 set_optab_libfunc (lt_optab, HFmode, NULL);
16570 set_optab_libfunc (le_optab, HFmode, NULL);
16571 set_optab_libfunc (ge_optab, HFmode, NULL);
16572 set_optab_libfunc (gt_optab, HFmode, NULL);
16573 set_optab_libfunc (unord_optab, HFmode, NULL);
16574 }
16575
16576 /* Target hook for c_mode_for_suffix. */
16577 static machine_mode
16578 aarch64_c_mode_for_suffix (char suffix)
16579 {
16580 if (suffix == 'q')
16581 return TFmode;
16582
16583 return VOIDmode;
16584 }
16585
16586 /* We can only represent floating-point constants that fit in
16587 "quarter-precision" values.  These values are characterised by
16588 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16589 by:
16590
16591 (-1)^s * (n/16) * 2^r
16592
16593 Where:
16594 's' is the sign bit.
16595 'n' is an integer in the range 16 <= n <= 31.
16596 'r' is an integer in the range -3 <= r <= 4. */
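
/* For example, 1.0 = (16/16) * 2^0, 0.25 = (16/16) * 2^-2 and
   31.0 = (31/16) * 2^4 are representable, whereas 0.0, 32.0 and values
   needing more than four fractional mantissa bits (e.g. 1.03125) are not.  */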
16597
16598 /* Return true iff X can be represented as a quarter-precision
16599 floating point immediate operand.  Note, we cannot represent 0.0. */
16600 bool
16601 aarch64_float_const_representable_p (rtx x)
16602 {
16603 /* This represents our current view of how many bits
16604 make up the mantissa. */
16605 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16606 int exponent;
16607 unsigned HOST_WIDE_INT mantissa, mask;
16608 REAL_VALUE_TYPE r, m;
16609 bool fail;
16610
16611 if (!CONST_DOUBLE_P (x))
16612 return false;
16613
16614 if (GET_MODE (x) == VOIDmode
16615 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16616 return false;
16617
16618 r = *CONST_DOUBLE_REAL_VALUE (x);
16619
16620 /* We cannot represent infinities, NaNs or +/-zero. We won't
16621 know if we have +zero until we analyse the mantissa, but we
16622 can reject the other invalid values. */
16623 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16624 || REAL_VALUE_MINUS_ZERO (r))
16625 return false;
16626
16627 /* Extract exponent. */
16628 r = real_value_abs (&r);
16629 exponent = REAL_EXP (&r);
16630
16631 /* For the mantissa, we expand into two HOST_WIDE_INTs, apart from the
16632 highest (sign) bit, with a fixed binary point at bit point_pos.
16633 The low and high parts of the mantissa end up in the two halves of W.
16634 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16635 bits for the mantissa, this can fail (low bits will be lost). */
16636 real_ldexp (&m, &r, point_pos - exponent);
16637 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16638
16639 /* If the low part of the mantissa has bits set we cannot represent
16640 the value. */
16641 if (w.ulow () != 0)
16642 return false;
16643 /* We have rejected the lower HOST_WIDE_INT, so update our
16644 understanding of how many bits lie in the mantissa and
16645 look only at the high HOST_WIDE_INT. */
16646 mantissa = w.elt (1);
16647 point_pos -= HOST_BITS_PER_WIDE_INT;
16648
16649 /* We can only represent values with a mantissa of the form 1.xxxx. */
16650 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16651 if ((mantissa & mask) != 0)
16652 return false;
16653
16654 /* Having filtered unrepresentable values, we may now remove all
16655 but the highest 5 bits. */
16656 mantissa >>= point_pos - 5;
16657
16658 /* We cannot represent the value 0.0, so reject it. This is handled
16659 elsewhere. */
16660 if (mantissa == 0)
16661 return false;
16662
16663 /* Then, as bit 4 is always set, we can mask it off, leaving
16664 the mantissa in the range [0, 15]. */
16665 mantissa &= ~(1 << 4);
16666 gcc_assert (mantissa <= 15);
16667
16668 /* GCC internally does not use IEEE754-like encoding (where normalized
16669 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
16670 Our mantissa values are shifted 4 places to the left relative to
16671 normalized IEEE754, so we must adjust the exponent returned by REAL_EXP
16672 by 5 places to correct for GCC's representation. */
16673 exponent = 5 - exponent;
16674
16675 return (exponent >= 0 && exponent <= 7);
16676 }
16677
16678 /* Return the assembly string for an AdvSIMD MOVI, MVNI, ORR or BIC immediate
16679 instruction whose operand is CONST_VECTOR, a vector of total width WIDTH bits.
16680 WHICH selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
16681 char*
16682 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16683 enum simd_immediate_check which)
16684 {
16685 bool is_valid;
16686 static char templ[40];
16687 const char *mnemonic;
16688 const char *shift_op;
16689 unsigned int lane_count = 0;
16690 char element_char;
16691
16692 struct simd_immediate_info info;
16693
16694 /* This will return true to show const_vector is legal for use as either
16695 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16696 It will also update INFO to show how the immediate should be generated.
16697 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16698 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16699 gcc_assert (is_valid);
16700
16701 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16702 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16703
16704 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16705 {
16706 gcc_assert (info.insn == simd_immediate_info::MOV
16707 && info.u.mov.shift == 0);
16708 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16709 move immediate path. */
16710 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16711 info.u.mov.value = GEN_INT (0);
16712 else
16713 {
16714 const unsigned int buf_size = 20;
16715 char float_buf[buf_size] = {'\0'};
16716 real_to_decimal_for_mode (float_buf,
16717 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16718 buf_size, buf_size, 1, info.elt_mode);
16719
16720 if (lane_count == 1)
16721 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16722 else
16723 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16724 lane_count, element_char, float_buf);
16725 return templ;
16726 }
16727 }
16728
16729 gcc_assert (CONST_INT_P (info.u.mov.value));
16730
16731 if (which == AARCH64_CHECK_MOV)
16732 {
16733 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16734 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
16735 ? "msl" : "lsl");
16736 if (lane_count == 1)
16737 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16738 mnemonic, UINTVAL (info.u.mov.value));
16739 else if (info.u.mov.shift)
16740 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16741 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16742 element_char, UINTVAL (info.u.mov.value), shift_op,
16743 info.u.mov.shift);
16744 else
16745 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16746 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16747 element_char, UINTVAL (info.u.mov.value));
16748 }
16749 else
16750 {
16751 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16752 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16753 if (info.u.mov.shift)
16754 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16755 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16756 element_char, UINTVAL (info.u.mov.value), "lsl",
16757 info.u.mov.shift);
16758 else
16759 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16760 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16761 element_char, UINTVAL (info.u.mov.value));
16762 }
16763 return templ;
16764 }
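
/* Typical strings produced by the function above, assuming a 128-bit vector
   of 32-bit elements (operand 0 is filled in later by final; the examples
   are illustrative rather than exhaustive):

	movi	v0.4s, 0x55, lsl 8	// AARCH64_CHECK_MOV, shifted
	mvni	v0.4s, 0x1		// inverted move
	orr	v0.4s, #255, lsl #8	// AARCH64_CHECK_ORR
	bic	v0.4s, #255, lsl #8	// AARCH64_CHECK_BIC  */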
16765
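/* Return the assembly string for moving scalar immediate IMMEDIATE of mode
   MODE, implemented as the equivalent Advanced SIMD vector move (see
   aarch64_output_simd_mov_immediate above).  */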
16766 char*
16767 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16768 {
16769
16770 /* If a floating-point number was passed and we want to use it in an
16771 integer mode, do the conversion to integer. */
16772 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16773 {
16774 unsigned HOST_WIDE_INT ival;
16775 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16776 gcc_unreachable ();
16777 immediate = gen_int_mode (ival, mode);
16778 }
16779
16780 machine_mode vmode;
16781 /* Use a 64-bit container for everything except DI/DF mode, where we use
16782 a 128-bit vector mode. */
16783 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16784
16785 vmode = aarch64_simd_container_mode (mode, width);
16786 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16787 return aarch64_output_simd_mov_immediate (v_op, width);
16788 }
16789
16790 /* Return the output string to use for moving immediate CONST_VECTOR
16791 into an SVE register. */
16792
16793 char *
16794 aarch64_output_sve_mov_immediate (rtx const_vector)
16795 {
16796 static char templ[40];
16797 struct simd_immediate_info info;
16798 char element_char;
16799
16800 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16801 gcc_assert (is_valid);
16802
16803 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16804
16805 machine_mode vec_mode = GET_MODE (const_vector);
16806 if (aarch64_sve_pred_mode_p (vec_mode))
16807 {
16808 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16809 if (info.insn == simd_immediate_info::MOV)
16810 {
16811 gcc_assert (info.u.mov.value == const0_rtx);
16812 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
16813 }
16814 else
16815 {
16816 gcc_assert (info.insn == simd_immediate_info::PTRUE);
16817 unsigned int total_bytes;
16818 if (info.u.pattern == AARCH64_SV_ALL
16819 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
16820 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
16821 total_bytes / GET_MODE_SIZE (info.elt_mode));
16822 else
16823 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
16824 svpattern_token (info.u.pattern));
16825 }
16826 return buf;
16827 }
16828
16829 if (info.insn == simd_immediate_info::INDEX)
16830 {
16831 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16832 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16833 element_char, INTVAL (info.u.index.base),
16834 INTVAL (info.u.index.step));
16835 return templ;
16836 }
16837
16838 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16839 {
16840 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16841 info.u.mov.value = GEN_INT (0);
16842 else
16843 {
16844 const int buf_size = 20;
16845 char float_buf[buf_size] = {};
16846 real_to_decimal_for_mode (float_buf,
16847 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16848 buf_size, buf_size, 1, info.elt_mode);
16849
16850 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16851 element_char, float_buf);
16852 return templ;
16853 }
16854 }
16855
16856 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16857 element_char, INTVAL (info.u.mov.value));
16858 return templ;
16859 }
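
/* Illustrative strings produced by the function above (operand 0 is filled
   in later by final; FP values are printed in GCC's decimal notation):

	pfalse	p0.b			// all-false predicate
	ptrue	p0.s, vl4		// all-true, constant-length vector
	ptrue	p0.s, all		// all-true, variable-length vector
	index	z0.s, #0, #1		// linear series
	mov	z0.b, #7		// integer immediate
	fmov	z0.s, #1.0e+0		// floating-point immediate  */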
16860
16861 /* Split operands into moves from op[1] + op[2] into op[0]. */
16862
16863 void
16864 aarch64_split_combinev16qi (rtx operands[3])
16865 {
16866 unsigned int dest = REGNO (operands[0]);
16867 unsigned int src1 = REGNO (operands[1]);
16868 unsigned int src2 = REGNO (operands[2]);
16869 machine_mode halfmode = GET_MODE (operands[1]);
16870 unsigned int halfregs = REG_NREGS (operands[1]);
16871 rtx destlo, desthi;
16872
16873 gcc_assert (halfmode == V16QImode);
16874
16875 if (src1 == dest && src2 == dest + halfregs)
16876 {
16877 /* No-op move. Can't split to nothing; emit something. */
16878 emit_note (NOTE_INSN_DELETED);
16879 return;
16880 }
16881
16882 /* Preserve register attributes for variable tracking. */
16883 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16884 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16885 GET_MODE_SIZE (halfmode));
16886
16887 /* Special case of reversed high/low parts. */
16888 if (reg_overlap_mentioned_p (operands[2], destlo)
16889 && reg_overlap_mentioned_p (operands[1], desthi))
16890 {
16891 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16892 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16893 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16894 }
16895 else if (!reg_overlap_mentioned_p (operands[2], destlo))
16896 {
16897 /* Try to avoid unnecessary moves if part of the result
16898 is in the right place already. */
16899 if (src1 != dest)
16900 emit_move_insn (destlo, operands[1]);
16901 if (src2 != dest + halfregs)
16902 emit_move_insn (desthi, operands[2]);
16903 }
16904 else
16905 {
16906 if (src2 != dest + halfregs)
16907 emit_move_insn (desthi, operands[2]);
16908 if (src1 != dest)
16909 emit_move_insn (destlo, operands[1]);
16910 }
16911 }
16912
16913 /* vec_perm support. */
16914
16915 struct expand_vec_perm_d
16916 {
16917 rtx target, op0, op1;
16918 vec_perm_indices perm;
16919 machine_mode vmode;
16920 unsigned int vec_flags;
16921 bool one_vector_p;
16922 bool testing_p;
16923 };
16924
16925 /* Generate a variable permutation. */
16926
16927 static void
16928 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16929 {
16930 machine_mode vmode = GET_MODE (target);
16931 bool one_vector_p = rtx_equal_p (op0, op1);
16932
16933 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16934 gcc_checking_assert (GET_MODE (op0) == vmode);
16935 gcc_checking_assert (GET_MODE (op1) == vmode);
16936 gcc_checking_assert (GET_MODE (sel) == vmode);
16937 gcc_checking_assert (TARGET_SIMD);
16938
16939 if (one_vector_p)
16940 {
16941 if (vmode == V8QImode)
16942 {
16943 /* Expand the argument to a V16QI mode by duplicating it. */
16944 rtx pair = gen_reg_rtx (V16QImode);
16945 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16946 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16947 }
16948 else
16949 {
16950 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16951 }
16952 }
16953 else
16954 {
16955 rtx pair;
16956
16957 if (vmode == V8QImode)
16958 {
16959 pair = gen_reg_rtx (V16QImode);
16960 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16961 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16962 }
16963 else
16964 {
16965 pair = gen_reg_rtx (OImode);
16966 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16967 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16968 }
16969 }
16970 }
16971
16972 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16973 NELT is the number of elements in the vector. */
16974
16975 void
16976 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16977 unsigned int nelt)
16978 {
16979 machine_mode vmode = GET_MODE (target);
16980 bool one_vector_p = rtx_equal_p (op0, op1);
16981 rtx mask;
16982
16983 /* The TBL instruction does not use a modulo index, so we must take care
16984 of that ourselves. */
16985 mask = aarch64_simd_gen_const_vector_dup (vmode,
16986 one_vector_p ? nelt - 1 : 2 * nelt - 1);
16987 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16988
16989 /* For big-endian, we also need to reverse the index within the vector
16990 (but not which vector). */
16991 if (BYTES_BIG_ENDIAN)
16992 {
16993 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
16994 if (!one_vector_p)
16995 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16996 sel = expand_simple_binop (vmode, XOR, sel, mask,
16997 NULL, 0, OPTAB_LIB_WIDEN);
16998 }
16999 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17000 }
17001
17002 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17003
17004 static void
17005 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17006 {
17007 emit_insn (gen_rtx_SET (target,
17008 gen_rtx_UNSPEC (GET_MODE (target),
17009 gen_rtvec (2, op0, op1), code)));
17010 }
17011
17012 /* Expand an SVE vec_perm with the given operands. */
17013
17014 void
17015 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17016 {
17017 machine_mode data_mode = GET_MODE (target);
17018 machine_mode sel_mode = GET_MODE (sel);
17019 /* Enforced by the pattern condition. */
17020 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17021
17022 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17023 size of the two value vectors, i.e. the upper bits of the indices
17024 are effectively ignored. SVE TBL instead produces 0 for any
17025 out-of-range indices, so we need to modulo all the vec_perm indices
17026 to ensure they are all in range. */
17027 rtx sel_reg = force_reg (sel_mode, sel);
17028
17029 /* Check if the sel only references the first values vector. */
17030 if (GET_CODE (sel) == CONST_VECTOR
17031 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17032 {
17033 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17034 return;
17035 }
17036
17037 /* Check if the two values vectors are the same. */
17038 if (rtx_equal_p (op0, op1))
17039 {
17040 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17041 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17042 NULL, 0, OPTAB_DIRECT);
17043 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17044 return;
17045 }
17046
17047 /* Run TBL for each value vector and combine the results. */
17048
17049 rtx res0 = gen_reg_rtx (data_mode);
17050 rtx res1 = gen_reg_rtx (data_mode);
17051 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17052 if (GET_CODE (sel) != CONST_VECTOR
17053 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17054 {
17055 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17056 2 * nunits - 1);
17057 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17058 NULL, 0, OPTAB_DIRECT);
17059 }
17060 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17061 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17062 NULL, 0, OPTAB_DIRECT);
17063 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17064 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17065 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17066 else
17067 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17068 }
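
/* Schematically, the general two-input expansion above is:

	sel    = sel & (2 * nunits - 1)		// reduce indices modulo 2*nunits
	res0   = TBL (op0, sel)			// 0 for out-of-range lanes
	res1   = TBL (op1, sel - nunits)	// likewise
	target = res0 | res1

   using IOR for integer elements and UNSPEC_IORF for FP elements.  */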
17069
17070 /* Recognize patterns suitable for the TRN instructions. */
17071 static bool
17072 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17073 {
17074 HOST_WIDE_INT odd;
17075 poly_uint64 nelt = d->perm.length ();
17076 rtx out, in0, in1, x;
17077 machine_mode vmode = d->vmode;
17078
17079 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17080 return false;
17081
17082 /* Note that these are little-endian tests.
17083 We correct for big-endian later. */
17084 if (!d->perm[0].is_constant (&odd)
17085 || (odd != 0 && odd != 1)
17086 || !d->perm.series_p (0, 2, odd, 2)
17087 || !d->perm.series_p (1, 2, nelt + odd, 2))
17088 return false;
17089
17090 /* Success! */
17091 if (d->testing_p)
17092 return true;
17093
17094 in0 = d->op0;
17095 in1 = d->op1;
17096 /* We don't need a big-endian lane correction for SVE; see the comment
17097 at the head of aarch64-sve.md for details. */
17098 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17099 {
17100 x = in0, in0 = in1, in1 = x;
17101 odd = !odd;
17102 }
17103 out = d->target;
17104
17105 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17106 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17107 return true;
17108 }
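
/* For example, on a little-endian V4SI target the two-input permutations
   { 0, 4, 2, 6 } and { 1, 5, 3, 7 } are matched here and map to TRN1 and
   TRN2 respectively.  */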
17109
17110 /* Recognize patterns suitable for the UZP instructions. */
17111 static bool
17112 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17113 {
17114 HOST_WIDE_INT odd;
17115 rtx out, in0, in1, x;
17116 machine_mode vmode = d->vmode;
17117
17118 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17119 return false;
17120
17121 /* Note that these are little-endian tests.
17122 We correct for big-endian later. */
17123 if (!d->perm[0].is_constant (&odd)
17124 || (odd != 0 && odd != 1)
17125 || !d->perm.series_p (0, 1, odd, 2))
17126 return false;
17127
17128 /* Success! */
17129 if (d->testing_p)
17130 return true;
17131
17132 in0 = d->op0;
17133 in1 = d->op1;
17134 /* We don't need a big-endian lane correction for SVE; see the comment
17135 at the head of aarch64-sve.md for details. */
17136 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17137 {
17138 x = in0, in0 = in1, in1 = x;
17139 odd = !odd;
17140 }
17141 out = d->target;
17142
17143 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17144 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17145 return true;
17146 }
17147
17148 /* Recognize patterns suitable for the ZIP instructions. */
17149 static bool
17150 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17151 {
17152 unsigned int high;
17153 poly_uint64 nelt = d->perm.length ();
17154 rtx out, in0, in1, x;
17155 machine_mode vmode = d->vmode;
17156
17157 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17158 return false;
17159
17160 /* Note that these are little-endian tests.
17161 We correct for big-endian later. */
17162 poly_uint64 first = d->perm[0];
17163 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17164 || !d->perm.series_p (0, 2, first, 1)
17165 || !d->perm.series_p (1, 2, first + nelt, 1))
17166 return false;
17167 high = maybe_ne (first, 0U);
17168
17169 /* Success! */
17170 if (d->testing_p)
17171 return true;
17172
17173 in0 = d->op0;
17174 in1 = d->op1;
17175 /* We don't need a big-endian lane correction for SVE; see the comment
17176 at the head of aarch64-sve.md for details. */
17177 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17178 {
17179 x = in0, in0 = in1, in1 = x;
17180 high = !high;
17181 }
17182 out = d->target;
17183
17184 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17185 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17186 return true;
17187 }
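
/* For example, on a little-endian V4SI target the two-input permutations
   { 0, 4, 1, 5 } and { 2, 6, 3, 7 } are matched here and map to ZIP1 and
   ZIP2 respectively.  */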
17188
17189 /* Recognize patterns for the EXT insn. */
17190
17191 static bool
17192 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17193 {
17194 HOST_WIDE_INT location;
17195 rtx offset;
17196
17197 /* The first element always refers to the first vector.
17198 Check if the extracted indices are increasing by one. */
17199 if (d->vec_flags == VEC_SVE_PRED
17200 || !d->perm[0].is_constant (&location)
17201 || !d->perm.series_p (0, 1, location, 1))
17202 return false;
17203
17204 /* Success! */
17205 if (d->testing_p)
17206 return true;
17207
17208 /* The case where (location == 0) is a no-op for both big- and little-endian,
17209 and is removed by the mid-end at optimization levels -O1 and higher.
17210
17211 We don't need a big-endian lane correction for SVE; see the comment
17212 at the head of aarch64-sve.md for details. */
17213 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17214 {
17215 /* After setup, we want the high elements of the first vector (stored
17216 at the LSB end of the register), and the low elements of the second
17217 vector (stored at the MSB end of the register). So swap. */
17218 std::swap (d->op0, d->op1);
17219 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17220 to_constant () is safe since this is restricted to Advanced SIMD
17221 vectors. */
17222 location = d->perm.length ().to_constant () - location;
17223 }
17224
17225 offset = GEN_INT (location);
17226 emit_set_insn (d->target,
17227 gen_rtx_UNSPEC (d->vmode,
17228 gen_rtvec (3, d->op0, d->op1, offset),
17229 UNSPEC_EXT));
17230 return true;
17231 }
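
/* For example, on a little-endian V4SI target the two-input permutation
   { 1, 2, 3, 4 } is matched here and becomes an EXT with an offset of one
   element (four bytes).  */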
17232
17233 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17234 within each 64-bit, 32-bit or 16-bit granule. */
17235
17236 static bool
17237 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17238 {
17239 HOST_WIDE_INT diff;
17240 unsigned int i, size, unspec;
17241 machine_mode pred_mode;
17242
17243 if (d->vec_flags == VEC_SVE_PRED
17244 || !d->one_vector_p
17245 || !d->perm[0].is_constant (&diff))
17246 return false;
17247
17248 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17249 if (size == 8)
17250 {
17251 unspec = UNSPEC_REV64;
17252 pred_mode = VNx2BImode;
17253 }
17254 else if (size == 4)
17255 {
17256 unspec = UNSPEC_REV32;
17257 pred_mode = VNx4BImode;
17258 }
17259 else if (size == 2)
17260 {
17261 unspec = UNSPEC_REV16;
17262 pred_mode = VNx8BImode;
17263 }
17264 else
17265 return false;
17266
17267 unsigned int step = diff + 1;
17268 for (i = 0; i < step; ++i)
17269 if (!d->perm.series_p (i, step, diff - i, step))
17270 return false;
17271
17272 /* Success! */
17273 if (d->testing_p)
17274 return true;
17275
17276 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17277 if (d->vec_flags == VEC_SVE_DATA)
17278 {
17279 rtx pred = aarch64_ptrue_reg (pred_mode);
17280 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17281 UNSPEC_MERGE_PTRUE);
17282 }
17283 emit_set_insn (d->target, src);
17284 return true;
17285 }
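
/* For example, reversing the elements within each 64-bit chunk of a V8HI
   vector corresponds to the permutation { 3, 2, 1, 0, 7, 6, 5, 4 } and is
   matched here as REV64.  */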
17286
17287 /* Recognize patterns for the REV insn, which reverses elements within
17288 a full vector. */
17289
17290 static bool
17291 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17292 {
17293 poly_uint64 nelt = d->perm.length ();
17294
17295 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17296 return false;
17297
17298 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17299 return false;
17300
17301 /* Success! */
17302 if (d->testing_p)
17303 return true;
17304
17305 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17306 emit_set_insn (d->target, src);
17307 return true;
17308 }
17309
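/* Recognize patterns suitable for the DUP instruction, i.e. broadcasts of a
   single element of the first input vector.  */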
17310 static bool
17311 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17312 {
17313 rtx out = d->target;
17314 rtx in0;
17315 HOST_WIDE_INT elt;
17316 machine_mode vmode = d->vmode;
17317 rtx lane;
17318
17319 if (d->vec_flags == VEC_SVE_PRED
17320 || d->perm.encoding ().encoded_nelts () != 1
17321 || !d->perm[0].is_constant (&elt))
17322 return false;
17323
17324 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17325 return false;
17326
17327 /* Success! */
17328 if (d->testing_p)
17329 return true;
17330
17331 /* The generic preparation in aarch64_expand_vec_perm_const_1
17332 swaps the operand order and the permute indices if it finds
17333 d->perm[0] to be in the second operand. Thus, we can always
17334 use d->op0 and need not do any extra arithmetic to get the
17335 correct lane number. */
17336 in0 = d->op0;
17337 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17338
17339 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17340 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17341 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17342 return true;
17343 }
17344
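/* Try to implement D using an Advanced SIMD TBL instruction with a constant
   selector.  */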
17345 static bool
17346 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17347 {
17348 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17349 machine_mode vmode = d->vmode;
17350
17351 /* Make sure that the indices are constant. */
17352 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17353 for (unsigned int i = 0; i < encoded_nelts; ++i)
17354 if (!d->perm[i].is_constant ())
17355 return false;
17356
17357 if (d->testing_p)
17358 return true;
17359
17360 /* Generic code will try constant permutation twice: once with the
17361 original mode and again with the elements lowered to QImode.
17362 So wait and don't do the selector expansion ourselves. */
17363 if (vmode != V8QImode && vmode != V16QImode)
17364 return false;
17365
17366 /* to_constant is safe since this routine is specific to Advanced SIMD
17367 vectors. */
17368 unsigned int nelt = d->perm.length ().to_constant ();
17369 for (unsigned int i = 0; i < nelt; ++i)
17370 /* If big-endian and two vectors, we end up with a weird mixed-endian
17371 mode on NEON. Reverse the index within each word but not the word
17372 itself. to_constant is safe because we checked is_constant above. */
17373 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17374 ? d->perm[i].to_constant () ^ (nelt - 1)
17375 : d->perm[i].to_constant ());
17376
17377 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17378 sel = force_reg (vmode, sel);
17379
17380 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17381 return true;
17382 }
17383
17384 /* Try to implement D using an SVE TBL instruction. */
17385
17386 static bool
17387 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17388 {
17389 unsigned HOST_WIDE_INT nelt;
17390
17391 /* Permuting two variable-length vectors could overflow the
17392 index range. */
17393 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17394 return false;
17395
17396 if (d->testing_p)
17397 return true;
17398
17399 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17400 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17401 if (d->one_vector_p)
17402 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17403 else
17404 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17405 return true;
17406 }
17407
17408 static bool
17409 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17410 {
17411 /* The pattern matching functions above are written to look for a small
17412 number to begin the sequence (0, 1, N/2). If we begin with an index
17413 from the second operand, we can swap the operands. */
17414 poly_int64 nelt = d->perm.length ();
17415 if (known_ge (d->perm[0], nelt))
17416 {
17417 d->perm.rotate_inputs (1);
17418 std::swap (d->op0, d->op1);
17419 }
17420
17421 if ((d->vec_flags == VEC_ADVSIMD
17422 || d->vec_flags == VEC_SVE_DATA
17423 || d->vec_flags == VEC_SVE_PRED)
17424 && known_gt (nelt, 1))
17425 {
17426 if (aarch64_evpc_rev_local (d))
17427 return true;
17428 else if (aarch64_evpc_rev_global (d))
17429 return true;
17430 else if (aarch64_evpc_ext (d))
17431 return true;
17432 else if (aarch64_evpc_dup (d))
17433 return true;
17434 else if (aarch64_evpc_zip (d))
17435 return true;
17436 else if (aarch64_evpc_uzp (d))
17437 return true;
17438 else if (aarch64_evpc_trn (d))
17439 return true;
17440 if (d->vec_flags == VEC_SVE_DATA)
17441 return aarch64_evpc_sve_tbl (d);
17442 else if (d->vec_flags == VEC_ADVSIMD)
17443 return aarch64_evpc_tbl (d);
17444 }
17445 return false;
17446 }
17447
17448 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17449
17450 static bool
17451 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17452 rtx op1, const vec_perm_indices &sel)
17453 {
17454 struct expand_vec_perm_d d;
17455
17456 /* Check whether the mask can be applied to a single vector. */
17457 if (sel.ninputs () == 1
17458 || (op0 && rtx_equal_p (op0, op1)))
17459 d.one_vector_p = true;
17460 else if (sel.all_from_input_p (0))
17461 {
17462 d.one_vector_p = true;
17463 op1 = op0;
17464 }
17465 else if (sel.all_from_input_p (1))
17466 {
17467 d.one_vector_p = true;
17468 op0 = op1;
17469 }
17470 else
17471 d.one_vector_p = false;
17472
17473 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17474 sel.nelts_per_input ());
17475 d.vmode = vmode;
17476 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17477 d.target = target;
17478 d.op0 = op0;
17479 d.op1 = op1;
17480 d.testing_p = !target;
17481
17482 if (!d.testing_p)
17483 return aarch64_expand_vec_perm_const_1 (&d);
17484
17485 rtx_insn *last = get_last_insn ();
17486 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17487 gcc_assert (last == get_last_insn ());
17488
17489 return ret;
17490 }
17491
17492 /* Generate a byte permute mask for a register of mode MODE,
17493 which has NUNITS units. */
17494
17495 rtx
17496 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17497 {
17498 /* We have to reverse each vector because we don't have
17499 a permuted load that can reverse-load according to ABI rules. */
17500 rtx mask;
17501 rtvec v = rtvec_alloc (16);
17502 unsigned int i, j;
17503 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17504
17505 gcc_assert (BYTES_BIG_ENDIAN);
17506 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17507
17508 for (i = 0; i < nunits; i++)
17509 for (j = 0; j < usize; j++)
17510 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17511 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17512 return force_reg (V16QImode, mask);
17513 }
17514
17515 /* Return true if X is a valid second operand for the SVE instruction
17516 that implements integer comparison OP_CODE. */
17517
17518 static bool
17519 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17520 {
17521 if (register_operand (x, VOIDmode))
17522 return true;
17523
17524 switch (op_code)
17525 {
17526 case LTU:
17527 case LEU:
17528 case GEU:
17529 case GTU:
17530 return aarch64_sve_cmp_immediate_p (x, false);
17531 case LT:
17532 case LE:
17533 case GE:
17534 case GT:
17535 case NE:
17536 case EQ:
17537 return aarch64_sve_cmp_immediate_p (x, true);
17538 default:
17539 gcc_unreachable ();
17540 }
17541 }
17542
17543 /* Use predicated SVE instructions to implement the equivalent of:
17544
17545 (set TARGET OP)
17546
17547 given that PTRUE is an all-true predicate of the appropriate mode. */
17548
17549 static void
17550 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17551 {
17552 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17553 gen_rtvec (2, ptrue, op),
17554 UNSPEC_MERGE_PTRUE);
17555 rtx_insn *insn = emit_set_insn (target, unspec);
17556 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17557 }
17558
17559 /* Likewise, but also clobber the condition codes. */
17560
17561 static void
17562 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17563 {
17564 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17565 gen_rtvec (2, ptrue, op),
17566 UNSPEC_MERGE_PTRUE);
17567 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17568 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17569 }
17570
17571 /* Return the UNSPEC_COND_* code for comparison CODE. */
17572
17573 static unsigned int
17574 aarch64_unspec_cond_code (rtx_code code)
17575 {
17576 switch (code)
17577 {
17578 case NE:
17579 return UNSPEC_COND_FCMNE;
17580 case EQ:
17581 return UNSPEC_COND_FCMEQ;
17582 case LT:
17583 return UNSPEC_COND_FCMLT;
17584 case GT:
17585 return UNSPEC_COND_FCMGT;
17586 case LE:
17587 return UNSPEC_COND_FCMLE;
17588 case GE:
17589 return UNSPEC_COND_FCMGE;
17590 default:
17591 gcc_unreachable ();
17592 }
17593 }
17594
17595 /* Emit:
17596
17597 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17598
17599 where <X> is the operation associated with comparison CODE. This form
17600 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17601 semantics, such as when PRED might not be all-true and when comparing
17602 inactive lanes could have side effects. */
17603
17604 static void
17605 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17606 rtx pred, rtx op0, rtx op1)
17607 {
17608 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17609 gen_rtvec (3, pred, op0, op1),
17610 aarch64_unspec_cond_code (code));
17611 emit_set_insn (target, unspec);
17612 }
17613
17614 /* Expand an SVE integer comparison using the SVE equivalent of:
17615
17616 (set TARGET (CODE OP0 OP1)). */
17617
17618 void
17619 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17620 {
17621 machine_mode pred_mode = GET_MODE (target);
17622 machine_mode data_mode = GET_MODE (op0);
17623
17624 if (!aarch64_sve_cmp_operand_p (code, op1))
17625 op1 = force_reg (data_mode, op1);
17626
17627 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17628 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17629 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17630 }
17631
17632 /* Emit the SVE equivalent of:
17633
17634 (set TMP1 (CODE1 OP0 OP1))
17635 (set TMP2 (CODE2 OP0 OP1))
17636 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17637
17638 PTRUE is an all-true predicate with the same mode as TARGET. */
17639
17640 static void
17641 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17642 rtx ptrue, rtx op0, rtx op1)
17643 {
17644 machine_mode pred_mode = GET_MODE (ptrue);
17645 rtx tmp1 = gen_reg_rtx (pred_mode);
17646 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17647 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17648 rtx tmp2 = gen_reg_rtx (pred_mode);
17649 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17650 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17651 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17652 }
17653
17654 /* Emit the SVE equivalent of:
17655
17656 (set TMP (CODE OP0 OP1))
17657 (set TARGET (not TMP))
17658
17659 PTRUE is an all-true predicate with the same mode as TARGET. */
17660
17661 static void
17662 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17663 rtx op0, rtx op1)
17664 {
17665 machine_mode pred_mode = GET_MODE (ptrue);
17666 rtx tmp = gen_reg_rtx (pred_mode);
17667 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17668 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17669 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17670 }
17671
17672 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17673
17674 (set TARGET (CODE OP0 OP1))
17675
17676 If CAN_INVERT_P is true, the caller can also handle inverted results;
17677 return true if the result is in fact inverted. */
17678
17679 bool
17680 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17681 rtx op0, rtx op1, bool can_invert_p)
17682 {
17683 machine_mode pred_mode = GET_MODE (target);
17684 machine_mode data_mode = GET_MODE (op0);
17685
17686 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17687 switch (code)
17688 {
17689 case UNORDERED:
17690 /* UNORDERED has no immediate form. */
17691 op1 = force_reg (data_mode, op1);
17692 /* fall through */
17693 case LT:
17694 case LE:
17695 case GT:
17696 case GE:
17697 case EQ:
17698 case NE:
17699 {
17700 /* There is native support for the comparison. */
17701 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17702 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17703 return false;
17704 }
17705
17706 case LTGT:
17707 /* This is a trapping operation (LT or GT). */
17708 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17709 return false;
17710
17711 case UNEQ:
17712 if (!flag_trapping_math)
17713 {
17714 /* This would trap for signaling NaNs. */
17715 op1 = force_reg (data_mode, op1);
17716 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17717 return false;
17718 }
17719 /* fall through */
17720 case UNLT:
17721 case UNLE:
17722 case UNGT:
17723 case UNGE:
17724 if (flag_trapping_math)
17725 {
17726 /* Work out which elements are ordered. */
17727 rtx ordered = gen_reg_rtx (pred_mode);
17728 op1 = force_reg (data_mode, op1);
17729 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17730
17731 /* Test the opposite condition for the ordered elements,
17732 then invert the result. */
17733 if (code == UNEQ)
17734 code = NE;
17735 else
17736 code = reverse_condition_maybe_unordered (code);
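/* For example, UNLT becomes GE and UNGE becomes LT, so the predicated
   comparison below only tests lanes already known to be ordered.  */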
17737 if (can_invert_p)
17738 {
17739 aarch64_emit_sve_predicated_cond (target, code,
17740 ordered, op0, op1);
17741 return true;
17742 }
17743 rtx tmp = gen_reg_rtx (pred_mode);
17744 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17745 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17746 return false;
17747 }
17748 break;
17749
17750 case ORDERED:
17751 /* ORDERED has no immediate form. */
17752 op1 = force_reg (data_mode, op1);
17753 break;
17754
17755 default:
17756 gcc_unreachable ();
17757 }
17758
17759 /* There is native support for the inverse comparison. */
17760 code = reverse_condition_maybe_unordered (code);
17761 if (can_invert_p)
17762 {
17763 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17764 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17765 return true;
17766 }
17767 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17768 return false;
17769 }
17770
17771 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17772 of the data being selected and CMP_MODE is the mode of the values being
17773 compared. */
17774
17775 void
17776 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17777 rtx *ops)
17778 {
17779 machine_mode pred_mode
17780 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17781 GET_MODE_SIZE (cmp_mode)).require ();
17782 rtx pred = gen_reg_rtx (pred_mode);
17783 if (FLOAT_MODE_P (cmp_mode))
17784 {
17785 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17786 ops[4], ops[5], true))
17787 std::swap (ops[1], ops[2]);
17788 }
17789 else
17790 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17791
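/* The UNSPEC_SEL below corresponds to the SVE SEL instruction: active
   lanes of PRED take their value from ops[1] and inactive lanes from
   ops[2].  */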
17792 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17793 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17794 }
17795
17796 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17797 true.  However, due to issues with register allocation it is preferable
17798 to avoid tying integer scalar and FP scalar modes.  Executing integer
17799 operations in general registers is better than treating them as scalar
17800 vector operations. This reduces latency and avoids redundant int<->FP
17801 moves. So tie modes if they are either the same class, or vector modes
17802 with other vector modes, vector structs or any scalar mode. */
17803
17804 static bool
17805 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17806 {
17807 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17808 return true;
17809
17810 /* We specifically want to allow elements of "structure" modes to
17811 be tieable to the structure. This more general condition allows
17812 other rarer situations too. The reason we don't extend this to
17813 predicate modes is that there are no predicate structure modes
17814 nor any specific instructions for extracting part of a predicate
17815 register. */
17816 if (aarch64_vector_data_mode_p (mode1)
17817 && aarch64_vector_data_mode_p (mode2))
17818 return true;
17819
17820 /* Also allow any scalar modes with vectors. */
17821 if (aarch64_vector_mode_supported_p (mode1)
17822 || aarch64_vector_mode_supported_p (mode2))
17823 return true;
17824
17825 return false;
17826 }
17827
17828 /* Return a new RTX holding the result of moving POINTER forward by
17829 AMOUNT bytes. */
17830
17831 static rtx
17832 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17833 {
17834 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17835
17836 return adjust_automodify_address (pointer, GET_MODE (pointer),
17837 next, amount);
17838 }
17839
17840 /* Return a new RTX holding the result of moving POINTER forward by the
17841 size of the mode it points to. */
17842
17843 static rtx
17844 aarch64_progress_pointer (rtx pointer)
17845 {
17846 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17847 }
17848
17849 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17850 MODE bytes. */
17851
17852 static void
17853 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17854 machine_mode mode)
17855 {
17856 rtx reg = gen_reg_rtx (mode);
17857
17858 /* "Cast" the pointers to the correct mode. */
17859 *src = adjust_address (*src, mode, 0);
17860 *dst = adjust_address (*dst, mode, 0);
17861 /* Emit the memcpy. */
17862 emit_move_insn (reg, *src);
17863 emit_move_insn (*dst, reg);
17864 /* Move the pointers forward. */
17865 *src = aarch64_progress_pointer (*src);
17866 *dst = aarch64_progress_pointer (*dst);
17867 }
17868
17869 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
17870 we succeed, otherwise return false. */
17871
17872 bool
17873 aarch64_expand_cpymem (rtx *operands)
17874 {
17875 int n, mode_bits;
17876 rtx dst = operands[0];
17877 rtx src = operands[1];
17878 rtx base;
17879 machine_mode cur_mode = BLKmode, next_mode;
17880 bool speed_p = !optimize_function_for_size_p (cfun);
17881
17882 /* When optimizing for size, give a better estimate of the length of a
17883 memcpy call, but use the default otherwise.  Moves larger than 8 bytes
17884 will always require an even number of instructions.  And each
17885 operation requires both a load and a store, so divide the max number by 2. */
17886 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
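  /* For instance, when optimizing for speed this allows at most 8 moves;
     the size check below counts each whole 16-byte block as one move and
     any residual as two.  */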
17887
17888 /* We can't do anything smart if the amount to copy is not constant. */
17889 if (!CONST_INT_P (operands[2]))
17890 return false;
17891
17892 n = INTVAL (operands[2]);
17893
17894 /* Try to keep the number of instructions low. For all cases we will do at
17895 most two moves for the residual amount, since we'll always overlap the
17896 remainder. */
17897 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17898 return false;
17899
17900 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17901 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17902
17903 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17904 src = adjust_automodify_address (src, VOIDmode, base, 0);
17905
17906 /* Convert n to bits to make the rest of the code simpler. */
17907 n = n * BITS_PER_UNIT;
17908
17909 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
17910 larger than TImode, but we should not use them for loads/stores here. */
17911 const int copy_limit = GET_MODE_BITSIZE (TImode);
17912
17913 while (n > 0)
17914 {
17915 /* Find the largest mode in which to do the copy without over-reading
17916 or over-writing.  */
17917 opt_scalar_int_mode mode_iter;
17918 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17919 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17920 cur_mode = mode_iter.require ();
17921
17922 gcc_assert (cur_mode != BLKmode);
17923
17924 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17925 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17926
17927 n -= mode_bits;
17928
17929 /* Do certain trailing copies as overlapping if it's going to be
17930 cheaper, i.e. if it needs fewer instructions.  For instance, for a
17931 15-byte copy it's more efficient to do two overlapping 8-byte copies
17932 than 8 + 4 + 2 + 1.  */
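	  /* For example, a 15-byte copy might end up as roughly:

	       ldr x1, [src]
	       str x1, [dst]
	       ldr x1, [src, 7]
	       str x1, [dst, 7]

	     where the second pair overlaps the first by one byte
	     (illustrative only; the exact registers and scheduling
	     will differ).  */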
17933 if (n > 0 && n <= 8 * BITS_PER_UNIT)
17934 {
17935 next_mode = smallest_mode_for_size (n, MODE_INT);
17936 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17937 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17938 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17939 n = n_bits;
17940 }
17941 }
17942
17943 return true;
17944 }
17945
17946 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17947 SImode stores. Handle the case when the constant has identical
17948 bottom and top halves. This is beneficial when the two stores can be
17949 merged into an STP and we avoid synthesising potentially expensive
17950 immediates twice. Return true if such a split is possible. */
17951
17952 bool
17953 aarch64_split_dimode_const_store (rtx dst, rtx src)
17954 {
17955 rtx lo = gen_lowpart (SImode, src);
17956 rtx hi = gen_highpart_mode (SImode, DImode, src);
17957
17958 bool size_p = optimize_function_for_size_p (cfun);
17959
17960 if (!rtx_equal_p (lo, hi))
17961 return false;
17962
17963 unsigned int orig_cost
17964 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17965 unsigned int lo_cost
17966 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17967
17968 /* We want to transform:
17969 MOV x1, 49370
17970 MOVK x1, 0x140, lsl 16
17971 MOVK x1, 0xc0da, lsl 32
17972 MOVK x1, 0x140, lsl 48
17973 STR x1, [x0]
17974 into:
17975 MOV w1, 49370
17976 MOVK w1, 0x140, lsl 16
17977 STP w1, w1, [x0]
17978 So we want to perform this only when we save two instructions
17979 or more. When optimizing for size, however, accept any code size
17980 savings we can. */
17981 if (size_p && orig_cost <= lo_cost)
17982 return false;
17983
17984 if (!size_p
17985 && (orig_cost <= lo_cost + 1))
17986 return false;
17987
17988 rtx mem_lo = adjust_address (dst, SImode, 0);
17989 if (!aarch64_mem_pair_operand (mem_lo, SImode))
17990 return false;
17991
17992 rtx tmp_reg = gen_reg_rtx (SImode);
17993 aarch64_expand_mov_immediate (tmp_reg, lo);
17994 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17995 /* Don't emit an explicit store pair as this may not be always profitable.
17996 Let the sched-fusion logic decide whether to merge them. */
17997 emit_move_insn (mem_lo, tmp_reg);
17998 emit_move_insn (mem_hi, tmp_reg);
17999
18000 return true;
18001 }
18002
18003 /* Generate RTL for a conditional branch with rtx comparison CODE in
18004 mode CC_MODE. The destination of the unlikely conditional branch
18005 is LABEL_REF. */
18006
18007 void
18008 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18009 rtx label_ref)
18010 {
18011 rtx x;
18012 x = gen_rtx_fmt_ee (code, VOIDmode,
18013 gen_rtx_REG (cc_mode, CC_REGNUM),
18014 const0_rtx);
18015
18016 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18017 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18018 pc_rtx);
18019 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18020 }
18021
18022 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18023
18024 OP1 represents the TImode destination operand 1
18025 OP2 represents the TImode destination operand 2
18026 LOW_DEST represents the low half (DImode) of TImode operand 0
18027 LOW_IN1 represents the low half (DImode) of TImode operand 1
18028 LOW_IN2 represents the low half (DImode) of TImode operand 2
18029 HIGH_DEST represents the high half (DImode) of TImode operand 0
18030 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18031 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18032
18033 void
18034 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18035 rtx *low_in1, rtx *low_in2,
18036 rtx *high_dest, rtx *high_in1,
18037 rtx *high_in2)
18038 {
18039 *low_dest = gen_reg_rtx (DImode);
18040 *low_in1 = gen_lowpart (DImode, op1);
18041 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18042 subreg_lowpart_offset (DImode, TImode));
18043 *high_dest = gen_reg_rtx (DImode);
18044 *high_in1 = gen_highpart (DImode, op1);
18045 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18046 subreg_highpart_offset (DImode, TImode));
18047 }
18048
18049 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18050
18051 This function differs from 'aarch64_addti_scratch_regs' in that
18052 OP1 can be an immediate constant (zero). We must call
18053 subreg_highpart_offset with DImode and TImode arguments, otherwise
18054 VOIDmode will be used for the const_int which generates an internal
18055 error from subreg_size_highpart_offset which does not expect a size of zero.
18056
18057 OP1 represents the TImode destination operand 1
18058 OP2 represents the TImode destination operand 2
18059 LOW_DEST represents the low half (DImode) of TImode operand 0
18060 LOW_IN1 represents the low half (DImode) of TImode operand 1
18061 LOW_IN2 represents the low half (DImode) of TImode operand 2
18062 HIGH_DEST represents the high half (DImode) of TImode operand 0
18063 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18064 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18065
18066
18067 void
18068 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18069 rtx *low_in1, rtx *low_in2,
18070 rtx *high_dest, rtx *high_in1,
18071 rtx *high_in2)
18072 {
18073 *low_dest = gen_reg_rtx (DImode);
18074 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18075 subreg_lowpart_offset (DImode, TImode));
18076
18077 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18078 subreg_lowpart_offset (DImode, TImode));
18079 *high_dest = gen_reg_rtx (DImode);
18080
18081 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18082 subreg_highpart_offset (DImode, TImode));
18083 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18084 subreg_highpart_offset (DImode, TImode));
18085 }
18086
18087 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18088
18089 OP0 represents the TImode destination operand 0
18090 LOW_DEST represents the low half (DImode) of TImode operand 0
18091 LOW_IN1 represents the low half (DImode) of TImode operand 1
18092 LOW_IN2 represents the low half (DImode) of TImode operand 2
18093 HIGH_DEST represents the high half (DImode) of TImode operand 0
18094 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18095 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18096 UNSIGNED_P is true if the operation is being performed on unsigned
18097 values. */
18098 void
18099 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18100 rtx low_in2, rtx high_dest, rtx high_in1,
18101 rtx high_in2, bool unsigned_p)
18102 {
18103 if (low_in2 == const0_rtx)
18104 {
18105 low_dest = low_in1;
18106 high_in2 = force_reg (DImode, high_in2);
18107 if (unsigned_p)
18108 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18109 else
18110 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18111 }
18112 else
18113 {
18114 if (CONST_INT_P (low_in2))
18115 {
18116 high_in2 = force_reg (DImode, high_in2);
18117 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18118 GEN_INT (-INTVAL (low_in2))));
18119 }
18120 else
18121 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18122
18123 if (unsigned_p)
18124 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18125 else
18126 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18127 }
18128
18129 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18130 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18131
18132 }
18133
18134 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18135
18136 static unsigned HOST_WIDE_INT
18137 aarch64_asan_shadow_offset (void)
18138 {
18139 if (TARGET_ILP32)
18140 return (HOST_WIDE_INT_1 << 29);
18141 else
18142 return (HOST_WIDE_INT_1 << 36);
18143 }
18144
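/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare chain, i.e. TREEOP0 CODE TREEOP1.  The preparation
   insns are recorded in *PREP_SEQ and the comparison itself in *GEN_SEQ.
   Return an rtx describing the resulting CC register state, or NULL_RTX
   if the comparison cannot be handled.  */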
18145 static rtx
18146 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18147 int code, tree treeop0, tree treeop1)
18148 {
18149 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18150 rtx op0, op1;
18151 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18152 insn_code icode;
18153 struct expand_operand ops[4];
18154
18155 start_sequence ();
18156 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18157
18158 op_mode = GET_MODE (op0);
18159 if (op_mode == VOIDmode)
18160 op_mode = GET_MODE (op1);
18161
18162 switch (op_mode)
18163 {
18164 case E_QImode:
18165 case E_HImode:
18166 case E_SImode:
18167 cmp_mode = SImode;
18168 icode = CODE_FOR_cmpsi;
18169 break;
18170
18171 case E_DImode:
18172 cmp_mode = DImode;
18173 icode = CODE_FOR_cmpdi;
18174 break;
18175
18176 case E_SFmode:
18177 cmp_mode = SFmode;
18178 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18179 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18180 break;
18181
18182 case E_DFmode:
18183 cmp_mode = DFmode;
18184 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18185 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18186 break;
18187
18188 default:
18189 end_sequence ();
18190 return NULL_RTX;
18191 }
18192
18193 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18194 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18195 if (!op0 || !op1)
18196 {
18197 end_sequence ();
18198 return NULL_RTX;
18199 }
18200 *prep_seq = get_insns ();
18201 end_sequence ();
18202
18203 create_fixed_operand (&ops[0], op0);
18204 create_fixed_operand (&ops[1], op1);
18205
18206 start_sequence ();
18207 if (!maybe_expand_insn (icode, 2, ops))
18208 {
18209 end_sequence ();
18210 return NULL_RTX;
18211 }
18212 *gen_seq = get_insns ();
18213 end_sequence ();
18214
18215 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18216 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18217 }
18218
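/* Implement TARGET_GEN_CCMP_NEXT.  Expand a further comparison
   (TREEOP0 CMP_CODE TREEOP1) that is chained onto PREV, the CC result of
   an earlier comparison, with BIT_CODE saying whether the two tests are
   combined with AND or IOR.  Return an rtx describing the combined CC
   result, or NULL_RTX on failure.

   As a rough illustration, for something like (a == 0 && b > 42) on
   integer operands the two hooks together build a sequence along the
   lines of:

     cmp   w0, 0
     ccmp  w1, 42, #<nzcv>, eq
     b.gt  ...

   (illustrative only; the exact NZCV immediate and conditions come from
   aarch64_get_condition_code_1 and the surrounding expansion).  */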
18219 static rtx
18220 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18221 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18222 {
18223 rtx op0, op1, target;
18224 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18225 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18226 insn_code icode;
18227 struct expand_operand ops[6];
18228 int aarch64_cond;
18229
18230 push_to_sequence (*prep_seq);
18231 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18232
18233 op_mode = GET_MODE (op0);
18234 if (op_mode == VOIDmode)
18235 op_mode = GET_MODE (op1);
18236
18237 switch (op_mode)
18238 {
18239 case E_QImode:
18240 case E_HImode:
18241 case E_SImode:
18242 cmp_mode = SImode;
18243 icode = CODE_FOR_ccmpsi;
18244 break;
18245
18246 case E_DImode:
18247 cmp_mode = DImode;
18248 icode = CODE_FOR_ccmpdi;
18249 break;
18250
18251 case E_SFmode:
18252 cmp_mode = SFmode;
18253 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18254 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18255 break;
18256
18257 case E_DFmode:
18258 cmp_mode = DFmode;
18259 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18260 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18261 break;
18262
18263 default:
18264 end_sequence ();
18265 return NULL_RTX;
18266 }
18267
18268 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18269 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18270 if (!op0 || !op1)
18271 {
18272 end_sequence ();
18273 return NULL_RTX;
18274 }
18275 *prep_seq = get_insns ();
18276 end_sequence ();
18277
18278 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18279 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18280
18281 if (bit_code != AND)
18282 {
18283 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18284 GET_MODE (XEXP (prev, 0))),
18285 VOIDmode, XEXP (prev, 0), const0_rtx);
18286 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18287 }
18288
18289 create_fixed_operand (&ops[0], XEXP (prev, 0));
18290 create_fixed_operand (&ops[1], target);
18291 create_fixed_operand (&ops[2], op0);
18292 create_fixed_operand (&ops[3], op1);
18293 create_fixed_operand (&ops[4], prev);
18294 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18295
18296 push_to_sequence (*gen_seq);
18297 if (!maybe_expand_insn (icode, 6, ops))
18298 {
18299 end_sequence ();
18300 return NULL_RTX;
18301 }
18302
18303 *gen_seq = get_insns ();
18304 end_sequence ();
18305
18306 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18307 }
18308
18309 #undef TARGET_GEN_CCMP_FIRST
18310 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18311
18312 #undef TARGET_GEN_CCMP_NEXT
18313 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18314
18315 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18316 instruction fusion of some sort. */
18317
18318 static bool
18319 aarch64_macro_fusion_p (void)
18320 {
18321 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18322 }
18323
18324
18325 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18326 should be kept together during scheduling. */
18327
18328 static bool
18329 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18330 {
18331 rtx set_dest;
18332 rtx prev_set = single_set (prev);
18333 rtx curr_set = single_set (curr);
18334 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
18335 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18336
18337 if (!aarch64_macro_fusion_p ())
18338 return false;
18339
18340 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18341 {
18342 /* We are trying to match:
18343 prev (mov) == (set (reg r0) (const_int imm16))
18344 curr (movk) == (set (zero_extract (reg r0)
18345 (const_int 16)
18346 (const_int 16))
18347 (const_int imm16_1)) */
18348
18349 set_dest = SET_DEST (curr_set);
18350
18351 if (GET_CODE (set_dest) == ZERO_EXTRACT
18352 && CONST_INT_P (SET_SRC (curr_set))
18353 && CONST_INT_P (SET_SRC (prev_set))
18354 && CONST_INT_P (XEXP (set_dest, 2))
18355 && INTVAL (XEXP (set_dest, 2)) == 16
18356 && REG_P (XEXP (set_dest, 0))
18357 && REG_P (SET_DEST (prev_set))
18358 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18359 {
18360 return true;
18361 }
18362 }
18363
18364 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18365 {
18366
18367 /* We're trying to match:
18368 prev (adrp) == (set (reg r1)
18369 (high (symbol_ref ("SYM"))))
18370 curr (add) == (set (reg r0)
18371 (lo_sum (reg r1)
18372 (symbol_ref ("SYM"))))
18373 Note that r0 need not necessarily be the same as r1, especially
18374 during pre-regalloc scheduling. */
18375
18376 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18377 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18378 {
18379 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18380 && REG_P (XEXP (SET_SRC (curr_set), 0))
18381 && REGNO (XEXP (SET_SRC (curr_set), 0))
18382 == REGNO (SET_DEST (prev_set))
18383 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18384 XEXP (SET_SRC (curr_set), 1)))
18385 return true;
18386 }
18387 }
18388
18389 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18390 {
18391
18392 /* We're trying to match:
18393 prev (movk) == (set (zero_extract (reg r0)
18394 (const_int 16)
18395 (const_int 32))
18396 (const_int imm16_1))
18397 curr (movk) == (set (zero_extract (reg r0)
18398 (const_int 16)
18399 (const_int 48))
18400 (const_int imm16_2)) */
18401
18402 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18403 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18404 && REG_P (XEXP (SET_DEST (prev_set), 0))
18405 && REG_P (XEXP (SET_DEST (curr_set), 0))
18406 && REGNO (XEXP (SET_DEST (prev_set), 0))
18407 == REGNO (XEXP (SET_DEST (curr_set), 0))
18408 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18409 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18410 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18411 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18412 && CONST_INT_P (SET_SRC (prev_set))
18413 && CONST_INT_P (SET_SRC (curr_set)))
18414 return true;
18415
18416 }
18417 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18418 {
18419 /* We're trying to match:
18420 prev (adrp) == (set (reg r0)
18421 (high (symbol_ref ("SYM"))))
18422 curr (ldr) == (set (reg r1)
18423 (mem (lo_sum (reg r0)
18424 (symbol_ref ("SYM")))))
18425 or
18426 curr (ldr) == (set (reg r1)
18427 (zero_extend (mem
18428 (lo_sum (reg r0)
18429 (symbol_ref ("SYM")))))) */
18430 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18431 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18432 {
18433 rtx curr_src = SET_SRC (curr_set);
18434
18435 if (GET_CODE (curr_src) == ZERO_EXTEND)
18436 curr_src = XEXP (curr_src, 0);
18437
18438 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18439 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18440 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18441 == REGNO (SET_DEST (prev_set))
18442 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18443 XEXP (SET_SRC (prev_set), 0)))
18444 return true;
18445 }
18446 }
18447
18448 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18449 && any_condjump_p (curr))
18450 {
18451 unsigned int condreg1, condreg2;
18452 rtx cc_reg_1;
18453 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18454 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18455
18456 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18457 && prev
18458 && modified_in_p (cc_reg_1, prev))
18459 {
18460 enum attr_type prev_type = get_attr_type (prev);
18461
18462 /* FIXME: this misses some instructions that ThunderX considers
18463 simple arithmetic instructions; simple shifts are missed here. */
18464 if (prev_type == TYPE_ALUS_SREG
18465 || prev_type == TYPE_ALUS_IMM
18466 || prev_type == TYPE_LOGICS_REG
18467 || prev_type == TYPE_LOGICS_IMM)
18468 return true;
18469 }
18470 }
18471
18472 if (prev_set
18473 && curr_set
18474 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18475 && any_condjump_p (curr))
18476 {
18477 /* We're trying to match:
18478 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18479 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18480 (const_int 0))
18481 (label_ref ("SYM"))
18482 (pc)) */
18483 if (SET_DEST (curr_set) == (pc_rtx)
18484 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18485 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18486 && REG_P (SET_DEST (prev_set))
18487 && REGNO (SET_DEST (prev_set))
18488 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18489 {
18490 /* Fuse ALU operations followed by conditional branch instruction. */
18491 switch (get_attr_type (prev))
18492 {
18493 case TYPE_ALU_IMM:
18494 case TYPE_ALU_SREG:
18495 case TYPE_ADC_REG:
18496 case TYPE_ADC_IMM:
18497 case TYPE_ADCS_REG:
18498 case TYPE_ADCS_IMM:
18499 case TYPE_LOGIC_REG:
18500 case TYPE_LOGIC_IMM:
18501 case TYPE_CSEL:
18502 case TYPE_ADR:
18503 case TYPE_MOV_IMM:
18504 case TYPE_SHIFT_REG:
18505 case TYPE_SHIFT_IMM:
18506 case TYPE_BFM:
18507 case TYPE_RBIT:
18508 case TYPE_REV:
18509 case TYPE_EXTEND:
18510 return true;
18511
18512 default:;
18513 }
18514 }
18515 }
18516
18517 return false;
18518 }
18519
18520 /* Return true iff the instruction fusion described by OP is enabled. */
18521
18522 bool
18523 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18524 {
18525 return (aarch64_tune_params.fusible_ops & op) != 0;
18526 }
18527
18528 /* If MEM is in the form of [base+offset], extract the two parts
18529 of the address into BASE and OFFSET, otherwise return false
18530 after clearing BASE and OFFSET. */
18531
18532 bool
18533 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18534 {
18535 rtx addr;
18536
18537 gcc_assert (MEM_P (mem));
18538
18539 addr = XEXP (mem, 0);
18540
18541 if (REG_P (addr))
18542 {
18543 *base = addr;
18544 *offset = const0_rtx;
18545 return true;
18546 }
18547
18548 if (GET_CODE (addr) == PLUS
18549 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18550 {
18551 *base = XEXP (addr, 0);
18552 *offset = XEXP (addr, 1);
18553 return true;
18554 }
18555
18556 *base = NULL_RTX;
18557 *offset = NULL_RTX;
18558
18559 return false;
18560 }
18561
18562 /* Types for scheduling fusion. */
18563 enum sched_fusion_type
18564 {
18565 SCHED_FUSION_NONE = 0,
18566 SCHED_FUSION_LD_SIGN_EXTEND,
18567 SCHED_FUSION_LD_ZERO_EXTEND,
18568 SCHED_FUSION_LD,
18569 SCHED_FUSION_ST,
18570 SCHED_FUSION_NUM
18571 };
18572
18573 /* If INSN is a load or store with an address in the form of [base+offset],
18574 extract the two parts into BASE and OFFSET.  Return the scheduling
18575 fusion type of this INSN.  */
18576
18577 static enum sched_fusion_type
18578 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18579 {
18580 rtx x, dest, src;
18581 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18582
18583 gcc_assert (INSN_P (insn));
18584 x = PATTERN (insn);
18585 if (GET_CODE (x) != SET)
18586 return SCHED_FUSION_NONE;
18587
18588 src = SET_SRC (x);
18589 dest = SET_DEST (x);
18590
18591 machine_mode dest_mode = GET_MODE (dest);
18592
18593 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18594 return SCHED_FUSION_NONE;
18595
18596 if (GET_CODE (src) == SIGN_EXTEND)
18597 {
18598 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18599 src = XEXP (src, 0);
18600 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18601 return SCHED_FUSION_NONE;
18602 }
18603 else if (GET_CODE (src) == ZERO_EXTEND)
18604 {
18605 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18606 src = XEXP (src, 0);
18607 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18608 return SCHED_FUSION_NONE;
18609 }
18610
18611 if (GET_CODE (src) == MEM && REG_P (dest))
18612 extract_base_offset_in_addr (src, base, offset);
18613 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18614 {
18615 fusion = SCHED_FUSION_ST;
18616 extract_base_offset_in_addr (dest, base, offset);
18617 }
18618 else
18619 return SCHED_FUSION_NONE;
18620
18621 if (*base == NULL_RTX || *offset == NULL_RTX)
18622 fusion = SCHED_FUSION_NONE;
18623
18624 return fusion;
18625 }
18626
18627 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18628
18629 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18630 and PRI are only calculated for these instructions.  For other instructions,
18631 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
18632 types of instruction fusion can be added by returning different priorities.
18633
18634 It's important that irrelevant instructions get the largest FUSION_PRI. */
18635
18636 static void
18637 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18638 int *fusion_pri, int *pri)
18639 {
18640 int tmp, off_val;
18641 rtx base, offset;
18642 enum sched_fusion_type fusion;
18643
18644 gcc_assert (INSN_P (insn));
18645
18646 tmp = max_pri - 1;
18647 fusion = fusion_load_store (insn, &base, &offset);
18648 if (fusion == SCHED_FUSION_NONE)
18649 {
18650 *pri = tmp;
18651 *fusion_pri = tmp;
18652 return;
18653 }
18654
18655 /* Set FUSION_PRI according to fusion type and base register. */
18656 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
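  /* Loads/stores of the same fusion type and base register therefore get
     the same FUSION_PRI, which is what groups them as fusion candidates.  */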
18657
18658 /* Calculate PRI. */
18659 tmp /= 2;
18660
18661 /* INSN with smaller offset goes first. */
18662 off_val = (int)(INTVAL (offset));
18663 if (off_val >= 0)
18664 tmp -= (off_val & 0xfffff);
18665 else
18666 tmp += ((- off_val) & 0xfffff);
18667
18668 *pri = tmp;
18669 return;
18670 }
18671
18672 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18673 Adjust priority of sha1h instructions so they are scheduled before
18674 other SHA1 instructions. */
18675
18676 static int
18677 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18678 {
18679 rtx x = PATTERN (insn);
18680
18681 if (GET_CODE (x) == SET)
18682 {
18683 x = SET_SRC (x);
18684
18685 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18686 return priority + 10;
18687 }
18688
18689 return priority;
18690 }
18691
18692 /* Given OPERANDS of consecutive load/store, check if we can merge
18693 them into ldp/stp. LOAD is true if they are load instructions.
18694 MODE is the mode of memory operands. */
18695
18696 bool
18697 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18698 machine_mode mode)
18699 {
18700 HOST_WIDE_INT offval_1, offval_2, msize;
18701 enum reg_class rclass_1, rclass_2;
18702 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18703
18704 if (load)
18705 {
18706 mem_1 = operands[1];
18707 mem_2 = operands[3];
18708 reg_1 = operands[0];
18709 reg_2 = operands[2];
18710 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18711 if (REGNO (reg_1) == REGNO (reg_2))
18712 return false;
18713 }
18714 else
18715 {
18716 mem_1 = operands[0];
18717 mem_2 = operands[2];
18718 reg_1 = operands[1];
18719 reg_2 = operands[3];
18720 }
18721
18722 /* The mems cannot be volatile. */
18723 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18724 return false;
18725
18726 /* If we have SImode and slow unaligned ldp,
18727 check that the alignment is at least 8 bytes.  */
18728 if (mode == SImode
18729 && (aarch64_tune_params.extra_tuning_flags
18730 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18731 && !optimize_size
18732 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18733 return false;
18734
18735 /* Check if the addresses are in the form of [base+offset]. */
18736 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18737 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18738 return false;
18739 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18740 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18741 return false;
18742
18743 /* Check if the bases are the same.  */
18744 if (!rtx_equal_p (base_1, base_2))
18745 return false;
18746
18747 /* The operands must be of the same size. */
18748 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18749 GET_MODE_SIZE (GET_MODE (mem_2))));
18750
18751 offval_1 = INTVAL (offset_1);
18752 offval_2 = INTVAL (offset_2);
18753 /* We should only be trying this for fixed-sized modes. There is no
18754 SVE LDP/STP instruction. */
18755 msize = GET_MODE_SIZE (mode).to_constant ();
18756 /* Check if the offsets are consecutive. */
18757 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18758 return false;
18759
18760 /* Check if the addresses are clobbered by the load.  */
18761 if (load)
18762 {
18763 if (reg_mentioned_p (reg_1, mem_1))
18764 return false;
18765
18766 /* In increasing order, the last load can clobber the address. */
18767 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18768 return false;
18769 }
18770
18771 /* One of the memory accesses must be a mempair operand.
18772 If it is not the first one, they need to be swapped by the
18773 peephole. */
18774 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18775 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18776 return false;
18777
18778 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18779 rclass_1 = FP_REGS;
18780 else
18781 rclass_1 = GENERAL_REGS;
18782
18783 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18784 rclass_2 = FP_REGS;
18785 else
18786 rclass_2 = GENERAL_REGS;
18787
18788 /* Check if the registers are of the same class.  */
18789 if (rclass_1 != rclass_2)
18790 return false;
18791
18792 return true;
18793 }
18794
18795 /* Given OPERANDS of consecutive load/store that can be merged,
18796 swap them if they are not in ascending order. */
18797 void
18798 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18799 {
18800 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18801 HOST_WIDE_INT offval_1, offval_2;
18802
18803 if (load)
18804 {
18805 mem_1 = operands[1];
18806 mem_2 = operands[3];
18807 }
18808 else
18809 {
18810 mem_1 = operands[0];
18811 mem_2 = operands[2];
18812 }
18813
18814 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18815 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18816
18817 offval_1 = INTVAL (offset_1);
18818 offval_2 = INTVAL (offset_2);
18819
18820 if (offval_1 > offval_2)
18821 {
18822 /* Irrespective of whether this is a load or a store,
18823 we do the same swap. */
18824 std::swap (operands[0], operands[2]);
18825 std::swap (operands[1], operands[3]);
18826 }
18827 }
18828
18829 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18830 comparison between the two. */
18831 int
18832 aarch64_host_wide_int_compare (const void *x, const void *y)
18833 {
18834 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18835 * ((const HOST_WIDE_INT *) y));
18836 }
18837
18838 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18839 other pointing to a REG rtx containing an offset, compare the offsets
18840 of the two pairs.
18841
18842 Return:
18843
18844 1 iff offset (X) > offset (Y)
18845 0 iff offset (X) == offset (Y)
18846 -1 iff offset (X) < offset (Y) */
18847 int
18848 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18849 {
18850 const rtx * operands_1 = (const rtx *) x;
18851 const rtx * operands_2 = (const rtx *) y;
18852 rtx mem_1, mem_2, base, offset_1, offset_2;
18853
18854 if (MEM_P (operands_1[0]))
18855 mem_1 = operands_1[0];
18856 else
18857 mem_1 = operands_1[1];
18858
18859 if (MEM_P (operands_2[0]))
18860 mem_2 = operands_2[0];
18861 else
18862 mem_2 = operands_2[1];
18863
18864 /* Extract the offsets. */
18865 extract_base_offset_in_addr (mem_1, &base, &offset_1);
18866 extract_base_offset_in_addr (mem_2, &base, &offset_2);
18867
18868 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18869
18870 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18871 }
18872
18873 /* Given OPERANDS of consecutive load/store, check if we can merge
18874 them into ldp/stp by adjusting the offset. LOAD is true if they
18875 are load instructions. MODE is the mode of memory operands.
18876
18877 Given below consecutive stores:
18878
18879 str w1, [xb, 0x100]
18880 str w1, [xb, 0x104]
18881 str w1, [xb, 0x108]
18882 str w1, [xb, 0x10c]
18883
18884 Though the offsets are out of the range supported by stp, we can
18885 still pair them after adjusting the offset, like:
18886
18887 add scratch, xb, 0x100
18888 stp w1, w1, [scratch]
18889 stp w1, w1, [scratch, 0x8]
18890
18891 The peephole patterns detecting this opportunity should guarantee
18892 the scratch register is available.  */
18893
18894 bool
18895 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18896 scalar_mode mode)
18897 {
18898 const int num_insns = 4;
18899 enum reg_class rclass;
18900 HOST_WIDE_INT offvals[num_insns], msize;
18901 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18902
18903 if (load)
18904 {
18905 for (int i = 0; i < num_insns; i++)
18906 {
18907 reg[i] = operands[2 * i];
18908 mem[i] = operands[2 * i + 1];
18909
18910 gcc_assert (REG_P (reg[i]));
18911 }
18912
18913 /* Do not attempt to merge the loads if the loads clobber each other. */
18914 for (int i = 0; i < 8; i += 2)
18915 for (int j = i + 2; j < 8; j += 2)
18916 if (reg_overlap_mentioned_p (operands[i], operands[j]))
18917 return false;
18918 }
18919 else
18920 for (int i = 0; i < num_insns; i++)
18921 {
18922 mem[i] = operands[2 * i];
18923 reg[i] = operands[2 * i + 1];
18924 }
18925
18926 /* Skip if memory operand is by itself valid for ldp/stp. */
18927 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18928 return false;
18929
18930 for (int i = 0; i < num_insns; i++)
18931 {
18932 /* The mems cannot be volatile. */
18933 if (MEM_VOLATILE_P (mem[i]))
18934 return false;
18935
18936 /* Check if the addresses are in the form of [base+offset]. */
18937 extract_base_offset_in_addr (mem[i], base + i, offset + i);
18938 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18939 return false;
18940 }
18941
18942 /* Check if the registers are of the same class.  */
18943 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18944 ? FP_REGS : GENERAL_REGS;
18945
18946 for (int i = 1; i < num_insns; i++)
18947 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18948 {
18949 if (rclass != FP_REGS)
18950 return false;
18951 }
18952 else
18953 {
18954 if (rclass != GENERAL_REGS)
18955 return false;
18956 }
18957
18958 /* Only the last register in the order in which they occur
18959 may be clobbered by the load. */
18960 if (rclass == GENERAL_REGS && load)
18961 for (int i = 0; i < num_insns - 1; i++)
18962 if (reg_mentioned_p (reg[i], mem[i]))
18963 return false;
18964
18965 /* Check if the bases are the same.  */
18966 for (int i = 0; i < num_insns - 1; i++)
18967 if (!rtx_equal_p (base[i], base[i + 1]))
18968 return false;
18969
18970 for (int i = 0; i < num_insns; i++)
18971 offvals[i] = INTVAL (offset[i]);
18972
18973 msize = GET_MODE_SIZE (mode);
18974
18975 /* Check if the offsets can be put in the right order to do a ldp/stp. */
18976 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18977 aarch64_host_wide_int_compare);
18978
18979 if (!(offvals[1] == offvals[0] + msize
18980 && offvals[3] == offvals[2] + msize))
18981 return false;
18982
18983 /* Check that offsets are within range of each other. The ldp/stp
18984 instructions have 7 bit immediate offsets, so use 0x80. */
18985 if (offvals[2] - offvals[0] >= msize * 0x80)
18986 return false;
18987
18988 /* The offsets must be aligned with respect to each other. */
18989 if (offvals[0] % msize != offvals[2] % msize)
18990 return false;
18991
18992 /* If we have SImode and slow unaligned ldp,
18993 check that the alignment is at least 8 bytes.  */
18994 if (mode == SImode
18995 && (aarch64_tune_params.extra_tuning_flags
18996 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18997 && !optimize_size
18998 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18999 return false;
19000
19001 return true;
19002 }
19003
19004 /* Given OPERANDS of consecutive load/store, this function pairs them
19005 into LDP/STP after adjusting the offset. It depends on the fact
19006 that the operands can be sorted so the offsets are correct for STP.
19007 MODE is the mode of memory operands. CODE is the rtl operator
19008 which should be applied to all memory operands, it's SIGN_EXTEND,
19009 ZERO_EXTEND or UNKNOWN. */
19010
19011 bool
19012 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19013 scalar_mode mode, RTX_CODE code)
19014 {
19015 rtx base, offset_1, offset_3, t1, t2;
19016 rtx mem_1, mem_2, mem_3, mem_4;
19017 rtx temp_operands[8];
19018 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19019 stp_off_upper_limit, stp_off_lower_limit, msize;
19020
19021 /* We make changes on a copy as we may still bail out. */
19022 for (int i = 0; i < 8; i ++)
19023 temp_operands[i] = operands[i];
19024
19025 /* Sort the operands. */
19026 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19027
19028 /* Copy the memory operands so that if we have to bail for some
19029 reason the original addresses are unchanged. */
19030 if (load)
19031 {
19032 mem_1 = copy_rtx (temp_operands[1]);
19033 mem_2 = copy_rtx (temp_operands[3]);
19034 mem_3 = copy_rtx (temp_operands[5]);
19035 mem_4 = copy_rtx (temp_operands[7]);
19036 }
19037 else
19038 {
19039 mem_1 = copy_rtx (temp_operands[0]);
19040 mem_2 = copy_rtx (temp_operands[2]);
19041 mem_3 = copy_rtx (temp_operands[4]);
19042 mem_4 = copy_rtx (temp_operands[6]);
19043 gcc_assert (code == UNKNOWN);
19044 }
19045
19046 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19047 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19048 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19049 && offset_3 != NULL_RTX);
19050
19051 /* Adjust offset so it can fit in LDP/STP instruction. */
19052 msize = GET_MODE_SIZE (mode);
19053 stp_off_upper_limit = msize * (0x40 - 1);
19054 stp_off_lower_limit = - msize * 0x40;
19055
19056 off_val_1 = INTVAL (offset_1);
19057 off_val_3 = INTVAL (offset_3);
19058
19059 /* The base offset is optimally half way between the two STP/LDP offsets. */
19060 if (msize <= 4)
19061 base_off = (off_val_1 + off_val_3) / 2;
19062 else
19063 /* However, due to issues with negative LDP/STP offset generation for
19064 larger modes (DF, DI and vector modes), we must not use negative
19065 addresses smaller than 9 signed unadjusted bits can store.  This
19066 provides the most range in this case.  */
19067 base_off = off_val_1;
19068
19069 /* Adjust the base so that it is aligned with the addresses but still
19070 optimal. */
19071 if (base_off % msize != off_val_1 % msize)
19072 /* Fix the offset, bearing in mind we want to make it bigger not
19073 smaller. */
19074 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19075 else if (msize <= 4)
19076 /* The negative range of LDP/STP is one larger than the positive range. */
19077 base_off += msize;
19078
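  /* As a rough illustration, for the four 4-byte stores at xb+0x100 to
     xb+0x10c in the comment above aarch64_operands_adjust_ok_for_ldpstp,
     this picks a base offset of 0x108, giving STP offsets of -8 and 0.  */
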
19079 /* Check if base offset is too big or too small. We can attempt to resolve
19080 this issue by setting it to the maximum value and seeing if the offsets
19081 still fit. */
19082 if (base_off >= 0x1000)
19083 {
19084 base_off = 0x1000 - 1;
19085 /* We must still make sure that the base offset is aligned with respect
19086 to the address.  But it may not be made any bigger.  */
19087 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19088 }
19089
19090 /* Likewise for the case where the base is too small. */
19091 if (base_off <= -0x1000)
19092 {
19093 base_off = -0x1000 + 1;
19094 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19095 }
19096
19097 /* Offset of the first STP/LDP. */
19098 new_off_1 = off_val_1 - base_off;
19099
19100 /* Offset of the second STP/LDP. */
19101 new_off_3 = off_val_3 - base_off;
19102
19103 /* The offsets must be within the range of the LDP/STP instructions. */
19104 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19105 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19106 return false;
19107
19108 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19109 new_off_1), true);
19110 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19111 new_off_1 + msize), true);
19112 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19113 new_off_3), true);
19114 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19115 new_off_3 + msize), true);
19116
19117 if (!aarch64_mem_pair_operand (mem_1, mode)
19118 || !aarch64_mem_pair_operand (mem_3, mode))
19119 return false;
19120
19121 if (code == ZERO_EXTEND)
19122 {
19123 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19124 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19125 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19126 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19127 }
19128 else if (code == SIGN_EXTEND)
19129 {
19130 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19131 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19132 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19133 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19134 }
19135
19136 if (load)
19137 {
19138 operands[0] = temp_operands[0];
19139 operands[1] = mem_1;
19140 operands[2] = temp_operands[2];
19141 operands[3] = mem_2;
19142 operands[4] = temp_operands[4];
19143 operands[5] = mem_3;
19144 operands[6] = temp_operands[6];
19145 operands[7] = mem_4;
19146 }
19147 else
19148 {
19149 operands[0] = mem_1;
19150 operands[1] = temp_operands[1];
19151 operands[2] = mem_2;
19152 operands[3] = temp_operands[3];
19153 operands[4] = mem_3;
19154 operands[5] = temp_operands[5];
19155 operands[6] = mem_4;
19156 operands[7] = temp_operands[7];
19157 }
19158
19159 /* Emit adjusting instruction. */
19160 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19161 /* Emit ldp/stp instructions. */
19162 t1 = gen_rtx_SET (operands[0], operands[1]);
19163 t2 = gen_rtx_SET (operands[2], operands[3]);
19164 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19165 t1 = gen_rtx_SET (operands[4], operands[5]);
19166 t2 = gen_rtx_SET (operands[6], operands[7]);
19167 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19168 return true;
19169 }
19170
19171 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19172 it isn't worth branching around empty masked ops (including masked
19173 stores). */
19174
19175 static bool
19176 aarch64_empty_mask_is_expensive (unsigned)
19177 {
19178 return false;
19179 }
19180
19181 /* Return true if a pseudo register should be created and used to hold
19182 the GOT address for PIC code.  */
19183
19184 bool
19185 aarch64_use_pseudo_pic_reg (void)
19186 {
19187 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19188 }
19189
19190 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19191
19192 static int
19193 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19194 {
19195 switch (XINT (x, 1))
19196 {
19197 case UNSPEC_GOTSMALLPIC:
19198 case UNSPEC_GOTSMALLPIC28K:
19199 case UNSPEC_GOTTINYPIC:
19200 return 0;
19201 default:
19202 break;
19203 }
19204
19205 return default_unspec_may_trap_p (x, flags);
19206 }
19207
19208
19209 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19210 return the log2 of that value. Otherwise return -1. */
19211
19212 int
19213 aarch64_fpconst_pow_of_2 (rtx x)
19214 {
19215 const REAL_VALUE_TYPE *r;
19216
19217 if (!CONST_DOUBLE_P (x))
19218 return -1;
19219
19220 r = CONST_DOUBLE_REAL_VALUE (x);
19221
19222 if (REAL_VALUE_NEGATIVE (*r)
19223 || REAL_VALUE_ISNAN (*r)
19224 || REAL_VALUE_ISINF (*r)
19225 || !real_isinteger (r, DFmode))
19226 return -1;
19227
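  /* For example, 8.0 yields 3, while 0.5, 3.0 and negative or non-finite
     values yield -1 (via the checks above or exact_log2).  */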
19228 return exact_log2 (real_to_integer (r));
19229 }
19230
19231 /* If X is a vector of equal CONST_DOUBLE values and that value is
19232 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19233
19234 int
19235 aarch64_vec_fpconst_pow_of_2 (rtx x)
19236 {
19237 int nelts;
19238 if (GET_CODE (x) != CONST_VECTOR
19239 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19240 return -1;
19241
19242 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19243 return -1;
19244
19245 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19246 if (firstval <= 0)
19247 return -1;
19248
19249 for (int i = 1; i < nelts; i++)
19250 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19251 return -1;
19252
19253 return firstval;
19254 }
19255
19256 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19257 to float.
19258
19259 __fp16 always promotes through this hook.
19260 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19261 through the generic excess precision logic rather than here. */
19262
19263 static tree
19264 aarch64_promoted_type (const_tree t)
19265 {
19266 if (SCALAR_FLOAT_TYPE_P (t)
19267 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19268 return float_type_node;
19269
19270 return NULL_TREE;
19271 }
19272
19273 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19274
19275 static bool
19276 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19277 optimization_type opt_type)
19278 {
19279 switch (op)
19280 {
19281 case rsqrt_optab:
19282 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19283
19284 default:
19285 return true;
19286 }
19287 }
19288
19289 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19290
19291 static unsigned int
19292 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19293 int *offset)
19294 {
19295 /* Polynomial invariant 1 == (VG / 2) - 1. */
19296 gcc_assert (i == 1);
19297 *factor = 2;
19298 *offset = 1;
19299 return AARCH64_DWARF_VG;
19300 }
19301
19302 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19303 if MODE is HFmode, and punt to the generic implementation otherwise. */
19304
19305 static bool
19306 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19307 {
19308 return (mode == HFmode
19309 ? true
19310 : default_libgcc_floating_mode_supported_p (mode));
19311 }
19312
19313 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19314 if MODE is HFmode, and punt to the generic implementation otherwise. */
19315
19316 static bool
19317 aarch64_scalar_mode_supported_p (scalar_mode mode)
19318 {
19319 return (mode == HFmode
19320 ? true
19321 : default_scalar_mode_supported_p (mode));
19322 }
19323
19324 /* Set the value of FLT_EVAL_METHOD.
19325 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19326
19327 0: evaluate all operations and constants, whose semantic type has at
19328 most the range and precision of type float, to the range and
19329 precision of float; evaluate all other operations and constants to
19330 the range and precision of the semantic type;
19331
19332 N, where _FloatN is a supported interchange floating type
19333 evaluate all operations and constants, whose semantic type has at
19334 most the range and precision of _FloatN type, to the range and
19335 precision of the _FloatN type; evaluate all other operations and
19336 constants to the range and precision of the semantic type;
19337
19338 If we have the ARMv8.2-A extensions then we support _Float16 in native
19339 precision, so we should set this to 16. Otherwise, we support the type,
19340 but want to evaluate expressions in float precision, so set this to
19341 0. */
19342
19343 static enum flt_eval_method
19344 aarch64_excess_precision (enum excess_precision_type type)
19345 {
19346 switch (type)
19347 {
19348 case EXCESS_PRECISION_TYPE_FAST:
19349 case EXCESS_PRECISION_TYPE_STANDARD:
19350 /* We can calculate either in 16-bit range and precision or
19351 32-bit range and precision. Make that decision based on whether
19352 we have native support for the ARMv8.2-A 16-bit floating-point
19353 instructions or not. */
19354 return (TARGET_FP_F16INST
19355 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19356 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19357 case EXCESS_PRECISION_TYPE_IMPLICIT:
19358 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19359 default:
19360 gcc_unreachable ();
19361 }
19362 return FLT_EVAL_METHOD_UNPREDICTABLE;
19363 }
19364
19365 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19366 scheduled for speculative execution. Reject the long-running division
19367 and square-root instructions. */
19368
19369 static bool
19370 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19371 {
19372 switch (get_attr_type (insn))
19373 {
19374 case TYPE_SDIV:
19375 case TYPE_UDIV:
19376 case TYPE_FDIVS:
19377 case TYPE_FDIVD:
19378 case TYPE_FSQRTS:
19379 case TYPE_FSQRTD:
19380 case TYPE_NEON_FP_SQRT_S:
19381 case TYPE_NEON_FP_SQRT_D:
19382 case TYPE_NEON_FP_SQRT_S_Q:
19383 case TYPE_NEON_FP_SQRT_D_Q:
19384 case TYPE_NEON_FP_DIV_S:
19385 case TYPE_NEON_FP_DIV_D:
19386 case TYPE_NEON_FP_DIV_S_Q:
19387 case TYPE_NEON_FP_DIV_D_Q:
19388 return false;
19389 default:
19390 return true;
19391 }
19392 }
19393
19394 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19395
19396 static int
19397 aarch64_compute_pressure_classes (reg_class *classes)
19398 {
19399 int i = 0;
19400 classes[i++] = GENERAL_REGS;
19401 classes[i++] = FP_REGS;
19402 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19403 registers need to go in PR_LO_REGS at some point during their
19404 lifetime. Splitting it into two halves has the effect of making
19405 all predicates count against PR_LO_REGS, so that we try whenever
19406 possible to restrict the number of live predicates to 8. This
19407 greatly reduces the amount of spilling in certain loops. */
19408 classes[i++] = PR_LO_REGS;
19409 classes[i++] = PR_HI_REGS;
19410 return i;
19411 }
19412
19413 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19414
19415 static bool
19416 aarch64_can_change_mode_class (machine_mode from,
19417 machine_mode to, reg_class_t)
19418 {
19419 if (BYTES_BIG_ENDIAN)
19420 {
19421 bool from_sve_p = aarch64_sve_data_mode_p (from);
19422 bool to_sve_p = aarch64_sve_data_mode_p (to);
19423
19424 /* Don't allow changes between SVE data modes and non-SVE modes.
19425 See the comment at the head of aarch64-sve.md for details. */
19426 if (from_sve_p != to_sve_p)
19427 return false;
19428
19429 /* Don't allow changes in element size: lane 0 of the new vector
19430 would not then be lane 0 of the old vector. See the comment
19431 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19432 description.
19433
19434 In the worst case, this forces a register to be spilled in
19435 one mode and reloaded in the other, which handles the
19436 endianness correctly. */
19437 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19438 return false;
19439 }
19440 return true;
19441 }
19442
19443 /* Implement TARGET_EARLY_REMAT_MODES. */
19444
19445 static void
19446 aarch64_select_early_remat_modes (sbitmap modes)
19447 {
19448 /* SVE values are not normally live across a call, so it should be
19449 worth doing early rematerialization even in VL-specific mode. */
19450 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19451 {
19452 machine_mode mode = (machine_mode) i;
19453 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19454 if (vec_flags & VEC_ANY_SVE)
19455 bitmap_set_bit (modes, i);
19456 }
19457 }
19458
19459 /* Override the default target speculation_safe_value. */
19460 static rtx
19461 aarch64_speculation_safe_value (machine_mode mode,
19462 rtx result, rtx val, rtx failval)
19463 {
19464 /* Maybe we should warn if falling back to hard barriers. They are
19465 likely to be noticeably more expensive than the alternative below. */
19466 if (!aarch64_track_speculation)
19467 return default_speculation_safe_value (mode, result, val, failval);
19468
19469 if (!REG_P (val))
19470 val = copy_to_mode_reg (mode, val);
19471
19472 if (!aarch64_reg_or_zero (failval, mode))
19473 failval = copy_to_mode_reg (mode, failval);
19474
19475 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19476 return result;
19477 }
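
/* Illustrative use of the generic builtin that ends up here, with a
   hypothetical bounds-checked load:

     int
     load_checked (int *array, unsigned int i, unsigned int n)
     {
       if (i < n)
         return array[__builtin_speculation_safe_value (i, 0)];
       return 0;
     }

   With -mtrack-speculation the index is forced to the fail value (0) on a
   mispredicted path via the despeculate_copy sequence emitted above;
   otherwise the default expansion falls back to the hard barrier mentioned
   in the comment at the top of this function.  */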
19478
19479 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19480 Look into the tuning structure for an estimate.
19481 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19482 Advanced SIMD 128 bits. */
19483
19484 static HOST_WIDE_INT
19485 aarch64_estimated_poly_value (poly_int64 val)
19486 {
19487 enum aarch64_sve_vector_bits_enum width_source
19488 = aarch64_tune_params.sve_width;
19489
19490 /* If the tuning structure provides no estimate, use the default. */
19491 if (width_source == SVE_SCALABLE)
19492 return default_estimated_poly_value (val);
19493
19494 HOST_WIDE_INT over_128 = width_source - 128;
19495 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19496 }
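
/* Worked example: if the tuning structure reports sve_width == SVE_256,
   over_128 is 128, so a poly_int64 of (16, 16) -- the size in bytes of one
   VNx16QI vector -- is estimated as 16 + 16 * 128 / 128 == 32, i.e. a
   256-bit vector.  */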
19497
19498
19499 /* Return true for types that could be supported as SIMD return or
19500 argument types. */
19501
19502 static bool
19503 supported_simd_type (tree t)
19504 {
19505 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19506 {
19507 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19508 return s == 1 || s == 2 || s == 4 || s == 8;
19509 }
19510 return false;
19511 }
19512
19513 /* Return true for types that currently are supported as SIMD return
19514 or argument types. */
19515
19516 static bool
19517 currently_supported_simd_type (tree t, tree b)
19518 {
19519 if (COMPLEX_FLOAT_TYPE_P (t))
19520 return false;
19521
19522 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19523 return false;
19524
19525 return supported_simd_type (t);
19526 }
19527
19528 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19529
19530 static int
19531 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19532 struct cgraph_simd_clone *clonei,
19533 tree base_type, int num)
19534 {
19535 tree t, ret_type, arg_type;
19536 unsigned int elt_bits, vec_bits, count;
19537
19538 if (!TARGET_SIMD)
19539 return 0;
19540
19541 if (clonei->simdlen
19542 && (clonei->simdlen < 2
19543 || clonei->simdlen > 1024
19544 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19545 {
19546 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19547 "unsupported simdlen %d", clonei->simdlen);
19548 return 0;
19549 }
19550
19551 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19552 if (TREE_CODE (ret_type) != VOID_TYPE
19553 && !currently_supported_simd_type (ret_type, base_type))
19554 {
19555 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19556 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19557 "GCC does not currently support mixed size types "
19558 "for %<simd%> functions");
19559 else if (supported_simd_type (ret_type))
19560 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19561 "GCC does not currently support return type %qT "
19562 "for %<simd%> functions", ret_type);
19563 else
19564 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19565 "unsupported return type %qT for %<simd%> functions",
19566 ret_type);
19567 return 0;
19568 }
19569
19570 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19571 {
19572 arg_type = TREE_TYPE (t);
19573
19574 if (!currently_supported_simd_type (arg_type, base_type))
19575 {
19576 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19577 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19578 "GCC does not currently support mixed size types "
19579 "for %<simd%> functions");
19580 else
19581 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19582 "GCC does not currently support argument type %qT "
19583 "for %<simd%> functions", arg_type);
19584 return 0;
19585 }
19586 }
19587
19588 clonei->vecsize_mangle = 'n';
19589 clonei->mask_mode = VOIDmode;
19590 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19591 if (clonei->simdlen == 0)
19592 {
19593 count = 2;
19594 vec_bits = (num == 0 ? 64 : 128);
19595 clonei->simdlen = vec_bits / elt_bits;
19596 }
19597 else
19598 {
19599 count = 1;
19600 vec_bits = clonei->simdlen * elt_bits;
19601 if (vec_bits != 64 && vec_bits != 128)
19602 {
19603 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19604 "GCC does not currently support simdlen %d for type %qT",
19605 clonei->simdlen, base_type);
19606 return 0;
19607 }
19608 }
19609 clonei->vecsize_int = vec_bits;
19610 clonei->vecsize_float = vec_bits;
19611 return count;
19612 }
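
/* For example, given a hypothetical declaration

     #pragma omp declare simd
     float f (float x);

   with no simdlen clause, the base type is 32 bits wide, so the code above
   requests two clones: a 64-bit one with simdlen 2 (num == 0) and a 128-bit
   one with simdlen 4 (num == 1).  */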
19613
19614 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19615
19616 static void
19617 aarch64_simd_clone_adjust (struct cgraph_node *node)
19618 {
19619 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19620 use the correct ABI. */
19621
19622 tree t = TREE_TYPE (node->decl);
19623 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19624 TYPE_ATTRIBUTES (t));
19625 }
19626
19627 /* Implement TARGET_SIMD_CLONE_USABLE. */
19628
19629 static int
19630 aarch64_simd_clone_usable (struct cgraph_node *node)
19631 {
19632 switch (node->simdclone->vecsize_mangle)
19633 {
19634 case 'n':
19635 if (!TARGET_SIMD)
19636 return -1;
19637 return 0;
19638 default:
19639 gcc_unreachable ();
19640 }
19641 }
19642
19643 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19644
19645 static int
19646 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19647 {
19648 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19649 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19650 return 0;
19651 return 1;
19652 }
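
/* For illustration: with the check above, assigning a hypothetical

     __attribute__ ((aarch64_vector_pcs)) void f (void);

   to a plain "void (*p) (void)" is flagged as an incompatible pointer
   assignment, since the two function types follow different calling
   conventions.  */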
19653
19654 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19655
19656 static const char *
19657 aarch64_get_multilib_abi_name (void)
19658 {
19659 if (TARGET_BIG_END)
19660 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19661 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19662 }
19663
19664 /* Implement TARGET_STACK_PROTECT_GUARD. Use the default implementation
19665 when the guard is based on a global variable; otherwise return a null
19666 tree. */
19667 static tree
19668 aarch64_stack_protect_guard (void)
19669 {
19670 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19671 return default_stack_protect_guard ();
19672
19673 return NULL_TREE;
19674 }
19675
19676 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19677 section at the end if needed. */
19678 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19679 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19680 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19681 void
19682 aarch64_file_end_indicate_exec_stack ()
19683 {
19684 file_end_indicate_exec_stack ();
19685
19686 unsigned feature_1_and = 0;
19687 if (aarch64_bti_enabled ())
19688 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19689
19690 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19691 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19692
19693 if (feature_1_and)
19694 {
19695 /* Generate .note.gnu.property section. */
19696 switch_to_section (get_section (".note.gnu.property",
19697 SECTION_NOTYPE, NULL));
19698
19699 /* PT_NOTE header: namesz, descsz, type.
19700 namesz = 4 ("GNU\0")
19701 descsz = 16 (Size of the program property array)
19702 [(12 + padding) * Number of array elements]
19703 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19704 assemble_align (POINTER_SIZE);
19705 assemble_integer (GEN_INT (4), 4, 32, 1);
19706 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19707 assemble_integer (GEN_INT (5), 4, 32, 1);
19708
19709 /* PT_NOTE name. */
19710 assemble_string ("GNU", 4);
19711
19712 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19713 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19714 datasz = 4
19715 data = feature_1_and. */
19716 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19717 assemble_integer (GEN_INT (4), 4, 32, 1);
19718 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19719
19720 /* Pad the size of the note to the required alignment. */
19721 assemble_align (POINTER_SIZE);
19722 }
19723 }
19724 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19725 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19726 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
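
/* For illustration, when both BTI and return-address signing are enabled
   the note emitted above looks roughly like:

	.section .note.gnu.property
	.align	3
	.word	4		// namesz ("GNU\0")
	.word	16		// descsz (ROUND_UP (12, 8))
	.word	5		// NT_GNU_PROPERTY_TYPE_0
	.string	"GNU"
	.word	0xc0000000	// GNU_PROPERTY_AARCH64_FEATURE_1_AND
	.word	4		// datasz
	.word	3		// BTI | PAC
	.align	3		// pad the descriptor to 8 bytes.  */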
19727
19728 /* Target-specific selftests. */
19729
19730 #if CHECKING_P
19731
19732 namespace selftest {
19733
19734 /* Selftest for the RTL loader.
19735 Verify that the RTL loader copes with a dump from
19736 print_rtx_function. This is essentially just a test that class
19737 function_reader can handle a real dump, but it also verifies
19738 that lookup_reg_by_dump_name correctly handles hard regs.
19739 The presence of hard reg names in the dump means that the test is
19740 target-specific, hence it is in this file. */
19741
19742 static void
19743 aarch64_test_loading_full_dump ()
19744 {
19745 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19746
19747 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19748
19749 rtx_insn *insn_1 = get_insn_by_uid (1);
19750 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19751
19752 rtx_insn *insn_15 = get_insn_by_uid (15);
19753 ASSERT_EQ (INSN, GET_CODE (insn_15));
19754 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19755
19756 /* Verify crtl->return_rtx. */
19757 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19758 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19759 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19760 }
19761
19762 /* Run all target-specific selftests. */
19763
19764 static void
19765 aarch64_run_selftests (void)
19766 {
19767 aarch64_test_loading_full_dump ();
19768 }
19769
19770 } // namespace selftest
19771
19772 #endif /* #if CHECKING_P */
19773
19774 #undef TARGET_STACK_PROTECT_GUARD
19775 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19776
19777 #undef TARGET_ADDRESS_COST
19778 #define TARGET_ADDRESS_COST aarch64_address_cost
19779
19780 /* This hook determines whether unnamed bitfields affect the alignment
19781 of the containing structure. The hook returns true if the structure
19782 should inherit the alignment requirements of an unnamed bitfield's
19783 type. */
19784 #undef TARGET_ALIGN_ANON_BITFIELD
19785 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19786
19787 #undef TARGET_ASM_ALIGNED_DI_OP
19788 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19789
19790 #undef TARGET_ASM_ALIGNED_HI_OP
19791 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19792
19793 #undef TARGET_ASM_ALIGNED_SI_OP
19794 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19795
19796 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19797 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19798 hook_bool_const_tree_hwi_hwi_const_tree_true
19799
19800 #undef TARGET_ASM_FILE_START
19801 #define TARGET_ASM_FILE_START aarch64_start_file
19802
19803 #undef TARGET_ASM_OUTPUT_MI_THUNK
19804 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19805
19806 #undef TARGET_ASM_SELECT_RTX_SECTION
19807 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19808
19809 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19810 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19811
19812 #undef TARGET_BUILD_BUILTIN_VA_LIST
19813 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19814
19815 #undef TARGET_CALLEE_COPIES
19816 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19817
19818 #undef TARGET_CAN_ELIMINATE
19819 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19820
19821 #undef TARGET_CAN_INLINE_P
19822 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19823
19824 #undef TARGET_CANNOT_FORCE_CONST_MEM
19825 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19826
19827 #undef TARGET_CASE_VALUES_THRESHOLD
19828 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19829
19830 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19831 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19832
19833 /* Only the least significant bit is used for initialization guard
19834 variables. */
19835 #undef TARGET_CXX_GUARD_MASK_BIT
19836 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19837
19838 #undef TARGET_C_MODE_FOR_SUFFIX
19839 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19840
19841 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19842 #undef TARGET_DEFAULT_TARGET_FLAGS
19843 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19844 #endif
19845
19846 #undef TARGET_CLASS_MAX_NREGS
19847 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19848
19849 #undef TARGET_BUILTIN_DECL
19850 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19851
19852 #undef TARGET_BUILTIN_RECIPROCAL
19853 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19854
19855 #undef TARGET_C_EXCESS_PRECISION
19856 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19857
19858 #undef TARGET_EXPAND_BUILTIN
19859 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19860
19861 #undef TARGET_EXPAND_BUILTIN_VA_START
19862 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19863
19864 #undef TARGET_FOLD_BUILTIN
19865 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19866
19867 #undef TARGET_FUNCTION_ARG
19868 #define TARGET_FUNCTION_ARG aarch64_function_arg
19869
19870 #undef TARGET_FUNCTION_ARG_ADVANCE
19871 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19872
19873 #undef TARGET_FUNCTION_ARG_BOUNDARY
19874 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19875
19876 #undef TARGET_FUNCTION_ARG_PADDING
19877 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19878
19879 #undef TARGET_GET_RAW_RESULT_MODE
19880 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19881 #undef TARGET_GET_RAW_ARG_MODE
19882 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19883
19884 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19885 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19886
19887 #undef TARGET_FUNCTION_VALUE
19888 #define TARGET_FUNCTION_VALUE aarch64_function_value
19889
19890 #undef TARGET_FUNCTION_VALUE_REGNO_P
19891 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19892
19893 #undef TARGET_GIMPLE_FOLD_BUILTIN
19894 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19895
19896 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19897 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19898
19899 #undef TARGET_INIT_BUILTINS
19900 #define TARGET_INIT_BUILTINS aarch64_init_builtins
19901
19902 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19903 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19904 aarch64_ira_change_pseudo_allocno_class
19905
19906 #undef TARGET_LEGITIMATE_ADDRESS_P
19907 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19908
19909 #undef TARGET_LEGITIMATE_CONSTANT_P
19910 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19911
19912 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19913 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19914 aarch64_legitimize_address_displacement
19915
19916 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19917 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19918
19919 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19920 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19921 aarch64_libgcc_floating_mode_supported_p
19922
19923 #undef TARGET_MANGLE_TYPE
19924 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19925
19926 #undef TARGET_MEMORY_MOVE_COST
19927 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19928
19929 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19930 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19931
19932 #undef TARGET_MUST_PASS_IN_STACK
19933 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19934
19935 /* This target hook should return true if accesses to volatile bitfields
19936 should use the narrowest mode possible. It should return false if these
19937 accesses should use the bitfield container type. */
19938 #undef TARGET_NARROW_VOLATILE_BITFIELD
19939 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19940
19941 #undef TARGET_OPTION_OVERRIDE
19942 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19943
19944 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19945 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19946 aarch64_override_options_after_change
19947
19948 #undef TARGET_OPTION_SAVE
19949 #define TARGET_OPTION_SAVE aarch64_option_save
19950
19951 #undef TARGET_OPTION_RESTORE
19952 #define TARGET_OPTION_RESTORE aarch64_option_restore
19953
19954 #undef TARGET_OPTION_PRINT
19955 #define TARGET_OPTION_PRINT aarch64_option_print
19956
19957 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19958 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19959
19960 #undef TARGET_SET_CURRENT_FUNCTION
19961 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19962
19963 #undef TARGET_PASS_BY_REFERENCE
19964 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19965
19966 #undef TARGET_PREFERRED_RELOAD_CLASS
19967 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19968
19969 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19970 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19971
19972 #undef TARGET_PROMOTED_TYPE
19973 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19974
19975 #undef TARGET_SECONDARY_RELOAD
19976 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19977
19978 #undef TARGET_SHIFT_TRUNCATION_MASK
19979 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19980
19981 #undef TARGET_SETUP_INCOMING_VARARGS
19982 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19983
19984 #undef TARGET_STRUCT_VALUE_RTX
19985 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
19986
19987 #undef TARGET_REGISTER_MOVE_COST
19988 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19989
19990 #undef TARGET_RETURN_IN_MEMORY
19991 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19992
19993 #undef TARGET_RETURN_IN_MSB
19994 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19995
19996 #undef TARGET_RTX_COSTS
19997 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19998
19999 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20000 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20001
20002 #undef TARGET_SCHED_ISSUE_RATE
20003 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20004
20005 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20006 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20007 aarch64_sched_first_cycle_multipass_dfa_lookahead
20008
20009 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20010 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20011 aarch64_first_cycle_multipass_dfa_lookahead_guard
20012
20013 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20014 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20015 aarch64_get_separate_components
20016
20017 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20018 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20019 aarch64_components_for_bb
20020
20021 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20022 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20023 aarch64_disqualify_components
20024
20025 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20026 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20027 aarch64_emit_prologue_components
20028
20029 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20030 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20031 aarch64_emit_epilogue_components
20032
20033 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20034 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20035 aarch64_set_handled_components
20036
20037 #undef TARGET_TRAMPOLINE_INIT
20038 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20039
20040 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20041 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20042
20043 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20044 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20045
20046 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20047 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20048 aarch64_builtin_support_vector_misalignment
20049
20050 #undef TARGET_ARRAY_MODE
20051 #define TARGET_ARRAY_MODE aarch64_array_mode
20052
20053 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20054 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20055
20056 #undef TARGET_VECTORIZE_ADD_STMT_COST
20057 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20058
20059 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20060 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20061 aarch64_builtin_vectorization_cost
20062
20063 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20064 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20065
20066 #undef TARGET_VECTORIZE_BUILTINS
20067 #define TARGET_VECTORIZE_BUILTINS
20068
20069 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20070 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20071 aarch64_builtin_vectorized_function
20072
20073 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20074 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20075 aarch64_autovectorize_vector_sizes
20076
20077 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20078 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20079 aarch64_atomic_assign_expand_fenv
20080
20081 /* Section anchor support. */
20082
20083 #undef TARGET_MIN_ANCHOR_OFFSET
20084 #define TARGET_MIN_ANCHOR_OFFSET -256
20085
20086 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20087 byte offset; we can do much more for larger data types, but have no way
20088 to determine the size of the access. We assume accesses are aligned. */
20089 #undef TARGET_MAX_ANCHOR_OFFSET
20090 #define TARGET_MAX_ANCHOR_OFFSET 4095
20091
20092 #undef TARGET_VECTOR_ALIGNMENT
20093 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20094
20095 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20096 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20097 aarch64_vectorize_preferred_vector_alignment
20098 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20099 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20100 aarch64_simd_vector_alignment_reachable
20101
20102 /* vec_perm support. */
20103
20104 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20105 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20106 aarch64_vectorize_vec_perm_const
20107
20108 #undef TARGET_VECTORIZE_GET_MASK_MODE
20109 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20110 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20111 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20112 aarch64_empty_mask_is_expensive
20113 #undef TARGET_PREFERRED_ELSE_VALUE
20114 #define TARGET_PREFERRED_ELSE_VALUE \
20115 aarch64_preferred_else_value
20116
20117 #undef TARGET_INIT_LIBFUNCS
20118 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20119
20120 #undef TARGET_FIXED_CONDITION_CODE_REGS
20121 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20122
20123 #undef TARGET_FLAGS_REGNUM
20124 #define TARGET_FLAGS_REGNUM CC_REGNUM
20125
20126 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20127 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20128
20129 #undef TARGET_ASAN_SHADOW_OFFSET
20130 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20131
20132 #undef TARGET_LEGITIMIZE_ADDRESS
20133 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20134
20135 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20136 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20137
20138 #undef TARGET_CAN_USE_DOLOOP_P
20139 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20140
20141 #undef TARGET_SCHED_ADJUST_PRIORITY
20142 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20143
20144 #undef TARGET_SCHED_MACRO_FUSION_P
20145 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20146
20147 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20148 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20149
20150 #undef TARGET_SCHED_FUSION_PRIORITY
20151 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20152
20153 #undef TARGET_UNSPEC_MAY_TRAP_P
20154 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20155
20156 #undef TARGET_USE_PSEUDO_PIC_REG
20157 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20158
20159 #undef TARGET_PRINT_OPERAND
20160 #define TARGET_PRINT_OPERAND aarch64_print_operand
20161
20162 #undef TARGET_PRINT_OPERAND_ADDRESS
20163 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20164
20165 #undef TARGET_OPTAB_SUPPORTED_P
20166 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20167
20168 #undef TARGET_OMIT_STRUCT_RETURN_REG
20169 #define TARGET_OMIT_STRUCT_RETURN_REG true
20170
20171 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20172 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20173 aarch64_dwarf_poly_indeterminate_value
20174
20175 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20176 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20177 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20178
20179 #undef TARGET_HARD_REGNO_NREGS
20180 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20181 #undef TARGET_HARD_REGNO_MODE_OK
20182 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20183
20184 #undef TARGET_MODES_TIEABLE_P
20185 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20186
20187 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20188 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20189 aarch64_hard_regno_call_part_clobbered
20190
20191 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20192 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20193 aarch64_remove_extra_call_preserved_regs
20194
20195 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20196 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20197 aarch64_return_call_with_max_clobbers
20198
20199 #undef TARGET_CONSTANT_ALIGNMENT
20200 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20201
20202 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20203 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20204 aarch64_stack_clash_protection_alloca_probe_range
20205
20206 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20207 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20208
20209 #undef TARGET_CAN_CHANGE_MODE_CLASS
20210 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20211
20212 #undef TARGET_SELECT_EARLY_REMAT_MODES
20213 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20214
20215 #undef TARGET_SPECULATION_SAFE_VALUE
20216 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20217
20218 #undef TARGET_ESTIMATED_POLY_VALUE
20219 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20220
20221 #undef TARGET_ATTRIBUTE_TABLE
20222 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20223
20224 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20225 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20226 aarch64_simd_clone_compute_vecsize_and_simdlen
20227
20228 #undef TARGET_SIMD_CLONE_ADJUST
20229 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20230
20231 #undef TARGET_SIMD_CLONE_USABLE
20232 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20233
20234 #undef TARGET_COMP_TYPE_ATTRIBUTES
20235 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20236
20237 #undef TARGET_GET_MULTILIB_ABI_NAME
20238 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20239
20240 #if CHECKING_P
20241 #undef TARGET_RUN_TARGET_SELFTESTS
20242 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20243 #endif /* #if CHECKING_P */
20244
20245 #undef TARGET_ASM_POST_CFI_STARTPROC
20246 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20247
20248 struct gcc_target targetm = TARGET_INITIALIZER;
20249
20250 #include "gt-aarch64.h"