gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78
79 /* This file should be included last. */
80 #include "target-def.h"
81
82 /* Defined for convenience. */
83 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
84
85 /* Information about a legitimate vector immediate operand. */
86 struct simd_immediate_info
87 {
88 enum insn_type { MOV, MVN, INDEX, PTRUE };
89 enum modifier_type { LSL, MSL };
90
91 simd_immediate_info () {}
92 simd_immediate_info (scalar_float_mode, rtx);
93 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
94 insn_type = MOV, modifier_type = LSL,
95 unsigned int = 0);
96 simd_immediate_info (scalar_mode, rtx, rtx);
97 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98
99 /* The mode of the elements. */
100 scalar_mode elt_mode;
101
102 /* The instruction to use to move the immediate into a vector. */
103 insn_type insn;
104
105 union
106 {
107 /* For MOV and MVN. */
108 struct
109 {
110 /* The value of each element. */
111 rtx value;
112
113 /* The kind of shift modifier to use, and the number of bits to shift.
114 This is (LSL, 0) if no shift is needed. */
115 modifier_type modifier;
116 unsigned int shift;
117 } mov;
118
119 /* For INDEX. */
120 struct
121 {
122 /* The value of the first element and the step to be added for each
123 subsequent element. */
124 rtx base, step;
125 } index;
126
127 /* For PTRUE. */
128 aarch64_svpattern pattern;
129 } u;
130 };
131
132 /* Construct a floating-point immediate in which each element has mode
133 ELT_MODE_IN and value VALUE_IN. */
134 inline simd_immediate_info
135 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
136 : elt_mode (elt_mode_in), insn (MOV)
137 {
138 u.mov.value = value_in;
139 u.mov.modifier = LSL;
140 u.mov.shift = 0;
141 }
142
143 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
144 and value VALUE_IN. The other parameters are as for the structure
145 fields. */
146 inline simd_immediate_info
147 ::simd_immediate_info (scalar_int_mode elt_mode_in,
148 unsigned HOST_WIDE_INT value_in,
149 insn_type insn_in, modifier_type modifier_in,
150 unsigned int shift_in)
151 : elt_mode (elt_mode_in), insn (insn_in)
152 {
153 u.mov.value = gen_int_mode (value_in, elt_mode_in);
154 u.mov.modifier = modifier_in;
155 u.mov.shift = shift_in;
156 }
157
158 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
159 and where element I is equal to BASE_IN + I * STEP_IN. */
160 inline simd_immediate_info
161 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
162 : elt_mode (elt_mode_in), insn (INDEX)
163 {
164 u.index.base = base_in;
165 u.index.step = step_in;
166 }
167
168 /* Construct a predicate that controls elements of mode ELT_MODE_IN
169 and has PTRUE pattern PATTERN_IN. */
170 inline simd_immediate_info
171 ::simd_immediate_info (scalar_int_mode elt_mode_in,
172 aarch64_svpattern pattern_in)
173 : elt_mode (elt_mode_in), insn (PTRUE)
174 {
175 u.pattern = pattern_in;
176 }
177
178 /* The current code model. */
179 enum aarch64_code_model aarch64_cmodel;
180
181 /* The number of 64-bit elements in an SVE vector. */
182 poly_uint16 aarch64_sve_vg;
183
184 #ifdef HAVE_AS_TLS
185 #undef TARGET_HAVE_TLS
186 #define TARGET_HAVE_TLS 1
187 #endif
188
189 static bool aarch64_composite_type_p (const_tree, machine_mode);
190 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
191 const_tree,
192 machine_mode *, int *,
193 bool *);
194 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
196 static void aarch64_override_options_after_change (void);
197 static bool aarch64_vector_mode_supported_p (machine_mode);
198 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
199 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
200 const_tree type,
201 int misalignment,
202 bool is_packed);
203 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
204 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
205 aarch64_addr_query_type);
206 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
207
208 /* Major revision number of the ARM Architecture implemented by the target. */
209 unsigned aarch64_architecture_version;
210
211 /* The processor for which instructions should be scheduled. */
212 enum aarch64_processor aarch64_tune = cortexa53;
213
214 /* Mask to specify which instruction scheduling options should be used. */
215 uint64_t aarch64_tune_flags = 0;
216
217 /* Global flag for PC relative loads. */
218 bool aarch64_pcrelative_literal_loads;
219
220 /* Global flag for whether frame pointer is enabled. */
221 bool aarch64_use_frame_pointer;
222
223 #define BRANCH_PROTECT_STR_MAX 255
224 char *accepted_branch_protection_string = NULL;
225
226 static enum aarch64_parse_opt_result
227 aarch64_parse_branch_protection (const char*, char**);
228
229 /* Support for command line parsing of boolean flags in the tuning
230 structures. */
231 struct aarch64_flag_desc
232 {
233 const char* name;
234 unsigned int flag;
235 };
236
237 #define AARCH64_FUSION_PAIR(name, internal_name) \
238 { name, AARCH64_FUSE_##internal_name },
239 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
240 {
241 { "none", AARCH64_FUSE_NOTHING },
242 #include "aarch64-fusion-pairs.def"
243 { "all", AARCH64_FUSE_ALL },
244 { NULL, AARCH64_FUSE_NOTHING }
245 };
246
247 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
248 { name, AARCH64_EXTRA_TUNE_##internal_name },
249 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
250 {
251 { "none", AARCH64_EXTRA_TUNE_NONE },
252 #include "aarch64-tuning-flags.def"
253 { "all", AARCH64_EXTRA_TUNE_ALL },
254 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 };
256
257 /* Tuning parameters. */
258
259 static const struct cpu_addrcost_table generic_addrcost_table =
260 {
261 {
262 1, /* hi */
263 0, /* si */
264 0, /* di */
265 1, /* ti */
266 },
267 0, /* pre_modify */
268 0, /* post_modify */
269 0, /* register_offset */
270 0, /* register_sextend */
271 0, /* register_zextend */
272 0 /* imm_offset */
273 };
274
275 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 {
277 {
278 0, /* hi */
279 0, /* si */
280 0, /* di */
281 2, /* ti */
282 },
283 0, /* pre_modify */
284 0, /* post_modify */
285 1, /* register_offset */
286 1, /* register_sextend */
287 2, /* register_zextend */
288 0, /* imm_offset */
289 };
290
291 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 {
293 {
294 1, /* hi */
295 0, /* si */
296 0, /* di */
297 1, /* ti */
298 },
299 1, /* pre_modify */
300 1, /* post_modify */
301 0, /* register_offset */
302 1, /* register_sextend */
303 1, /* register_zextend */
304 0, /* imm_offset */
305 };
306
307 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 {
309 {
310 1, /* hi */
311 1, /* si */
312 1, /* di */
313 2, /* ti */
314 },
315 0, /* pre_modify */
316 0, /* post_modify */
317 2, /* register_offset */
318 3, /* register_sextend */
319 3, /* register_zextend */
320 0, /* imm_offset */
321 };
322
323 static const struct cpu_addrcost_table tsv110_addrcost_table =
324 {
325 {
326 1, /* hi */
327 0, /* si */
328 0, /* di */
329 1, /* ti */
330 },
331 0, /* pre_modify */
332 0, /* post_modify */
333 0, /* register_offset */
334 1, /* register_sextend */
335 1, /* register_zextend */
336 0, /* imm_offset */
337 };
338
339 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
340 {
341 {
342 1, /* hi */
343 1, /* si */
344 1, /* di */
345 2, /* ti */
346 },
347 1, /* pre_modify */
348 1, /* post_modify */
349 3, /* register_offset */
350 3, /* register_sextend */
351 3, /* register_zextend */
352 2, /* imm_offset */
353 };
354
355 static const struct cpu_regmove_cost generic_regmove_cost =
356 {
357 1, /* GP2GP */
358 /* Avoid the use of slow int<->fp moves for spilling by setting
359 their cost higher than memmov_cost. */
360 5, /* GP2FP */
361 5, /* FP2GP */
362 2 /* FP2FP */
363 };
364
365 static const struct cpu_regmove_cost cortexa57_regmove_cost =
366 {
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 5, /* GP2FP */
371 5, /* FP2GP */
372 2 /* FP2FP */
373 };
374
375 static const struct cpu_regmove_cost cortexa53_regmove_cost =
376 {
377 1, /* GP2GP */
378 /* Avoid the use of slow int<->fp moves for spilling by setting
379 their cost higher than memmov_cost. */
380 5, /* GP2FP */
381 5, /* FP2GP */
382 2 /* FP2FP */
383 };
384
385 static const struct cpu_regmove_cost exynosm1_regmove_cost =
386 {
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost (the actual costs are 4 and 9). */
390 9, /* GP2FP */
391 9, /* FP2GP */
392 1 /* FP2FP */
393 };
394
395 static const struct cpu_regmove_cost thunderx_regmove_cost =
396 {
397 2, /* GP2GP */
398 2, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
401 };
402
403 static const struct cpu_regmove_cost xgene1_regmove_cost =
404 {
405 1, /* GP2GP */
406 /* Avoid the use of slow int<->fp moves for spilling by setting
407 their cost higher than memmov_cost. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 2 /* FP2FP */
411 };
412
413 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
414 {
415 2, /* GP2GP */
416 /* Avoid the use of int<->fp moves for spilling. */
417 6, /* GP2FP */
418 6, /* FP2GP */
419 4 /* FP2FP */
420 };
421
422 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
423 {
424 1, /* GP2GP */
425 /* Avoid the use of int<->fp moves for spilling. */
426 8, /* GP2FP */
427 8, /* FP2GP */
428 4 /* FP2FP */
429 };
430
431 static const struct cpu_regmove_cost tsv110_regmove_cost =
432 {
433 1, /* GP2GP */
434 /* Avoid the use of slow int<->fp moves for spilling by setting
435 their cost higher than memmov_cost. */
436 2, /* GP2FP */
437 3, /* FP2GP */
438 2 /* FP2FP */
439 };
440
441 /* Generic costs for vector insn classes. */
442 static const struct cpu_vector_cost generic_vector_cost =
443 {
444 1, /* scalar_int_stmt_cost */
445 1, /* scalar_fp_stmt_cost */
446 1, /* scalar_load_cost */
447 1, /* scalar_store_cost */
448 1, /* vec_int_stmt_cost */
449 1, /* vec_fp_stmt_cost */
450 2, /* vec_permute_cost */
451 1, /* vec_to_scalar_cost */
452 1, /* scalar_to_vec_cost */
453 1, /* vec_align_load_cost */
454 1, /* vec_unalign_load_cost */
455 1, /* vec_unalign_store_cost */
456 1, /* vec_store_cost */
457 3, /* cond_taken_branch_cost */
458 1 /* cond_not_taken_branch_cost */
459 };
460
461 /* QDF24XX costs for vector insn classes. */
462 static const struct cpu_vector_cost qdf24xx_vector_cost =
463 {
464 1, /* scalar_int_stmt_cost */
465 1, /* scalar_fp_stmt_cost */
466 1, /* scalar_load_cost */
467 1, /* scalar_store_cost */
468 1, /* vec_int_stmt_cost */
469 3, /* vec_fp_stmt_cost */
470 2, /* vec_permute_cost */
471 1, /* vec_to_scalar_cost */
472 1, /* scalar_to_vec_cost */
473 1, /* vec_align_load_cost */
474 1, /* vec_unalign_load_cost */
475 1, /* vec_unalign_store_cost */
476 1, /* vec_store_cost */
477 3, /* cond_taken_branch_cost */
478 1 /* cond_not_taken_branch_cost */
479 };
480
481 /* ThunderX costs for vector insn classes. */
482 static const struct cpu_vector_cost thunderx_vector_cost =
483 {
484 1, /* scalar_int_stmt_cost */
485 1, /* scalar_fp_stmt_cost */
486 3, /* scalar_load_cost */
487 1, /* scalar_store_cost */
488 4, /* vec_int_stmt_cost */
489 1, /* vec_fp_stmt_cost */
490 4, /* vec_permute_cost */
491 2, /* vec_to_scalar_cost */
492 2, /* scalar_to_vec_cost */
493 3, /* vec_align_load_cost */
494 5, /* vec_unalign_load_cost */
495 5, /* vec_unalign_store_cost */
496 1, /* vec_store_cost */
497 3, /* cond_taken_branch_cost */
498 3 /* cond_not_taken_branch_cost */
499 };
500
501 static const struct cpu_vector_cost tsv110_vector_cost =
502 {
503 1, /* scalar_int_stmt_cost */
504 1, /* scalar_fp_stmt_cost */
505 5, /* scalar_load_cost */
506 1, /* scalar_store_cost */
507 2, /* vec_int_stmt_cost */
508 2, /* vec_fp_stmt_cost */
509 2, /* vec_permute_cost */
510 3, /* vec_to_scalar_cost */
511 2, /* scalar_to_vec_cost */
512 5, /* vec_align_load_cost */
513 5, /* vec_unalign_load_cost */
514 1, /* vec_unalign_store_cost */
515 1, /* vec_store_cost */
516 1, /* cond_taken_branch_cost */
517 1 /* cond_not_taken_branch_cost */
518 };
519
520 /* Generic costs for vector insn classes. */
521 static const struct cpu_vector_cost cortexa57_vector_cost =
522 {
523 1, /* scalar_int_stmt_cost */
524 1, /* scalar_fp_stmt_cost */
525 4, /* scalar_load_cost */
526 1, /* scalar_store_cost */
527 2, /* vec_int_stmt_cost */
528 2, /* vec_fp_stmt_cost */
529 3, /* vec_permute_cost */
530 8, /* vec_to_scalar_cost */
531 8, /* scalar_to_vec_cost */
532 4, /* vec_align_load_cost */
533 4, /* vec_unalign_load_cost */
534 1, /* vec_unalign_store_cost */
535 1, /* vec_store_cost */
536 1, /* cond_taken_branch_cost */
537 1 /* cond_not_taken_branch_cost */
538 };
539
540 static const struct cpu_vector_cost exynosm1_vector_cost =
541 {
542 1, /* scalar_int_stmt_cost */
543 1, /* scalar_fp_stmt_cost */
544 5, /* scalar_load_cost */
545 1, /* scalar_store_cost */
546 3, /* vec_int_stmt_cost */
547 3, /* vec_fp_stmt_cost */
548 3, /* vec_permute_cost */
549 3, /* vec_to_scalar_cost */
550 3, /* scalar_to_vec_cost */
551 5, /* vec_align_load_cost */
552 5, /* vec_unalign_load_cost */
553 1, /* vec_unalign_store_cost */
554 1, /* vec_store_cost */
555 1, /* cond_taken_branch_cost */
556 1 /* cond_not_taken_branch_cost */
557 };
558
559 /* Generic costs for vector insn classes. */
560 static const struct cpu_vector_cost xgene1_vector_cost =
561 {
562 1, /* scalar_int_stmt_cost */
563 1, /* scalar_fp_stmt_cost */
564 5, /* scalar_load_cost */
565 1, /* scalar_store_cost */
566 2, /* vec_int_stmt_cost */
567 2, /* vec_fp_stmt_cost */
568 2, /* vec_permute_cost */
569 4, /* vec_to_scalar_cost */
570 4, /* scalar_to_vec_cost */
571 10, /* vec_align_load_cost */
572 10, /* vec_unalign_load_cost */
573 2, /* vec_unalign_store_cost */
574 2, /* vec_store_cost */
575 2, /* cond_taken_branch_cost */
576 1 /* cond_not_taken_branch_cost */
577 };
578
579 /* Costs for vector insn classes for Vulcan. */
580 static const struct cpu_vector_cost thunderx2t99_vector_cost =
581 {
582 1, /* scalar_int_stmt_cost */
583 6, /* scalar_fp_stmt_cost */
584 4, /* scalar_load_cost */
585 1, /* scalar_store_cost */
586 5, /* vec_int_stmt_cost */
587 6, /* vec_fp_stmt_cost */
588 3, /* vec_permute_cost */
589 6, /* vec_to_scalar_cost */
590 5, /* scalar_to_vec_cost */
591 8, /* vec_align_load_cost */
592 8, /* vec_unalign_load_cost */
593 4, /* vec_unalign_store_cost */
594 4, /* vec_store_cost */
595 2, /* cond_taken_branch_cost */
596 1 /* cond_not_taken_branch_cost */
597 };
598
599 /* Generic costs for branch instructions. */
600 static const struct cpu_branch_cost generic_branch_cost =
601 {
602 1, /* Predictable. */
603 3 /* Unpredictable. */
604 };
605
606 /* Generic approximation modes. */
607 static const cpu_approx_modes generic_approx_modes =
608 {
609 AARCH64_APPROX_NONE, /* division */
610 AARCH64_APPROX_NONE, /* sqrt */
611 AARCH64_APPROX_NONE /* recip_sqrt */
612 };
613
614 /* Approximation modes for Exynos M1. */
615 static const cpu_approx_modes exynosm1_approx_modes =
616 {
617 AARCH64_APPROX_NONE, /* division */
618 AARCH64_APPROX_ALL, /* sqrt */
619 AARCH64_APPROX_ALL /* recip_sqrt */
620 };
621
622 /* Approximation modes for X-Gene 1. */
623 static const cpu_approx_modes xgene1_approx_modes =
624 {
625 AARCH64_APPROX_NONE, /* division */
626 AARCH64_APPROX_NONE, /* sqrt */
627 AARCH64_APPROX_ALL /* recip_sqrt */
628 };
629
630 /* Generic prefetch settings (which disable prefetch). */
631 static const cpu_prefetch_tune generic_prefetch_tune =
632 {
633 0, /* num_slots */
634 -1, /* l1_cache_size */
635 -1, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
640 };
641
642 static const cpu_prefetch_tune exynosm1_prefetch_tune =
643 {
644 0, /* num_slots */
645 -1, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 -1, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
651 };
652
653 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
654 {
655 4, /* num_slots */
656 32, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 false, /* prefetch_dynamic_strides */
660 2048, /* minimum_stride */
661 3 /* default_opt_level */
662 };
663
664 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
665 {
666 8, /* num_slots */
667 32, /* l1_cache_size */
668 128, /* l1_cache_line_size */
669 16*1024, /* l2_cache_size */
670 true, /* prefetch_dynamic_strides */
671 -1, /* minimum_stride */
672 3 /* default_opt_level */
673 };
674
675 static const cpu_prefetch_tune thunderx_prefetch_tune =
676 {
677 8, /* num_slots */
678 32, /* l1_cache_size */
679 128, /* l1_cache_line_size */
680 -1, /* l2_cache_size */
681 true, /* prefetch_dynamic_strides */
682 -1, /* minimum_stride */
683 -1 /* default_opt_level */
684 };
685
686 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
687 {
688 8, /* num_slots */
689 32, /* l1_cache_size */
690 64, /* l1_cache_line_size */
691 256, /* l2_cache_size */
692 true, /* prefetch_dynamic_strides */
693 -1, /* minimum_stride */
694 -1 /* default_opt_level */
695 };
696
697 static const cpu_prefetch_tune tsv110_prefetch_tune =
698 {
699 0, /* num_slots */
700 64, /* l1_cache_size */
701 64, /* l1_cache_line_size */
702 512, /* l2_cache_size */
703 true, /* prefetch_dynamic_strides */
704 -1, /* minimum_stride */
705 -1 /* default_opt_level */
706 };
707
708 static const cpu_prefetch_tune xgene1_prefetch_tune =
709 {
710 8, /* num_slots */
711 32, /* l1_cache_size */
712 64, /* l1_cache_line_size */
713 256, /* l2_cache_size */
714 true, /* prefetch_dynamic_strides */
715 -1, /* minimum_stride */
716 -1 /* default_opt_level */
717 };
718
719 static const struct tune_params generic_tunings =
720 {
721 &cortexa57_extra_costs,
722 &generic_addrcost_table,
723 &generic_regmove_cost,
724 &generic_vector_cost,
725 &generic_branch_cost,
726 &generic_approx_modes,
727 SVE_NOT_IMPLEMENTED, /* sve_width */
728 4, /* memmov_cost */
729 2, /* issue_rate */
730 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
731 "16:12", /* function_align. */
732 "4", /* jump_align. */
733 "8", /* loop_align. */
734 2, /* int_reassoc_width. */
735 4, /* fp_reassoc_width. */
736 1, /* vec_reassoc_width. */
737 2, /* min_div_recip_mul_sf. */
738 2, /* min_div_recip_mul_df. */
739 0, /* max_case_values. */
740 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
741 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
742 &generic_prefetch_tune
743 };
744
745 static const struct tune_params cortexa35_tunings =
746 {
747 &cortexa53_extra_costs,
748 &generic_addrcost_table,
749 &cortexa53_regmove_cost,
750 &generic_vector_cost,
751 &generic_branch_cost,
752 &generic_approx_modes,
753 SVE_NOT_IMPLEMENTED, /* sve_width */
754 4, /* memmov_cost */
755 1, /* issue_rate */
756 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
757 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
758 "16", /* function_align. */
759 "4", /* jump_align. */
760 "8", /* loop_align. */
761 2, /* int_reassoc_width. */
762 4, /* fp_reassoc_width. */
763 1, /* vec_reassoc_width. */
764 2, /* min_div_recip_mul_sf. */
765 2, /* min_div_recip_mul_df. */
766 0, /* max_case_values. */
767 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
768 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
769 &generic_prefetch_tune
770 };
771
772 static const struct tune_params cortexa53_tunings =
773 {
774 &cortexa53_extra_costs,
775 &generic_addrcost_table,
776 &cortexa53_regmove_cost,
777 &generic_vector_cost,
778 &generic_branch_cost,
779 &generic_approx_modes,
780 SVE_NOT_IMPLEMENTED, /* sve_width */
781 4, /* memmov_cost */
782 2, /* issue_rate */
783 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
784 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
785 "16", /* function_align. */
786 "4", /* jump_align. */
787 "8", /* loop_align. */
788 2, /* int_reassoc_width. */
789 4, /* fp_reassoc_width. */
790 1, /* vec_reassoc_width. */
791 2, /* min_div_recip_mul_sf. */
792 2, /* min_div_recip_mul_df. */
793 0, /* max_case_values. */
794 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
795 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
796 &generic_prefetch_tune
797 };
798
799 static const struct tune_params cortexa57_tunings =
800 {
801 &cortexa57_extra_costs,
802 &generic_addrcost_table,
803 &cortexa57_regmove_cost,
804 &cortexa57_vector_cost,
805 &generic_branch_cost,
806 &generic_approx_modes,
807 SVE_NOT_IMPLEMENTED, /* sve_width */
808 4, /* memmov_cost */
809 3, /* issue_rate */
810 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
811 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
812 "16", /* function_align. */
813 "4", /* jump_align. */
814 "8", /* loop_align. */
815 2, /* int_reassoc_width. */
816 4, /* fp_reassoc_width. */
817 1, /* vec_reassoc_width. */
818 2, /* min_div_recip_mul_sf. */
819 2, /* min_div_recip_mul_df. */
820 0, /* max_case_values. */
821 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
822 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
823 &generic_prefetch_tune
824 };
825
826 static const struct tune_params cortexa72_tunings =
827 {
828 &cortexa57_extra_costs,
829 &generic_addrcost_table,
830 &cortexa57_regmove_cost,
831 &cortexa57_vector_cost,
832 &generic_branch_cost,
833 &generic_approx_modes,
834 SVE_NOT_IMPLEMENTED, /* sve_width */
835 4, /* memmov_cost */
836 3, /* issue_rate */
837 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
838 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
839 "16", /* function_align. */
840 "4", /* jump_align. */
841 "8", /* loop_align. */
842 2, /* int_reassoc_width. */
843 4, /* fp_reassoc_width. */
844 1, /* vec_reassoc_width. */
845 2, /* min_div_recip_mul_sf. */
846 2, /* min_div_recip_mul_df. */
847 0, /* max_case_values. */
848 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
849 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
850 &generic_prefetch_tune
851 };
852
853 static const struct tune_params cortexa73_tunings =
854 {
855 &cortexa57_extra_costs,
856 &generic_addrcost_table,
857 &cortexa57_regmove_cost,
858 &cortexa57_vector_cost,
859 &generic_branch_cost,
860 &generic_approx_modes,
861 SVE_NOT_IMPLEMENTED, /* sve_width */
862 4, /* memmov_cost. */
863 2, /* issue_rate. */
864 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
865 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
866 "16", /* function_align. */
867 "4", /* jump_align. */
868 "8", /* loop_align. */
869 2, /* int_reassoc_width. */
870 4, /* fp_reassoc_width. */
871 1, /* vec_reassoc_width. */
872 2, /* min_div_recip_mul_sf. */
873 2, /* min_div_recip_mul_df. */
874 0, /* max_case_values. */
875 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
876 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
877 &generic_prefetch_tune
878 };
879
880
881
882 static const struct tune_params exynosm1_tunings =
883 {
884 &exynosm1_extra_costs,
885 &exynosm1_addrcost_table,
886 &exynosm1_regmove_cost,
887 &exynosm1_vector_cost,
888 &generic_branch_cost,
889 &exynosm1_approx_modes,
890 SVE_NOT_IMPLEMENTED, /* sve_width */
891 4, /* memmov_cost */
892 3, /* issue_rate */
893 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
894 "4", /* function_align. */
895 "4", /* jump_align. */
896 "4", /* loop_align. */
897 2, /* int_reassoc_width. */
898 4, /* fp_reassoc_width. */
899 1, /* vec_reassoc_width. */
900 2, /* min_div_recip_mul_sf. */
901 2, /* min_div_recip_mul_df. */
902 48, /* max_case_values. */
903 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
904 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
905 &exynosm1_prefetch_tune
906 };
907
908 static const struct tune_params thunderxt88_tunings =
909 {
910 &thunderx_extra_costs,
911 &generic_addrcost_table,
912 &thunderx_regmove_cost,
913 &thunderx_vector_cost,
914 &generic_branch_cost,
915 &generic_approx_modes,
916 SVE_NOT_IMPLEMENTED, /* sve_width */
917 6, /* memmov_cost */
918 2, /* issue_rate */
919 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
920 "8", /* function_align. */
921 "8", /* jump_align. */
922 "8", /* loop_align. */
923 2, /* int_reassoc_width. */
924 4, /* fp_reassoc_width. */
925 1, /* vec_reassoc_width. */
926 2, /* min_div_recip_mul_sf. */
927 2, /* min_div_recip_mul_df. */
928 0, /* max_case_values. */
929 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
930 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
931 &thunderxt88_prefetch_tune
932 };
933
934 static const struct tune_params thunderx_tunings =
935 {
936 &thunderx_extra_costs,
937 &generic_addrcost_table,
938 &thunderx_regmove_cost,
939 &thunderx_vector_cost,
940 &generic_branch_cost,
941 &generic_approx_modes,
942 SVE_NOT_IMPLEMENTED, /* sve_width */
943 6, /* memmov_cost */
944 2, /* issue_rate */
945 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
946 "8", /* function_align. */
947 "8", /* jump_align. */
948 "8", /* loop_align. */
949 2, /* int_reassoc_width. */
950 4, /* fp_reassoc_width. */
951 1, /* vec_reassoc_width. */
952 2, /* min_div_recip_mul_sf. */
953 2, /* min_div_recip_mul_df. */
954 0, /* max_case_values. */
955 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
956 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
957 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
958 &thunderx_prefetch_tune
959 };
960
961 static const struct tune_params tsv110_tunings =
962 {
963 &tsv110_extra_costs,
964 &tsv110_addrcost_table,
965 &tsv110_regmove_cost,
966 &tsv110_vector_cost,
967 &generic_branch_cost,
968 &generic_approx_modes,
969 SVE_NOT_IMPLEMENTED, /* sve_width */
970 4, /* memmov_cost */
971 4, /* issue_rate */
972 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
973 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
974 "16", /* function_align. */
975 "4", /* jump_align. */
976 "8", /* loop_align. */
977 2, /* int_reassoc_width. */
978 4, /* fp_reassoc_width. */
979 1, /* vec_reassoc_width. */
980 2, /* min_div_recip_mul_sf. */
981 2, /* min_div_recip_mul_df. */
982 0, /* max_case_values. */
983 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
984 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
985 &tsv110_prefetch_tune
986 };
987
988 static const struct tune_params xgene1_tunings =
989 {
990 &xgene1_extra_costs,
991 &xgene1_addrcost_table,
992 &xgene1_regmove_cost,
993 &xgene1_vector_cost,
994 &generic_branch_cost,
995 &xgene1_approx_modes,
996 SVE_NOT_IMPLEMENTED, /* sve_width */
997 6, /* memmov_cost */
998 4, /* issue_rate */
999 AARCH64_FUSE_NOTHING, /* fusible_ops */
1000 "16", /* function_align. */
1001 "16", /* jump_align. */
1002 "16", /* loop_align. */
1003 2, /* int_reassoc_width. */
1004 4, /* fp_reassoc_width. */
1005 1, /* vec_reassoc_width. */
1006 2, /* min_div_recip_mul_sf. */
1007 2, /* min_div_recip_mul_df. */
1008 17, /* max_case_values. */
1009 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1010 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1011 &xgene1_prefetch_tune
1012 };
1013
1014 static const struct tune_params emag_tunings =
1015 {
1016 &xgene1_extra_costs,
1017 &xgene1_addrcost_table,
1018 &xgene1_regmove_cost,
1019 &xgene1_vector_cost,
1020 &generic_branch_cost,
1021 &xgene1_approx_modes,
1022 SVE_NOT_IMPLEMENTED,
1023 6, /* memmov_cost */
1024 4, /* issue_rate */
1025 AARCH64_FUSE_NOTHING, /* fusible_ops */
1026 "16", /* function_align. */
1027 "16", /* jump_align. */
1028 "16", /* loop_align. */
1029 2, /* int_reassoc_width. */
1030 4, /* fp_reassoc_width. */
1031 1, /* vec_reassoc_width. */
1032 2, /* min_div_recip_mul_sf. */
1033 2, /* min_div_recip_mul_df. */
1034 17, /* max_case_values. */
1035 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1036 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1037 &xgene1_prefetch_tune
1038 };
1039
1040 static const struct tune_params qdf24xx_tunings =
1041 {
1042 &qdf24xx_extra_costs,
1043 &qdf24xx_addrcost_table,
1044 &qdf24xx_regmove_cost,
1045 &qdf24xx_vector_cost,
1046 &generic_branch_cost,
1047 &generic_approx_modes,
1048 SVE_NOT_IMPLEMENTED, /* sve_width */
1049 4, /* memmov_cost */
1050 4, /* issue_rate */
1051 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1052 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1053 "16", /* function_align. */
1054 "8", /* jump_align. */
1055 "16", /* loop_align. */
1056 2, /* int_reassoc_width. */
1057 4, /* fp_reassoc_width. */
1058 1, /* vec_reassoc_width. */
1059 2, /* min_div_recip_mul_sf. */
1060 2, /* min_div_recip_mul_df. */
1061 0, /* max_case_values. */
1062 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1063 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1064 &qdf24xx_prefetch_tune
1065 };
1066
1067 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1068 for now. */
1069 static const struct tune_params saphira_tunings =
1070 {
1071 &generic_extra_costs,
1072 &generic_addrcost_table,
1073 &generic_regmove_cost,
1074 &generic_vector_cost,
1075 &generic_branch_cost,
1076 &generic_approx_modes,
1077 SVE_NOT_IMPLEMENTED, /* sve_width */
1078 4, /* memmov_cost */
1079 4, /* issue_rate */
1080 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1081 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1082 "16", /* function_align. */
1083 "8", /* jump_align. */
1084 "16", /* loop_align. */
1085 2, /* int_reassoc_width. */
1086 4, /* fp_reassoc_width. */
1087 1, /* vec_reassoc_width. */
1088 2, /* min_div_recip_mul_sf. */
1089 2, /* min_div_recip_mul_df. */
1090 0, /* max_case_values. */
1091 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1092 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1093 &generic_prefetch_tune
1094 };
1095
1096 static const struct tune_params thunderx2t99_tunings =
1097 {
1098 &thunderx2t99_extra_costs,
1099 &thunderx2t99_addrcost_table,
1100 &thunderx2t99_regmove_cost,
1101 &thunderx2t99_vector_cost,
1102 &generic_branch_cost,
1103 &generic_approx_modes,
1104 SVE_NOT_IMPLEMENTED, /* sve_width */
1105 4, /* memmov_cost. */
1106 4, /* issue_rate. */
1107 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1108 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1109 "16", /* function_align. */
1110 "8", /* jump_align. */
1111 "16", /* loop_align. */
1112 3, /* int_reassoc_width. */
1113 2, /* fp_reassoc_width. */
1114 2, /* vec_reassoc_width. */
1115 2, /* min_div_recip_mul_sf. */
1116 2, /* min_div_recip_mul_df. */
1117 0, /* max_case_values. */
1118 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1119 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1120 &thunderx2t99_prefetch_tune
1121 };
1122
1123 static const struct tune_params neoversen1_tunings =
1124 {
1125 &cortexa57_extra_costs,
1126 &generic_addrcost_table,
1127 &generic_regmove_cost,
1128 &cortexa57_vector_cost,
1129 &generic_branch_cost,
1130 &generic_approx_modes,
1131 SVE_NOT_IMPLEMENTED, /* sve_width */
1132 4, /* memmov_cost */
1133 3, /* issue_rate */
1134 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1135 "32:16", /* function_align. */
1136 "32:16", /* jump_align. */
1137 "32:16", /* loop_align. */
1138 2, /* int_reassoc_width. */
1139 4, /* fp_reassoc_width. */
1140 2, /* vec_reassoc_width. */
1141 2, /* min_div_recip_mul_sf. */
1142 2, /* min_div_recip_mul_df. */
1143 0, /* max_case_values. */
1144 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1145 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1146 &generic_prefetch_tune
1147 };
1148
1149 /* Support for fine-grained override of the tuning structures. */
1150 struct aarch64_tuning_override_function
1151 {
1152 const char* name;
1153 void (*parse_override)(const char*, struct tune_params*);
1154 };
1155
1156 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1157 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1158 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1159
1160 static const struct aarch64_tuning_override_function
1161 aarch64_tuning_override_functions[] =
1162 {
1163 { "fuse", aarch64_parse_fuse_string },
1164 { "tune", aarch64_parse_tune_string },
1165 { "sve_width", aarch64_parse_sve_width_string },
1166 { NULL, NULL }
1167 };
1168
1169 /* A processor implementing AArch64. */
1170 struct processor
1171 {
1172 const char *const name;
1173 enum aarch64_processor ident;
1174 enum aarch64_processor sched_core;
1175 enum aarch64_arch arch;
1176 unsigned architecture_version;
1177 const uint64_t flags;
1178 const struct tune_params *const tune;
1179 };
1180
1181 /* Architectures implementing AArch64. */
1182 static const struct processor all_architectures[] =
1183 {
1184 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1185 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1186 #include "aarch64-arches.def"
1187 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1188 };
1189
1190 /* Processor cores implementing AArch64. */
1191 static const struct processor all_cores[] =
1192 {
1193 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1194 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1195 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1196 FLAGS, &COSTS##_tunings},
1197 #include "aarch64-cores.def"
1198 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1199 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1200 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1201 };
1202
1203
1204 /* Target specification. These are populated by the -march, -mtune, -mcpu
1205 handling code or by target attributes. */
1206 static const struct processor *selected_arch;
1207 static const struct processor *selected_cpu;
1208 static const struct processor *selected_tune;
1209
1210 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1211
1212 /* The current tuning set. */
1213 struct tune_params aarch64_tune_params = generic_tunings;
1214
1215 /* Table of machine attributes. */
1216 static const struct attribute_spec aarch64_attribute_table[] =
1217 {
1218 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1219 affects_type_identity, handler, exclude } */
1220 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1221 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1222 };
1223
1224 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1225
1226 /* An ISA extension in the co-processor and main instruction set space. */
1227 struct aarch64_option_extension
1228 {
1229 const char *const name;
1230 const unsigned long flags_on;
1231 const unsigned long flags_off;
1232 };
1233
1234 typedef enum aarch64_cond_code
1235 {
1236 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1237 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1238 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1239 }
1240 aarch64_cc;
1241
1242 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
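/* This relies on the layout of aarch64_cond_code above: each condition sits
   next to its inverse (EQ/NE, CS/CC, ..., GT/LE), so flipping the low bit
   inverts the condition.  AL and NV are not genuine conditions and are not
   expected to be passed to this macro.  */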
1243
1244 struct aarch64_branch_protect_type
1245 {
1246 /* The type's name that the user passes to the branch-protection option
1247 string. */
1248 const char* name;
1249 /* Function to handle the protection type and set global variables.
1250 First argument is the string token corresponding to this type and the
1251 second argument is the next token in the option string.
1252 Return values:
1253 * AARCH64_PARSE_OK: Handling was successful.
1254 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1255 caller should print an error.
1256 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
1257 prints its own error. */
1258 enum aarch64_parse_opt_result (*handler)(char*, char*);
1259 /* A list of types that can follow this type in the option string. */
1260 const aarch64_branch_protect_type* subtypes;
1261 unsigned int num_subtypes;
1262 };
1263
1264 static enum aarch64_parse_opt_result
1265 aarch64_handle_no_branch_protection (char* str, char* rest)
1266 {
1267 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1268 aarch64_enable_bti = 0;
1269 if (rest)
1270 {
1271 error ("unexpected %<%s%> after %<%s%>", rest, str);
1272 return AARCH64_PARSE_INVALID_FEATURE;
1273 }
1274 return AARCH64_PARSE_OK;
1275 }
1276
1277 static enum aarch64_parse_opt_result
1278 aarch64_handle_standard_branch_protection (char* str, char* rest)
1279 {
1280 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1281 aarch64_ra_sign_key = AARCH64_KEY_A;
1282 aarch64_enable_bti = 1;
1283 if (rest)
1284 {
1285 error ("unexpected %<%s%> after %<%s%>", rest, str);
1286 return AARCH64_PARSE_INVALID_FEATURE;
1287 }
1288 return AARCH64_PARSE_OK;
1289 }
1290
1291 static enum aarch64_parse_opt_result
1292 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1293 char* rest ATTRIBUTE_UNUSED)
1294 {
1295 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1296 aarch64_ra_sign_key = AARCH64_KEY_A;
1297 return AARCH64_PARSE_OK;
1298 }
1299
1300 static enum aarch64_parse_opt_result
1301 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1302 char* rest ATTRIBUTE_UNUSED)
1303 {
1304 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1305 return AARCH64_PARSE_OK;
1306 }
1307
1308 static enum aarch64_parse_opt_result
1309 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1310 char* rest ATTRIBUTE_UNUSED)
1311 {
1312 aarch64_ra_sign_key = AARCH64_KEY_B;
1313 return AARCH64_PARSE_OK;
1314 }
1315
1316 static enum aarch64_parse_opt_result
1317 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1318 char* rest ATTRIBUTE_UNUSED)
1319 {
1320 aarch64_enable_bti = 1;
1321 return AARCH64_PARSE_OK;
1322 }
1323
1324 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1325 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1326 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1327 { NULL, NULL, NULL, 0 }
1328 };
1329
1330 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1331 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1332 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1333 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1334 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1335 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1336 { NULL, NULL, NULL, 0 }
1337 };
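/* For example, -mbranch-protection=pac-ret+leaf is handled by looking up
   "pac-ret" in the table above (enabling return-address signing of non-leaf
   functions with the A key) and then "leaf" in its subtype table, which
   widens the signing scope to all functions.  */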
1338
1339 /* The condition codes of the processor, and the inverse function. */
1340 static const char * const aarch64_condition_codes[] =
1341 {
1342 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1343 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1344 };
1345
1346 /* The preferred condition codes for SVE conditions. */
1347 static const char *const aarch64_sve_condition_codes[] =
1348 {
1349 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1350 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1351 };
1352
1353 /* Return the assembly token for svpattern value VALUE. */
1354
1355 static const char *
1356 svpattern_token (enum aarch64_svpattern pattern)
1357 {
1358 switch (pattern)
1359 {
1360 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1361 AARCH64_FOR_SVPATTERN (CASE)
1362 #undef CASE
1363 case AARCH64_NUM_SVPATTERNS:
1364 break;
1365 }
1366 gcc_unreachable ();
1367 }
1368
1369 /* Return the descriptor of the SIMD ABI. */
1370
1371 static const predefined_function_abi &
1372 aarch64_simd_abi (void)
1373 {
1374 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1375 if (!simd_abi.initialized_p ())
1376 {
1377 HARD_REG_SET full_reg_clobbers
1378 = default_function_abi.full_reg_clobbers ();
1379 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1380 if (FP_SIMD_SAVED_REGNUM_P (regno))
1381 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1382 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1383 }
1384 return simd_abi;
1385 }
1386
1387 /* Generate code to enable conditional branches in functions over 1 MiB. */
1388 const char *
1389 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1390 const char * branch_format)
1391 {
1392 rtx_code_label * tmp_label = gen_label_rtx ();
1393 char label_buf[256];
1394 char buffer[128];
1395 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1396 CODE_LABEL_NUMBER (tmp_label));
1397 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1398 rtx dest_label = operands[pos_label];
1399 operands[pos_label] = tmp_label;
1400
1401 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1402 output_asm_insn (buffer, operands);
1403
1404 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1405 operands[pos_label] = dest_label;
1406 output_asm_insn (buffer, operands);
1407 return "";
1408 }
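/* The emitted sequence is a short branch around an unconditional branch to
   the distant target:

        <branch_format> .Ltmp    ; condition already inverted by the caller
        b       <original target>
     .Ltmp:

   Callers in aarch64.md are expected to pass BRANCH_FORMAT with the sense of
   the condition inverted (typically via the %M operand modifier).  */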
1409
1410 void
1411 aarch64_err_no_fpadvsimd (machine_mode mode)
1412 {
1413 if (TARGET_GENERAL_REGS_ONLY)
1414 if (FLOAT_MODE_P (mode))
1415 error ("%qs is incompatible with the use of floating-point types",
1416 "-mgeneral-regs-only");
1417 else
1418 error ("%qs is incompatible with the use of vector types",
1419 "-mgeneral-regs-only");
1420 else
1421 if (FLOAT_MODE_P (mode))
1422 error ("%qs feature modifier is incompatible with the use of"
1423 " floating-point types", "+nofp");
1424 else
1425 error ("%qs feature modifier is incompatible with the use of"
1426 " vector types", "+nofp");
1427 }
1428
1429 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1430 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1431 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1432 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1433 and GENERAL_REGS is lower than the memory cost (in this case the best class
1434 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1435 cost results in bad allocations with many redundant int<->FP moves which
1436 are expensive on various cores.
1437 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1438 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1439 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1440 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1441 The result of this is that it is no longer inefficient to have a higher
1442 memory move cost than the register move cost.
1443 */
1444
1445 static reg_class_t
1446 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1447 reg_class_t best_class)
1448 {
1449 machine_mode mode;
1450
1451 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1452 || !reg_class_subset_p (FP_REGS, allocno_class))
1453 return allocno_class;
1454
1455 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1456 || !reg_class_subset_p (FP_REGS, best_class))
1457 return best_class;
1458
1459 mode = PSEUDO_REGNO_MODE (regno);
1460 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1461 }
1462
1463 static unsigned int
1464 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1465 {
1466 if (GET_MODE_UNIT_SIZE (mode) == 4)
1467 return aarch64_tune_params.min_div_recip_mul_sf;
1468 return aarch64_tune_params.min_div_recip_mul_df;
1469 }
1470
1471 /* Return the reassociation width of treeop OPC with mode MODE. */
1472 static int
1473 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1474 {
1475 if (VECTOR_MODE_P (mode))
1476 return aarch64_tune_params.vec_reassoc_width;
1477 if (INTEGRAL_MODE_P (mode))
1478 return aarch64_tune_params.int_reassoc_width;
1479 /* Avoid reassociating floating point addition so we emit more FMAs. */
1480 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1481 return aarch64_tune_params.fp_reassoc_width;
1482 return 1;
1483 }
1484
1485 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1486 unsigned
1487 aarch64_dbx_register_number (unsigned regno)
1488 {
1489 if (GP_REGNUM_P (regno))
1490 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1491 else if (regno == SP_REGNUM)
1492 return AARCH64_DWARF_SP;
1493 else if (FP_REGNUM_P (regno))
1494 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1495 else if (PR_REGNUM_P (regno))
1496 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1497 else if (regno == VG_REGNUM)
1498 return AARCH64_DWARF_VG;
1499
1500 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1501 equivalent DWARF register. */
1502 return DWARF_FRAME_REGISTERS;
1503 }
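/* This follows the AArch64 DWARF register numbering: x0-x30 map to 0-30,
   sp to 31, vg to 46, p0-p15 to 48-63 and v0-v31 to 64-95.  */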
1504
1505 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1506 integer, otherwise return X unmodified. */
1507 static rtx
1508 aarch64_bit_representation (rtx x)
1509 {
1510 if (CONST_DOUBLE_P (x))
1511 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1512 return x;
1513 }
1514
1515 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1516 static bool
1517 aarch64_advsimd_struct_mode_p (machine_mode mode)
1518 {
1519 return (TARGET_SIMD
1520 && (mode == OImode || mode == CImode || mode == XImode));
1521 }
1522
1523 /* Return true if MODE is an SVE predicate mode. */
1524 static bool
1525 aarch64_sve_pred_mode_p (machine_mode mode)
1526 {
1527 return (TARGET_SVE
1528 && (mode == VNx16BImode
1529 || mode == VNx8BImode
1530 || mode == VNx4BImode
1531 || mode == VNx2BImode));
1532 }
1533
1534 /* Three mutually-exclusive flags describing a vector or predicate type. */
1535 const unsigned int VEC_ADVSIMD = 1;
1536 const unsigned int VEC_SVE_DATA = 2;
1537 const unsigned int VEC_SVE_PRED = 4;
1538 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1539 a structure of 2, 3 or 4 vectors. */
1540 const unsigned int VEC_STRUCT = 8;
1541 /* Useful combinations of the above. */
1542 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1543 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1544
1545 /* Return a set of flags describing the vector properties of mode MODE.
1546 Ignore modes that are not supported by the current target. */
1547 static unsigned int
1548 aarch64_classify_vector_mode (machine_mode mode)
1549 {
1550 if (aarch64_advsimd_struct_mode_p (mode))
1551 return VEC_ADVSIMD | VEC_STRUCT;
1552
1553 if (aarch64_sve_pred_mode_p (mode))
1554 return VEC_SVE_PRED;
1555
1556 /* Make the decision based on the mode's enum value rather than its
1557 properties, so that we keep the correct classification regardless
1558 of -msve-vector-bits. */
1559 switch (mode)
1560 {
1561 /* Single SVE vectors. */
1562 case E_VNx16QImode:
1563 case E_VNx8HImode:
1564 case E_VNx4SImode:
1565 case E_VNx2DImode:
1566 case E_VNx8HFmode:
1567 case E_VNx4SFmode:
1568 case E_VNx2DFmode:
1569 return TARGET_SVE ? VEC_SVE_DATA : 0;
1570
1571 /* x2 SVE vectors. */
1572 case E_VNx32QImode:
1573 case E_VNx16HImode:
1574 case E_VNx8SImode:
1575 case E_VNx4DImode:
1576 case E_VNx16HFmode:
1577 case E_VNx8SFmode:
1578 case E_VNx4DFmode:
1579 /* x3 SVE vectors. */
1580 case E_VNx48QImode:
1581 case E_VNx24HImode:
1582 case E_VNx12SImode:
1583 case E_VNx6DImode:
1584 case E_VNx24HFmode:
1585 case E_VNx12SFmode:
1586 case E_VNx6DFmode:
1587 /* x4 SVE vectors. */
1588 case E_VNx64QImode:
1589 case E_VNx32HImode:
1590 case E_VNx16SImode:
1591 case E_VNx8DImode:
1592 case E_VNx32HFmode:
1593 case E_VNx16SFmode:
1594 case E_VNx8DFmode:
1595 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1596
1597 /* 64-bit Advanced SIMD vectors. */
1598 case E_V8QImode:
1599 case E_V4HImode:
1600 case E_V2SImode:
1601 /* ...E_V1DImode doesn't exist. */
1602 case E_V4HFmode:
1603 case E_V2SFmode:
1604 case E_V1DFmode:
1605 /* 128-bit Advanced SIMD vectors. */
1606 case E_V16QImode:
1607 case E_V8HImode:
1608 case E_V4SImode:
1609 case E_V2DImode:
1610 case E_V8HFmode:
1611 case E_V4SFmode:
1612 case E_V2DFmode:
1613 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1614
1615 default:
1616 return 0;
1617 }
1618 }
1619
1620 /* Return true if MODE is any of the data vector modes, including
1621 structure modes. */
1622 static bool
1623 aarch64_vector_data_mode_p (machine_mode mode)
1624 {
1625 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1626 }
1627
1628 /* Return true if MODE is any form of SVE mode, including predicates,
1629 vectors and structures. */
1630 bool
1631 aarch64_sve_mode_p (machine_mode mode)
1632 {
1633 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1634 }
1635
1636 /* Return true if MODE is an SVE data vector mode; either a single vector
1637 or a structure of vectors. */
1638 static bool
1639 aarch64_sve_data_mode_p (machine_mode mode)
1640 {
1641 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1642 }
1643
1644 /* Implement target hook TARGET_ARRAY_MODE. */
1645 static opt_machine_mode
1646 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1647 {
1648 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1649 && IN_RANGE (nelems, 2, 4))
1650 return mode_for_vector (GET_MODE_INNER (mode),
1651 GET_MODE_NUNITS (mode) * nelems);
1652
1653 return opt_machine_mode ();
1654 }
1655
1656 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1657 static bool
1658 aarch64_array_mode_supported_p (machine_mode mode,
1659 unsigned HOST_WIDE_INT nelems)
1660 {
1661 if (TARGET_SIMD
1662 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1663 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1664 && (nelems >= 2 && nelems <= 4))
1665 return true;
1666
1667 return false;
1668 }
1669
1670 /* Return the SVE predicate mode to use for elements that have
1671 ELEM_NBYTES bytes, if such a mode exists. */
1672
1673 opt_machine_mode
1674 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1675 {
1676 if (TARGET_SVE)
1677 {
1678 if (elem_nbytes == 1)
1679 return VNx16BImode;
1680 if (elem_nbytes == 2)
1681 return VNx8BImode;
1682 if (elem_nbytes == 4)
1683 return VNx4BImode;
1684 if (elem_nbytes == 8)
1685 return VNx2BImode;
1686 }
1687 return opt_machine_mode ();
1688 }
1689
1690 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1691
1692 static opt_machine_mode
1693 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1694 {
1695 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1696 {
1697 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1698 machine_mode pred_mode;
1699 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1700 return pred_mode;
1701 }
1702
1703 return default_get_mask_mode (nunits, nbytes);
1704 }
1705
1706 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1707
1708 static opt_machine_mode
1709 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1710 {
1711 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1712 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1713 machine_mode mode;
1714 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1715 if (inner_mode == GET_MODE_INNER (mode)
1716 && known_eq (nunits, GET_MODE_NUNITS (mode))
1717 && aarch64_sve_data_mode_p (mode))
1718 return mode;
1719 return opt_machine_mode ();
1720 }
1721
1722 /* Return the integer element mode associated with SVE mode MODE. */
1723
1724 static scalar_int_mode
1725 aarch64_sve_element_int_mode (machine_mode mode)
1726 {
1727 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1728 GET_MODE_NUNITS (mode));
1729 return int_mode_for_size (elt_bits, 0).require ();
1730 }
1731
1732 /* Return the integer vector mode associated with SVE mode MODE.
1733 Unlike mode_for_int_vector, this can handle the case in which
1734 MODE is a predicate (and thus has a different total size). */
1735
1736 static machine_mode
1737 aarch64_sve_int_mode (machine_mode mode)
1738 {
1739 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1740 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1741 }
1742
1743 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1744 prefer to use the first arithmetic operand as the else value if
1745 the else value doesn't matter, since that exactly matches the SVE
1746 destructive merging form. For ternary operations we could either
1747 pick the first operand and use FMAD-like instructions or the last
1748 operand and use FMLA-like instructions; the latter seems more
1749 natural. */
1750
1751 static tree
1752 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1753 {
1754 return nops == 3 ? ops[2] : ops[0];
1755 }
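/* For example, for a predicated fused multiply-add a * b + c, returning the
   accumulator C as the else value means a single merging FMLA gives the
   result in the active lanes and leaves C in the inactive ones, with no
   separate select needed.  */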
1756
1757 /* Implement TARGET_HARD_REGNO_NREGS. */
1758
1759 static unsigned int
1760 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1761 {
1762 /* ??? Logically we should only need to provide a value when
1763 HARD_REGNO_MODE_OK says that the combination is valid,
1764 but at the moment we need to handle all modes. Just ignore
1765 any runtime parts for registers that can't store them. */
1766 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1767 switch (aarch64_regno_regclass (regno))
1768 {
1769 case FP_REGS:
1770 case FP_LO_REGS:
1771 case FP_LO8_REGS:
1772 if (aarch64_sve_data_mode_p (mode))
1773 return exact_div (GET_MODE_SIZE (mode),
1774 BYTES_PER_SVE_VECTOR).to_constant ();
1775 return CEIL (lowest_size, UNITS_PER_VREG);
1776 case PR_REGS:
1777 case PR_LO_REGS:
1778 case PR_HI_REGS:
1779 return 1;
1780 default:
1781 return CEIL (lowest_size, UNITS_PER_WORD);
1782 }
1783 gcc_unreachable ();
1784 }
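/* For example, TImode occupies two general registers (CEIL (16, 8)), a
   128-bit Advanced SIMD vector occupies one FP register, a single SVE data
   vector always counts as one FP register regardless of -msve-vector-bits,
   and any predicate mode occupies one predicate register.  */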
1785
1786 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1787
1788 static bool
1789 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1790 {
1791 if (GET_MODE_CLASS (mode) == MODE_CC)
1792 return regno == CC_REGNUM;
1793
1794 if (regno == VG_REGNUM)
1795 /* This must have the same size as _Unwind_Word. */
1796 return mode == DImode;
1797
1798 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1799 if (vec_flags & VEC_SVE_PRED)
1800 return PR_REGNUM_P (regno);
1801
1802 if (PR_REGNUM_P (regno))
1803 return 0;
1804
1805 if (regno == SP_REGNUM)
1806 /* The purpose of comparing with ptr_mode is to support the
1807 global register variable associated with the stack pointer
1808 register via the syntax of asm ("wsp") in ILP32. */
1809 return mode == Pmode || mode == ptr_mode;
1810
1811 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1812 return mode == Pmode;
1813
1814 if (GP_REGNUM_P (regno))
1815 {
1816 if (known_le (GET_MODE_SIZE (mode), 8))
1817 return true;
1818 else if (known_le (GET_MODE_SIZE (mode), 16))
1819 return (regno & 1) == 0;
1820 }
1821 else if (FP_REGNUM_P (regno))
1822 {
1823 if (vec_flags & VEC_STRUCT)
1824 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1825 else
1826 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1827 }
1828
1829 return false;
1830 }
1831
1832 /* Implement TARGET_FNTYPE_ABI. */
1833
1834 static const predefined_function_abi &
1835 aarch64_fntype_abi (const_tree fntype)
1836 {
1837 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
1838 return aarch64_simd_abi ();
1839 return default_function_abi;
1840 }
1841
1842 /* Return true if this is a definition of a vectorized simd function. */
1843
1844 static bool
1845 aarch64_simd_decl_p (tree fndecl)
1846 {
1847 tree fntype;
1848
1849 if (fndecl == NULL)
1850 return false;
1851 fntype = TREE_TYPE (fndecl);
1852 if (fntype == NULL)
1853 return false;
1854
1855 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1856 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1857 return true;
1858
1859 return false;
1860 }
1861
1862 /* Return the mode a register save/restore should use. DImode for integer
1863 registers, DFmode for FP registers in non-SIMD functions (they only save
1864 the bottom half of a 128-bit register), or TFmode for FP registers in
1865 SIMD functions. */
1866
1867 static machine_mode
1868 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1869 {
1870 return GP_REGNUM_P (regno)
1871 ? E_DImode
1872 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1873 }
1874
1875 /* Return true if the instruction is a call to a SIMD function, false
1876 if it is not a SIMD function or if we do not know anything about
1877 the function. */
1878
1879 static bool
1880 aarch64_simd_call_p (const rtx_insn *insn)
1881 {
1882 rtx symbol;
1883 rtx call;
1884 tree fndecl;
1885
1886 gcc_assert (CALL_P (insn));
1887 call = get_call_rtx_from (insn);
1888 symbol = XEXP (XEXP (call, 0), 0);
1889 if (GET_CODE (symbol) != SYMBOL_REF)
1890 return false;
1891 fndecl = SYMBOL_REF_DECL (symbol);
1892 if (!fndecl)
1893 return false;
1894
1895 return aarch64_simd_decl_p (fndecl);
1896 }
1897
1898 /* Implement TARGET_INSN_CALLEE_ABI. */
1899
1900 const predefined_function_abi &
1901 aarch64_insn_callee_abi (const rtx_insn *insn)
1902 {
1903 if (aarch64_simd_call_p (insn))
1904 return aarch64_simd_abi ();
1905 return default_function_abi;
1906 }
1907
1908 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1909 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1910 clobbers the top 64 bits when restoring the bottom 64 bits. */
1911
1912 static bool
1913 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
1914 unsigned int regno,
1915 machine_mode mode)
1916 {
1917 if (FP_REGNUM_P (regno))
1918 {
1919 bool simd_p = (abi_id == ARM_PCS_SIMD);
1920 poly_int64 per_register_size = GET_MODE_SIZE (mode);
1921 unsigned int nregs = hard_regno_nregs (regno, mode);
1922 if (nregs > 1)
1923 per_register_size = exact_div (per_register_size, nregs);
1924 return maybe_gt (per_register_size, simd_p ? 16 : 8);
1925 }
1926 return false;
1927 }
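/* For example, a V16QImode value (16 bytes) held in an FP register is
   part-clobbered by a call to a base-PCS function, since only the low
   8 bytes are preserved, but not by a call to an aarch64_vector_pcs
   function, which preserves all 16 bytes.  A DFmode value is never
   part-clobbered, because it fits entirely in the preserved part.  */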
1928
1929 /* Implement REGMODE_NATURAL_SIZE. */
1930 poly_uint64
1931 aarch64_regmode_natural_size (machine_mode mode)
1932 {
1933 /* The natural size for SVE data modes is one SVE data vector,
1934 and similarly for predicates. We can't independently modify
1935 anything smaller than that. */
1936 /* ??? For now, only do this for variable-width SVE registers.
1937 Doing it for constant-sized registers breaks lower-subreg.c. */
1938 /* ??? And once that's fixed, we should probably have similar
1939 code for Advanced SIMD. */
1940 if (!aarch64_sve_vg.is_constant ())
1941 {
1942 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1943 if (vec_flags & VEC_SVE_PRED)
1944 return BYTES_PER_SVE_PRED;
1945 if (vec_flags & VEC_SVE_DATA)
1946 return BYTES_PER_SVE_VECTOR;
1947 }
1948 return UNITS_PER_WORD;
1949 }
1950
1951 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1952 machine_mode
1953 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1954 machine_mode mode)
1955 {
1956 /* The predicate mode determines which bits are significant and
1957 which are "don't care". Decreasing the number of lanes would
1958 lose data while increasing the number of lanes would make bits
1959 unnecessarily significant. */
1960 if (PR_REGNUM_P (regno))
1961 return mode;
1962 if (known_ge (GET_MODE_SIZE (mode), 4))
1963 return mode;
1964 else
1965 return SImode;
1966 }
1967
1968 /* Return true if I's bits are consecutive ones from the MSB. */
1969 bool
1970 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1971 {
1972 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1973 }
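/* For example, 0xffffffffffff0000 gives -i == 0x10000, whose exact_log2
   is 16, so the function returns true; 0x00ffff00 is rejected because
   its negation is not a power of two.  */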
1974
1975 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1976 that strcpy from constants will be faster. */
1977
1978 static HOST_WIDE_INT
1979 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1980 {
1981 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1982 return MAX (align, BITS_PER_WORD);
1983 return align;
1984 }
1985
1986 /* Return true if calls to DECL should be treated as
1987 long-calls (i.e. called via a register). */
1988 static bool
1989 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1990 {
1991 return false;
1992 }
1993
1994 /* Return true if calls to symbol-ref SYM should be treated as
1995 long-calls (i.e. called via a register). */
1996 bool
1997 aarch64_is_long_call_p (rtx sym)
1998 {
1999 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2000 }
2001
2002 /* Return true if calls to symbol-ref SYM should not go through
2003 plt stubs. */
2004
2005 bool
2006 aarch64_is_noplt_call_p (rtx sym)
2007 {
2008 const_tree decl = SYMBOL_REF_DECL (sym);
2009
2010 if (flag_pic
2011 && decl
2012 && (!flag_plt
2013 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2014 && !targetm.binds_local_p (decl))
2015 return true;
2016
2017 return false;
2018 }
2019
2020 /* Return true if the offsets to a zero/sign-extract operation
2021 represent an expression that matches an extend operation. The
2022 operands represent the parameters from
2023
2024 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2025 bool
2026 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2027 rtx extract_imm)
2028 {
2029 HOST_WIDE_INT mult_val, extract_val;
2030
2031 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2032 return false;
2033
2034 mult_val = INTVAL (mult_imm);
2035 extract_val = INTVAL (extract_imm);
2036
2037 if (extract_val > 8
2038 && extract_val < GET_MODE_BITSIZE (mode)
2039 && exact_log2 (extract_val & ~7) > 0
2040 && (extract_val & 7) <= 4
2041 && mult_val == (1 << (extract_val & 7)))
2042 return true;
2043
2044 return false;
2045 }
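/* For example, MULT_IMM == 8 and EXTRACT_IMM == 35 satisfy the check:
   extracting bits [34:0] of (reg * 8) is the same as zero- or
   sign-extending the low 32 bits of REG and shifting the result left
   by 3, which matches extended-register forms such as "uxtw #3".  */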
2046
2047 /* Emit an insn that's a simple single-set. Both the operands must be
2048 known to be valid. */
2049 inline static rtx_insn *
2050 emit_set_insn (rtx x, rtx y)
2051 {
2052 return emit_insn (gen_rtx_SET (x, y));
2053 }
2054
2055 /* X and Y are two things to compare using CODE. Emit the compare insn and
2056 return the rtx for register 0 in the proper mode. */
2057 rtx
2058 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2059 {
2060 machine_mode cmp_mode = GET_MODE (x);
2061 machine_mode cc_mode;
2062 rtx cc_reg;
2063
2064 if (cmp_mode == TImode)
2065 {
2066 gcc_assert (code == NE);
2067
2068 cc_mode = CCmode;
2069 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2070
2071 rtx x_lo = operand_subword (x, 0, 0, TImode);
2072 rtx y_lo = operand_subword (y, 0, 0, TImode);
2073 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2074
2075 rtx x_hi = operand_subword (x, 1, 0, TImode);
2076 rtx y_hi = operand_subword (y, 1, 0, TImode);
2077 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2078 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2079 GEN_INT (AARCH64_EQ)));
2080 }
2081 else
2082 {
2083 cc_mode = SELECT_CC_MODE (code, x, y);
2084 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2085 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2086 }
2087 return cc_reg;
2088 }
2089
2090 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2091
2092 static rtx
2093 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2094 machine_mode y_mode)
2095 {
2096 if (y_mode == E_QImode || y_mode == E_HImode)
2097 {
2098 if (CONST_INT_P (y))
2099 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2100 else
2101 {
2102 rtx t, cc_reg;
2103 machine_mode cc_mode;
2104
2105 t = gen_rtx_ZERO_EXTEND (SImode, y);
2106 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2107 cc_mode = CC_SWPmode;
2108 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2109 emit_set_insn (cc_reg, t);
2110 return cc_reg;
2111 }
2112 }
2113
2114 if (!aarch64_plus_operand (y, y_mode))
2115 y = force_reg (y_mode, y);
2116
2117 return aarch64_gen_compare_reg (code, x, y);
2118 }
2119
2120 /* Build the SYMBOL_REF for __tls_get_addr. */
2121
2122 static GTY(()) rtx tls_get_addr_libfunc;
2123
2124 rtx
2125 aarch64_tls_get_addr (void)
2126 {
2127 if (!tls_get_addr_libfunc)
2128 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2129 return tls_get_addr_libfunc;
2130 }
2131
2132 /* Return the TLS model to use for ADDR. */
2133
2134 static enum tls_model
2135 tls_symbolic_operand_type (rtx addr)
2136 {
2137 enum tls_model tls_kind = TLS_MODEL_NONE;
2138 if (GET_CODE (addr) == CONST)
2139 {
2140 poly_int64 addend;
2141 rtx sym = strip_offset (addr, &addend);
2142 if (GET_CODE (sym) == SYMBOL_REF)
2143 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2144 }
2145 else if (GET_CODE (addr) == SYMBOL_REF)
2146 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2147
2148 return tls_kind;
2149 }
2150
2151 /* We'll allow lo_sums in our legitimate addresses, so that combine
2152 can take care of combining addresses where necessary, but for
2153 generation purposes we'll generate the address
2154 as:
2155 RTL Absolute
2156 tmp = hi (symbol_ref); adrp x1, foo
2157 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2158 nop
2159
2160 PIC TLS
2161 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2162 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2163 bl __tls_get_addr
2164 nop
2165
2166 Load TLS symbol, depending on TLS mechanism and TLS access model.
2167
2168 Global Dynamic - Traditional TLS:
2169 adrp tmp, :tlsgd:imm
2170 add dest, tmp, #:tlsgd_lo12:imm
2171 bl __tls_get_addr
2172
2173 Global Dynamic - TLS Descriptors:
2174 adrp dest, :tlsdesc:imm
2175 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2176 add dest, dest, #:tlsdesc_lo12:imm
2177 blr tmp
2178 mrs tp, tpidr_el0
2179 add dest, dest, tp
2180
2181 Initial Exec:
2182 mrs tp, tpidr_el0
2183 adrp tmp, :gottprel:imm
2184 ldr dest, [tmp, #:gottprel_lo12:imm]
2185 add dest, dest, tp
2186
2187 Local Exec:
2188 mrs tp, tpidr_el0
2189 add t0, tp, #:tprel_hi12:imm, lsl #12
2190 add t0, t0, #:tprel_lo12_nc:imm
2191 */
2192
2193 static void
2194 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2195 enum aarch64_symbol_type type)
2196 {
2197 switch (type)
2198 {
2199 case SYMBOL_SMALL_ABSOLUTE:
2200 {
2201 /* In ILP32, the mode of dest can be either SImode or DImode. */
2202 rtx tmp_reg = dest;
2203 machine_mode mode = GET_MODE (dest);
2204
2205 gcc_assert (mode == Pmode || mode == ptr_mode);
2206
2207 if (can_create_pseudo_p ())
2208 tmp_reg = gen_reg_rtx (mode);
2209
2210 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2211 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2212 return;
2213 }
2214
2215 case SYMBOL_TINY_ABSOLUTE:
2216 emit_insn (gen_rtx_SET (dest, imm));
2217 return;
2218
2219 case SYMBOL_SMALL_GOT_28K:
2220 {
2221 machine_mode mode = GET_MODE (dest);
2222 rtx gp_rtx = pic_offset_table_rtx;
2223 rtx insn;
2224 rtx mem;
2225
2226 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2227 here before RTL expansion.  Tree IVOPTS will generate RTL patterns
2228 to decide rtx costs, in which case pic_offset_table_rtx is not
2229 initialized.  In that case there is no need to generate the first
2230 adrp instruction, as the final cost for global variable access is
2231 one instruction. */
2232 if (gp_rtx != NULL)
2233 {
2234 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
2235 we use the page base as the GOT base, the first page may be
2236 wasted; in the worst case only 28K of space is left for the GOT).
2237 
2238 The generated instruction sequence for accessing a global
2239 variable is:
2240 
2241 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2242 
2243 Only one instruction is needed.  But we must initialize
2244 pic_offset_table_rtx properly.  We generate an initialization insn
2245 for every global access, and let CSE remove all the redundant
2246 copies.
2247 
2248 The final sequence for multiple global variable accesses will look like:
2249
2250 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2251
2252 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2253 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2254 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2255 ... */
2256
2257 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2258 crtl->uses_pic_offset_table = 1;
2259 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2260
2261 if (mode != GET_MODE (gp_rtx))
2262 gp_rtx = gen_lowpart (mode, gp_rtx);
2263
2264 }
2265
2266 if (mode == ptr_mode)
2267 {
2268 if (mode == DImode)
2269 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2270 else
2271 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2272
2273 mem = XVECEXP (SET_SRC (insn), 0, 0);
2274 }
2275 else
2276 {
2277 gcc_assert (mode == Pmode);
2278
2279 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2280 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2281 }
2282
2283 /* The operand is expected to be a MEM.  Whenever the related insn
2284 pattern changes, the code above which calculates MEM should be
2285 updated. */
2286 gcc_assert (GET_CODE (mem) == MEM);
2287 MEM_READONLY_P (mem) = 1;
2288 MEM_NOTRAP_P (mem) = 1;
2289 emit_insn (insn);
2290 return;
2291 }
2292
2293 case SYMBOL_SMALL_GOT_4G:
2294 {
2295 /* In ILP32, the mode of dest can be either SImode or DImode,
2296 while the got entry is always of SImode size. The mode of
2297 dest depends on how dest is used: if dest is assigned to a
2298 pointer (e.g. stored in memory), it has SImode; it may have
2299 DImode if dest is dereferenced to access the memory.
2300 This is why we have to handle three different ldr_got_small
2301 patterns here (two patterns for ILP32). */
2302
2303 rtx insn;
2304 rtx mem;
2305 rtx tmp_reg = dest;
2306 machine_mode mode = GET_MODE (dest);
2307
2308 if (can_create_pseudo_p ())
2309 tmp_reg = gen_reg_rtx (mode);
2310
2311 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2312 if (mode == ptr_mode)
2313 {
2314 if (mode == DImode)
2315 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2316 else
2317 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2318
2319 mem = XVECEXP (SET_SRC (insn), 0, 0);
2320 }
2321 else
2322 {
2323 gcc_assert (mode == Pmode);
2324
2325 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2326 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2327 }
2328
2329 gcc_assert (GET_CODE (mem) == MEM);
2330 MEM_READONLY_P (mem) = 1;
2331 MEM_NOTRAP_P (mem) = 1;
2332 emit_insn (insn);
2333 return;
2334 }
2335
2336 case SYMBOL_SMALL_TLSGD:
2337 {
2338 rtx_insn *insns;
2339 machine_mode mode = GET_MODE (dest);
2340 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2341
2342 start_sequence ();
2343 if (TARGET_ILP32)
2344 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2345 else
2346 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2347 insns = get_insns ();
2348 end_sequence ();
2349
2350 RTL_CONST_CALL_P (insns) = 1;
2351 emit_libcall_block (insns, dest, result, imm);
2352 return;
2353 }
2354
2355 case SYMBOL_SMALL_TLSDESC:
2356 {
2357 machine_mode mode = GET_MODE (dest);
2358 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2359 rtx tp;
2360
2361 gcc_assert (mode == Pmode || mode == ptr_mode);
2362
2363 /* In ILP32, the got entry is always of SImode size. Unlike
2364 small GOT, the dest is fixed at reg 0. */
2365 if (TARGET_ILP32)
2366 emit_insn (gen_tlsdesc_small_si (imm));
2367 else
2368 emit_insn (gen_tlsdesc_small_di (imm));
2369 tp = aarch64_load_tp (NULL);
2370
2371 if (mode != Pmode)
2372 tp = gen_lowpart (mode, tp);
2373
2374 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2375 if (REG_P (dest))
2376 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2377 return;
2378 }
2379
2380 case SYMBOL_SMALL_TLSIE:
2381 {
2382 /* In ILP32, the mode of dest can be either SImode or DImode,
2383 while the got entry is always of SImode size. The mode of
2384 dest depends on how dest is used: if dest is assigned to a
2385 pointer (e.g. stored in memory), it has SImode; it may have
2386 DImode if dest is dereferenced to access the memory.
2387 This is why we have to handle three different tlsie_small
2388 patterns here (two patterns for ILP32). */
2389 machine_mode mode = GET_MODE (dest);
2390 rtx tmp_reg = gen_reg_rtx (mode);
2391 rtx tp = aarch64_load_tp (NULL);
2392
2393 if (mode == ptr_mode)
2394 {
2395 if (mode == DImode)
2396 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2397 else
2398 {
2399 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2400 tp = gen_lowpart (mode, tp);
2401 }
2402 }
2403 else
2404 {
2405 gcc_assert (mode == Pmode);
2406 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2407 }
2408
2409 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2410 if (REG_P (dest))
2411 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2412 return;
2413 }
2414
2415 case SYMBOL_TLSLE12:
2416 case SYMBOL_TLSLE24:
2417 case SYMBOL_TLSLE32:
2418 case SYMBOL_TLSLE48:
2419 {
2420 machine_mode mode = GET_MODE (dest);
2421 rtx tp = aarch64_load_tp (NULL);
2422
2423 if (mode != Pmode)
2424 tp = gen_lowpart (mode, tp);
2425
2426 switch (type)
2427 {
2428 case SYMBOL_TLSLE12:
2429 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2430 (dest, tp, imm));
2431 break;
2432 case SYMBOL_TLSLE24:
2433 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2434 (dest, tp, imm));
2435 break;
2436 case SYMBOL_TLSLE32:
2437 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2438 (dest, imm));
2439 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2440 (dest, dest, tp));
2441 break;
2442 case SYMBOL_TLSLE48:
2443 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2444 (dest, imm));
2445 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2446 (dest, dest, tp));
2447 break;
2448 default:
2449 gcc_unreachable ();
2450 }
2451
2452 if (REG_P (dest))
2453 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2454 return;
2455 }
2456
2457 case SYMBOL_TINY_GOT:
2458 emit_insn (gen_ldr_got_tiny (dest, imm));
2459 return;
2460
2461 case SYMBOL_TINY_TLSIE:
2462 {
2463 machine_mode mode = GET_MODE (dest);
2464 rtx tp = aarch64_load_tp (NULL);
2465
2466 if (mode == ptr_mode)
2467 {
2468 if (mode == DImode)
2469 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2470 else
2471 {
2472 tp = gen_lowpart (mode, tp);
2473 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2474 }
2475 }
2476 else
2477 {
2478 gcc_assert (mode == Pmode);
2479 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2480 }
2481
2482 if (REG_P (dest))
2483 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2484 return;
2485 }
2486
2487 default:
2488 gcc_unreachable ();
2489 }
2490 }
2491
2492 /* Emit a move from SRC to DEST. Assume that the move expanders can
2493 handle all moves if !can_create_pseudo_p (). The distinction is
2494 important because, unlike emit_move_insn, the move expanders know
2495 how to force Pmode objects into the constant pool even when the
2496 constant pool address is not itself legitimate. */
2497 static rtx
2498 aarch64_emit_move (rtx dest, rtx src)
2499 {
2500 return (can_create_pseudo_p ()
2501 ? emit_move_insn (dest, src)
2502 : emit_move_insn_1 (dest, src));
2503 }
2504
2505 /* Apply UNOPTAB to OP and store the result in DEST. */
2506
2507 static void
2508 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2509 {
2510 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2511 if (dest != tmp)
2512 emit_move_insn (dest, tmp);
2513 }
2514
2515 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2516
2517 static void
2518 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2519 {
2520 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2521 OPTAB_DIRECT);
2522 if (dest != tmp)
2523 emit_move_insn (dest, tmp);
2524 }
2525
2526 /* Split a 128-bit move operation into two 64-bit move operations,
2527 taking care to handle partial overlap of register to register
2528 copies. Special cases are needed when moving between GP regs and
2529 FP regs. SRC can be a register, constant or memory; DST a register
2530 or memory. If either operand is memory it must not have any side
2531 effects. */
2532 void
2533 aarch64_split_128bit_move (rtx dst, rtx src)
2534 {
2535 rtx dst_lo, dst_hi;
2536 rtx src_lo, src_hi;
2537
2538 machine_mode mode = GET_MODE (dst);
2539
2540 gcc_assert (mode == TImode || mode == TFmode);
2541 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2542 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2543
2544 if (REG_P (dst) && REG_P (src))
2545 {
2546 int src_regno = REGNO (src);
2547 int dst_regno = REGNO (dst);
2548
2549 /* Handle FP <-> GP regs. */
2550 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2551 {
2552 src_lo = gen_lowpart (word_mode, src);
2553 src_hi = gen_highpart (word_mode, src);
2554
2555 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2556 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2557 return;
2558 }
2559 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2560 {
2561 dst_lo = gen_lowpart (word_mode, dst);
2562 dst_hi = gen_highpart (word_mode, dst);
2563
2564 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2565 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2566 return;
2567 }
2568 }
2569
2570 dst_lo = gen_lowpart (word_mode, dst);
2571 dst_hi = gen_highpart (word_mode, dst);
2572 src_lo = gen_lowpart (word_mode, src);
2573 src_hi = gen_highpart_mode (word_mode, mode, src);
2574
2575 /* At most one pairing may overlap. */
2576 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2577 {
2578 aarch64_emit_move (dst_hi, src_hi);
2579 aarch64_emit_move (dst_lo, src_lo);
2580 }
2581 else
2582 {
2583 aarch64_emit_move (dst_lo, src_lo);
2584 aarch64_emit_move (dst_hi, src_hi);
2585 }
2586 }
2587
2588 bool
2589 aarch64_split_128bit_move_p (rtx dst, rtx src)
2590 {
2591 return (! REG_P (src)
2592 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2593 }
2594
2595 /* Split a complex SIMD combine. */
2596
2597 void
2598 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2599 {
2600 machine_mode src_mode = GET_MODE (src1);
2601 machine_mode dst_mode = GET_MODE (dst);
2602
2603 gcc_assert (VECTOR_MODE_P (dst_mode));
2604 gcc_assert (register_operand (dst, dst_mode)
2605 && register_operand (src1, src_mode)
2606 && register_operand (src2, src_mode));
2607
2608 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2609 return;
2610 }
2611
2612 /* Split a complex SIMD move. */
2613
2614 void
2615 aarch64_split_simd_move (rtx dst, rtx src)
2616 {
2617 machine_mode src_mode = GET_MODE (src);
2618 machine_mode dst_mode = GET_MODE (dst);
2619
2620 gcc_assert (VECTOR_MODE_P (dst_mode));
2621
2622 if (REG_P (dst) && REG_P (src))
2623 {
2624 gcc_assert (VECTOR_MODE_P (src_mode));
2625 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2626 }
2627 }
2628
2629 bool
2630 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2631 machine_mode ymode, rtx y)
2632 {
2633 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2634 gcc_assert (r != NULL);
2635 return rtx_equal_p (x, r);
2636 }
2637
2638 /* Return TARGET if it is nonnull and a register of mode MODE.
2639 Otherwise, return a fresh register of mode MODE if we can,
2640 or TARGET reinterpreted as MODE if we can't. */
2641
2642 static rtx
2643 aarch64_target_reg (rtx target, machine_mode mode)
2644 {
2645 if (target && REG_P (target) && GET_MODE (target) == mode)
2646 return target;
2647 if (!can_create_pseudo_p ())
2648 {
2649 gcc_assert (target);
2650 return gen_lowpart (mode, target);
2651 }
2652 return gen_reg_rtx (mode);
2653 }
2654
2655 /* Return a register that contains the constant in BUILDER, given that
2656 the constant is a legitimate move operand. Use TARGET as the register
2657 if it is nonnull and convenient. */
2658
2659 static rtx
2660 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2661 {
2662 rtx src = builder.build ();
2663 target = aarch64_target_reg (target, GET_MODE (src));
2664 emit_insn (gen_rtx_SET (target, src));
2665 return target;
2666 }
2667
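/* Return a register of mode MODE that holds VALUE.  Create a new pseudo
   register if we can; otherwise emit a move of VALUE into X (which must
   be nonnull) and return X.  */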
2668 static rtx
2669 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2670 {
2671 if (can_create_pseudo_p ())
2672 return force_reg (mode, value);
2673 else
2674 {
2675 gcc_assert (x);
2676 aarch64_emit_move (x, value);
2677 return x;
2678 }
2679 }
2680
2681 /* Return true if predicate value X is a constant in which every element
2682 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2683 value, i.e. as a predicate in which all bits are significant. */
2684
2685 static bool
2686 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2687 {
2688 if (GET_CODE (x) != CONST_VECTOR)
2689 return false;
2690
2691 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2692 GET_MODE_NUNITS (GET_MODE (x)));
2693 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2694 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2695 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2696
2697 unsigned int nelts = const_vector_encoded_nelts (x);
2698 for (unsigned int i = 0; i < nelts; ++i)
2699 {
2700 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2701 if (!CONST_INT_P (elt))
2702 return false;
2703
2704 builder.quick_push (elt);
2705 for (unsigned int j = 1; j < factor; ++j)
2706 builder.quick_push (const0_rtx);
2707 }
2708 builder.finalize ();
2709 return true;
2710 }
2711
2712 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2713 widest predicate element size it can have (that is, the largest size
2714 for which each element would still be 0 or 1). */
2715
2716 unsigned int
2717 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2718 {
2719 /* Start with the most optimistic assumption: that we only need
2720 one bit per pattern. This is what we will use if only the first
2721 bit in each pattern is ever set. */
2722 unsigned int mask = GET_MODE_SIZE (DImode);
2723 mask |= builder.npatterns ();
2724
2725 /* Look for set bits. */
2726 unsigned int nelts = builder.encoded_nelts ();
2727 for (unsigned int i = 1; i < nelts; ++i)
2728 if (INTVAL (builder.elt (i)) != 0)
2729 {
2730 if (i & 1)
2731 return 1;
2732 mask |= i;
2733 }
2734 return mask & -mask;
2735 }
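/* For example, a constant in which only every eighth bit can be set
   (the VNx16BI view of a .D predicate) typically gives 8, one in which
   only every fourth bit can be set gives 4, and any constant with a set
   bit at an odd index is immediately limited to an element size of 1.  */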
2736
2737 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2738 that the constant would have with predicate element size ELT_SIZE
2739 (ignoring the upper bits in each element) and return:
2740
2741 * -1 if all bits are set
2742 * N if the predicate has N leading set bits followed by all clear bits
2743 * 0 if the predicate does not have any of these forms. */
2744
2745 int
2746 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2747 unsigned int elt_size)
2748 {
2749 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2750 followed by set bits. */
2751 if (builder.nelts_per_pattern () == 3)
2752 return 0;
2753
2754 /* Skip over leading set bits. */
2755 unsigned int nelts = builder.encoded_nelts ();
2756 unsigned int i = 0;
2757 for (; i < nelts; i += elt_size)
2758 if (INTVAL (builder.elt (i)) == 0)
2759 break;
2760 unsigned int vl = i / elt_size;
2761
2762 /* Check for the all-true case. */
2763 if (i == nelts)
2764 return -1;
2765
2766 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2767 repeating pattern of set bits followed by clear bits. */
2768 if (builder.nelts_per_pattern () != 2)
2769 return 0;
2770
2771 /* We have a "foreground" value and a duplicated "background" value.
2772 If the background might repeat and the last set bit belongs to it,
2773 we might have set bits followed by clear bits followed by set bits. */
2774 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2775 return 0;
2776
2777 /* Make sure that the rest are all clear. */
2778 for (; i < nelts; i += elt_size)
2779 if (INTVAL (builder.elt (i)) != 0)
2780 return 0;
2781
2782 return vl;
2783 }
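/* For example, with ELT_SIZE == 2 only every other bit is inspected, so
   the constant { 1, 0, 1, 0, 1, 0, 0, 0, ... } counts as three leading
   set .H elements followed by clear ones and would yield 3, while an
   all-ones constant yields -1 for any ELT_SIZE.  */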
2784
2785 /* See if there is an svpattern that encodes an SVE predicate of mode
2786 PRED_MODE in which the first VL bits are set and the rest are clear.
2787 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2788 A VL of -1 indicates an all-true vector. */
2789
2790 aarch64_svpattern
2791 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2792 {
2793 if (vl < 0)
2794 return AARCH64_SV_ALL;
2795
2796 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2797 return AARCH64_NUM_SVPATTERNS;
2798
2799 if (vl >= 1 && vl <= 8)
2800 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2801
2802 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2803 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2804
2805 int max_vl;
2806 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2807 {
2808 if (vl == (max_vl / 3) * 3)
2809 return AARCH64_SV_MUL3;
2810 /* These would only trigger for non-power-of-2 lengths. */
2811 if (vl == (max_vl & -4))
2812 return AARCH64_SV_MUL4;
2813 if (vl == (1 << floor_log2 (max_vl)))
2814 return AARCH64_SV_POW2;
2815 if (vl == max_vl)
2816 return AARCH64_SV_ALL;
2817 }
2818 return AARCH64_NUM_SVPATTERNS;
2819 }
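/* For example, VL == 3 maps to AARCH64_SV_VL3, VL == 32 maps to
   AARCH64_SV_VL32 and VL == -1 maps to AARCH64_SV_ALL.  With a fixed
   vector length (e.g. -msve-vector-bits=512, giving 64 .B elements),
   VL == 63 maps to AARCH64_SV_MUL3, since 63 == (64 / 3) * 3.  */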
2820
2821 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2822 bits has the lowest bit set and the upper bits clear. This is the
2823 VNx16BImode equivalent of a PTRUE for controlling elements of
2824 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2825 all bits are significant, even the upper zeros. */
2826
2827 rtx
2828 aarch64_ptrue_all (unsigned int elt_size)
2829 {
2830 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2831 builder.quick_push (const1_rtx);
2832 for (unsigned int i = 1; i < elt_size; ++i)
2833 builder.quick_push (const0_rtx);
2834 return builder.build ();
2835 }
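/* For example, aarch64_ptrue_all (2) builds the constant
   { 1, 0, 1, 0, ... }: every .H element is active, and the odd-numbered
   bits, which a .H operation would ignore, are explicitly zero.  */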
2836
2837 /* Return an all-true predicate register of mode MODE. */
2838
2839 rtx
2840 aarch64_ptrue_reg (machine_mode mode)
2841 {
2842 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2843 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2844 return gen_lowpart (mode, reg);
2845 }
2846
2847 /* Return an all-false predicate register of mode MODE. */
2848
2849 rtx
2850 aarch64_pfalse_reg (machine_mode mode)
2851 {
2852 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2853 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2854 return gen_lowpart (mode, reg);
2855 }
2856
2857 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2858 true, or alternatively if we know that the operation predicated by
2859 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
2860 aarch64_sve_gp_strictness operand that describes the operation
2861 predicated by PRED1[0]. */
2862
2863 bool
2864 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2865 {
2866 machine_mode mode = GET_MODE (pred2);
2867 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2868 && mode == GET_MODE (pred1[0])
2869 && aarch64_sve_gp_strictness (pred1[1], SImode));
2870 return (pred1[0] == CONSTM1_RTX (mode)
2871 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2872 || rtx_equal_p (pred1[0], pred2));
2873 }
2874
2875 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2876 for it. PRED2[0] is the predicate for the instruction whose result
2877 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2878 for it. Return true if we can prove that the two predicates are
2879 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2880 with PRED1[0] without changing behavior. */
2881
2882 bool
2883 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2884 {
2885 machine_mode mode = GET_MODE (pred1[0]);
2886 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2887 && mode == GET_MODE (pred2[0])
2888 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2889 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2890
2891 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2892 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2893 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2894 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2895 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2896 }
2897
2898 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2899 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2900 Use TARGET as the target register if nonnull and convenient. */
2901
2902 static rtx
2903 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2904 machine_mode data_mode, rtx op1, rtx op2)
2905 {
2906 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2907 expand_operand ops[5];
2908 create_output_operand (&ops[0], target, pred_mode);
2909 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2910 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2911 create_input_operand (&ops[3], op1, data_mode);
2912 create_input_operand (&ops[4], op2, data_mode);
2913 expand_insn (icode, 5, ops);
2914 return ops[0].value;
2915 }
2916
2917 /* Use a comparison to convert integer vector SRC into MODE, which is
2918 the corresponding SVE predicate mode. Use TARGET for the result
2919 if it's nonnull and convenient. */
2920
2921 static rtx
2922 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2923 {
2924 machine_mode src_mode = GET_MODE (src);
2925 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2926 src, CONST0_RTX (src_mode));
2927 }
2928
2929 /* Return true if we can move VALUE into a register using a single
2930 CNT[BHWD] instruction. */
2931
2932 static bool
2933 aarch64_sve_cnt_immediate_p (poly_int64 value)
2934 {
2935 HOST_WIDE_INT factor = value.coeffs[0];
2936 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2937 return (value.coeffs[1] == factor
2938 && IN_RANGE (factor, 2, 16 * 16)
2939 && (factor & 1) == 0
2940 && factor <= 16 * (factor & -factor));
2941 }
2942
2943 /* Likewise for rtx X. */
2944
2945 bool
2946 aarch64_sve_cnt_immediate_p (rtx x)
2947 {
2948 poly_int64 value;
2949 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2950 }
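/* For example, the number of .S elements in a vector is the poly_int64
   (4, 4), which a single CNTW can produce, and three times the vector
   size in bytes is (48, 48), which CNTB with MUL #3 can produce.
   A value whose two coefficients differ cannot come from any CNT
   instruction and is rejected.  */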
2951
2952 /* Return the asm string for an instruction with a CNT-like vector size
2953 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2954 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2955 first part of the operands template (the part that comes before the
2956 vector size itself). PATTERN is the pattern to use. FACTOR is the
2957 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
2958 in each quadword. If it is zero, we can use any element size. */
2959
2960 static char *
2961 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2962 aarch64_svpattern pattern,
2963 unsigned int factor,
2964 unsigned int nelts_per_vq)
2965 {
2966 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
2967
2968 if (nelts_per_vq == 0)
2969 /* There is some overlap in the ranges of the four CNT instructions.
2970 Here we always use the smallest possible element size, so that the
2971 multiplier is 1 wherever possible. */
2972 nelts_per_vq = factor & -factor;
2973 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2974 gcc_assert (IN_RANGE (shift, 1, 4));
2975 char suffix = "dwhb"[shift - 1];
2976
2977 factor >>= shift;
2978 unsigned int written;
2979 if (pattern == AARCH64_SV_ALL && factor == 1)
2980 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2981 prefix, suffix, operands);
2982 else if (factor == 1)
2983 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
2984 prefix, suffix, operands, svpattern_token (pattern));
2985 else
2986 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
2987 prefix, suffix, operands, svpattern_token (pattern),
2988 factor);
2989 gcc_assert (written < sizeof (buffer));
2990 return buffer;
2991 }
2992
2993 /* Return the asm string for an instruction with a CNT-like vector size
2994 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2995 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2996 first part of the operands template (the part that comes before the
2997 vector size itself). X is the value of the vector size operand,
2998 as a polynomial integer rtx; we need to convert this into an "all"
2999 pattern with a multiplier. */
3000
3001 char *
3002 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3003 rtx x)
3004 {
3005 poly_int64 value = rtx_to_poly_int64 (x);
3006 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3007 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3008 value.coeffs[1], 0);
3009 }
3010
3011 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3012
3013 bool
3014 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3015 {
3016 poly_int64 value;
3017 return (poly_int_rtx_p (x, &value)
3018 && (aarch64_sve_cnt_immediate_p (value)
3019 || aarch64_sve_cnt_immediate_p (-value)));
3020 }
3021
3022 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3023 operand 0. */
3024
3025 char *
3026 aarch64_output_sve_scalar_inc_dec (rtx offset)
3027 {
3028 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3029 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3030 if (offset_value.coeffs[1] > 0)
3031 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3032 offset_value.coeffs[1], 0);
3033 else
3034 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3035 -offset_value.coeffs[1], 0);
3036 }
3037
3038 /* Return true if we can add VALUE to a register using a single ADDVL
3039 or ADDPL instruction. */
3040
3041 static bool
3042 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3043 {
3044 HOST_WIDE_INT factor = value.coeffs[0];
3045 if (factor == 0 || value.coeffs[1] != factor)
3046 return false;
3047 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3048 and a value of 16 is one vector width. */
3049 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3050 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3051 }
3052
3053 /* Likewise for rtx X. */
3054
3055 bool
3056 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3057 {
3058 poly_int64 value;
3059 return (poly_int_rtx_p (x, &value)
3060 && aarch64_sve_addvl_addpl_immediate_p (value));
3061 }
3062
3063 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3064 to operand 1 and storing the result in operand 0. */
3065
3066 char *
3067 aarch64_output_sve_addvl_addpl (rtx offset)
3068 {
3069 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3070 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3071 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3072
3073 int factor = offset_value.coeffs[1];
3074 if ((factor & 15) == 0)
3075 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3076 else
3077 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3078 return buffer;
3079 }
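/* For example, an offset of three vector lengths is the poly_int64
   (48, 48); its factor of 48 is a multiple of 16, so it prints as
   "addvl %x0, %x1, #3".  An offset of three predicate lengths, (6, 6),
   prints as "addpl %x0, %x1, #3".  */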
3080
3081 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3082 instruction. If it is, store the number of elements in each vector
3083 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3084 factor in *FACTOR_OUT (if nonnull). */
3085
3086 bool
3087 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3088 unsigned int *nelts_per_vq_out)
3089 {
3090 rtx elt;
3091 poly_int64 value;
3092
3093 if (!const_vec_duplicate_p (x, &elt)
3094 || !poly_int_rtx_p (elt, &value))
3095 return false;
3096
3097 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3098 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3099 /* There's no vector INCB. */
3100 return false;
3101
3102 HOST_WIDE_INT factor = value.coeffs[0];
3103 if (value.coeffs[1] != factor)
3104 return false;
3105
3106 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3107 if ((factor % nelts_per_vq) != 0
3108 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3109 return false;
3110
3111 if (factor_out)
3112 *factor_out = factor;
3113 if (nelts_per_vq_out)
3114 *nelts_per_vq_out = nelts_per_vq;
3115 return true;
3116 }
3117
3118 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3119 instruction. */
3120
3121 bool
3122 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3123 {
3124 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3125 }
3126
3127 /* Return the asm template for an SVE vector INC or DEC instruction.
3128 OPERANDS gives the operands before the vector count and X is the
3129 value of the vector count operand itself. */
3130
3131 char *
3132 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3133 {
3134 int factor;
3135 unsigned int nelts_per_vq;
3136 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3137 gcc_unreachable ();
3138 if (factor < 0)
3139 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3140 -factor, nelts_per_vq);
3141 else
3142 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3143 factor, nelts_per_vq);
3144 }
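/* For example, adding a duplicate of (8, 8) to a VNx4SI vector (that is,
   adding twice the number of .S elements to every lane) passes the check
   above with FACTOR == 8 and NELTS_PER_VQ == 4, and prints as
   "incw <operands>, all, mul #2".  */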
3145
3146 static int
3147 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3148 scalar_int_mode mode)
3149 {
3150 int i;
3151 unsigned HOST_WIDE_INT val, val2, mask;
3152 int one_match, zero_match;
3153 int num_insns;
3154
3155 val = INTVAL (imm);
3156
3157 if (aarch64_move_imm (val, mode))
3158 {
3159 if (generate)
3160 emit_insn (gen_rtx_SET (dest, imm));
3161 return 1;
3162 }
3163
3164 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3165 (with XXXX non-zero). In that case check to see if the move can be done in
3166 a smaller mode. */
3167 val2 = val & 0xffffffff;
3168 if (mode == DImode
3169 && aarch64_move_imm (val2, SImode)
3170 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3171 {
3172 if (generate)
3173 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3174
3175 /* Check if we have to emit a second instruction by checking to see
3176 if any of the upper 32 bits of the original DImode value are set. */
3177 if (val == val2)
3178 return 1;
3179
3180 i = (val >> 48) ? 48 : 32;
3181
3182 if (generate)
3183 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3184 GEN_INT ((val >> i) & 0xffff)));
3185
3186 return 2;
3187 }
3188
3189 if ((val >> 32) == 0 || mode == SImode)
3190 {
3191 if (generate)
3192 {
3193 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3194 if (mode == SImode)
3195 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3196 GEN_INT ((val >> 16) & 0xffff)));
3197 else
3198 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3199 GEN_INT ((val >> 16) & 0xffff)));
3200 }
3201 return 2;
3202 }
3203
3204 /* Remaining cases are all for DImode. */
3205
3206 mask = 0xffff;
3207 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3208 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3209 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3210 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3211
3212 if (zero_match != 2 && one_match != 2)
3213 {
3214 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3215 For a 64-bit bitmask try whether changing 16 bits to all ones or
3216 zeroes creates a valid bitmask. To check any repeated bitmask,
3217 try using 16 bits from the other 32-bit half of val. */
3218
3219 for (i = 0; i < 64; i += 16, mask <<= 16)
3220 {
3221 val2 = val & ~mask;
3222 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3223 break;
3224 val2 = val | mask;
3225 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3226 break;
3227 val2 = val2 & ~mask;
3228 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3229 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3230 break;
3231 }
3232 if (i != 64)
3233 {
3234 if (generate)
3235 {
3236 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3237 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3238 GEN_INT ((val >> i) & 0xffff)));
3239 }
3240 return 2;
3241 }
3242 }
3243
3244 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3245 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3246 otherwise skip zero bits. */
3247
3248 num_insns = 1;
3249 mask = 0xffff;
3250 val2 = one_match > zero_match ? ~val : val;
3251 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3252
3253 if (generate)
3254 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3255 ? (val | ~(mask << i))
3256 : (val & (mask << i)))));
3257 for (i += 16; i < 64; i += 16)
3258 {
3259 if ((val2 & (mask << i)) == 0)
3260 continue;
3261 if (generate)
3262 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3263 GEN_INT ((val >> i) & 0xffff)));
3264 num_insns ++;
3265 }
3266
3267 return num_insns;
3268 }
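/* For example, the DImode constant 0x1234567800000000 is neither a
   16-bit immediate nor a bitmask immediate, and two of its 16-bit
   halves are zero, so the code above emits a MOVZ of 0x5678 into bits
   [47:32] followed by a MOVK of 0x1234 into bits [63:48], and returns 2.  */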
3269
3270 /* Return whether imm is a 128-bit immediate which is simple enough to
3271 expand inline. */
3272 bool
3273 aarch64_mov128_immediate (rtx imm)
3274 {
3275 if (GET_CODE (imm) == CONST_INT)
3276 return true;
3277
3278 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3279
3280 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3281 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3282
3283 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3284 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3285 }
3286
3287
3288 /* Return the number of temporary registers that aarch64_add_offset_1
3289 would need to add OFFSET to a register. */
3290
3291 static unsigned int
3292 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3293 {
3294 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3295 }
3296
3297 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3298 a non-polynomial OFFSET. MODE is the mode of the addition.
3299 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3300 be set and CFA adjustments added to the generated instructions.
3301
3302 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3303 temporary if register allocation is already complete. This temporary
3304 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3305 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3306 the immediate again.
3307
3308 Since this function may be used to adjust the stack pointer, we must
3309 ensure that it cannot cause transient stack deallocation (for example
3310 by first incrementing SP and then decrementing when adjusting by a
3311 large immediate). */
3312
3313 static void
3314 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3315 rtx src, HOST_WIDE_INT offset, rtx temp1,
3316 bool frame_related_p, bool emit_move_imm)
3317 {
3318 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3319 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3320
3321 HOST_WIDE_INT moffset = abs_hwi (offset);
3322 rtx_insn *insn;
3323
3324 if (!moffset)
3325 {
3326 if (!rtx_equal_p (dest, src))
3327 {
3328 insn = emit_insn (gen_rtx_SET (dest, src));
3329 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3330 }
3331 return;
3332 }
3333
3334 /* Single instruction adjustment. */
3335 if (aarch64_uimm12_shift (moffset))
3336 {
3337 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3338 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3339 return;
3340 }
3341
3342 /* Emit 2 additions/subtractions if the adjustment fits in 24 bits
3343 and either:
3344
3345 a) the offset cannot be loaded by a 16-bit move or
3346 b) there is no spare register into which we can move it. */
3347 if (moffset < 0x1000000
3348 && ((!temp1 && !can_create_pseudo_p ())
3349 || !aarch64_move_imm (moffset, mode)))
3350 {
3351 HOST_WIDE_INT low_off = moffset & 0xfff;
3352
3353 low_off = offset < 0 ? -low_off : low_off;
3354 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3355 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3356 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3357 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3358 return;
3359 }
3360
3361 /* Emit a move immediate if required and an addition/subtraction. */
3362 if (emit_move_imm)
3363 {
3364 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3365 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3366 }
3367 insn = emit_insn (offset < 0
3368 ? gen_sub3_insn (dest, src, temp1)
3369 : gen_add3_insn (dest, src, temp1));
3370 if (frame_related_p)
3371 {
3372 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3373 rtx adj = plus_constant (mode, src, offset);
3374 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3375 }
3376 }
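/* For example, an offset of 0x123456 is neither a shifted 12-bit
   immediate nor a MOV-able immediate, but it fits in 24 bits, so
   aarch64_add_offset_1 emits two additions: one of #0x456 and one of
   #0x123000.  */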
3377
3378 /* Return the number of temporary registers that aarch64_add_offset
3379 would need to move OFFSET into a register or add OFFSET to a register;
3380 ADD_P is true if we want the latter rather than the former. */
3381
3382 static unsigned int
3383 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3384 {
3385 /* This follows the same structure as aarch64_add_offset. */
3386 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3387 return 0;
3388
3389 unsigned int count = 0;
3390 HOST_WIDE_INT factor = offset.coeffs[1];
3391 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3392 poly_int64 poly_offset (factor, factor);
3393 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3394 /* Need one register for the ADDVL/ADDPL result. */
3395 count += 1;
3396 else if (factor != 0)
3397 {
3398 factor = abs (factor);
3399 if (factor > 16 * (factor & -factor))
3400 /* Need one register for the CNT result and one for the multiplication
3401 factor. If necessary, the second temporary can be reused for the
3402 constant part of the offset. */
3403 return 2;
3404 /* Need one register for the CNT result (which might then
3405 be shifted). */
3406 count += 1;
3407 }
3408 return count + aarch64_add_offset_1_temporaries (constant);
3409 }
3410
3411 /* If X can be represented as a poly_int64, return the number
3412 of temporaries that are required to add it to a register.
3413 Return -1 otherwise. */
3414
3415 int
3416 aarch64_add_offset_temporaries (rtx x)
3417 {
3418 poly_int64 offset;
3419 if (!poly_int_rtx_p (x, &offset))
3420 return -1;
3421 return aarch64_offset_temporaries (true, offset);
3422 }
3423
3424 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3425 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3426 be set and CFA adjustments added to the generated instructions.
3427
3428 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3429 temporary if register allocation is already complete. This temporary
3430 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3431 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3432 false to avoid emitting the immediate again.
3433
3434 TEMP2, if nonnull, is a second temporary register that doesn't
3435 overlap either DEST or SRC.
3436
3437 Since this function may be used to adjust the stack pointer, we must
3438 ensure that it cannot cause transient stack deallocation (for example
3439 by first incrementing SP and then decrementing when adjusting by a
3440 large immediate). */
3441
3442 static void
3443 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3444 poly_int64 offset, rtx temp1, rtx temp2,
3445 bool frame_related_p, bool emit_move_imm = true)
3446 {
3447 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3448 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3449 gcc_assert (temp1 == NULL_RTX
3450 || !frame_related_p
3451 || !reg_overlap_mentioned_p (temp1, dest));
3452 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3453
3454 /* Try using ADDVL or ADDPL to add the whole value. */
3455 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3456 {
3457 rtx offset_rtx = gen_int_mode (offset, mode);
3458 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3459 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3460 return;
3461 }
3462
3463 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3464 SVE vector register, over and above the minimum size of 128 bits.
3465 This is equivalent to half the value returned by CNTD with a
3466 vector shape of ALL. */
3467 HOST_WIDE_INT factor = offset.coeffs[1];
3468 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3469
3470 /* Try using ADDVL or ADDPL to add the VG-based part. */
3471 poly_int64 poly_offset (factor, factor);
3472 if (src != const0_rtx
3473 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3474 {
3475 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3476 if (frame_related_p)
3477 {
3478 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3479 RTX_FRAME_RELATED_P (insn) = true;
3480 src = dest;
3481 }
3482 else
3483 {
3484 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3485 src = aarch64_force_temporary (mode, temp1, addr);
3486 temp1 = temp2;
3487 temp2 = NULL_RTX;
3488 }
3489 }
3490 /* Otherwise use a CNT-based sequence. */
3491 else if (factor != 0)
3492 {
3493 /* Use a subtraction if we have a negative factor. */
3494 rtx_code code = PLUS;
3495 if (factor < 0)
3496 {
3497 factor = -factor;
3498 code = MINUS;
3499 }
3500
3501 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3502 into the multiplication. */
3503 rtx val;
3504 int shift = 0;
3505 if (factor & 1)
3506 /* Use a right shift by 1. */
3507 shift = -1;
3508 else
3509 factor /= 2;
3510 HOST_WIDE_INT low_bit = factor & -factor;
3511 if (factor <= 16 * low_bit)
3512 {
3513 if (factor > 16 * 8)
3514 {
3515 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3516 the value with the minimum multiplier and shift it into
3517 position. */
3518 int extra_shift = exact_log2 (low_bit);
3519 shift += extra_shift;
3520 factor >>= extra_shift;
3521 }
3522 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3523 }
3524 else
3525 {
3526 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3527 directly, since that should increase the chances of being
3528 able to use a shift and add sequence. If LOW_BIT itself
3529 is out of range, just use CNTD. */
3530 if (low_bit <= 16 * 8)
3531 factor /= low_bit;
3532 else
3533 low_bit = 1;
3534
3535 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3536 val = aarch64_force_temporary (mode, temp1, val);
3537
3538 if (can_create_pseudo_p ())
3539 {
3540 rtx coeff1 = gen_int_mode (factor, mode);
3541 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3542 }
3543 else
3544 {
3545 /* Go back to using a negative multiplication factor if we have
3546 no register from which to subtract. */
3547 if (code == MINUS && src == const0_rtx)
3548 {
3549 factor = -factor;
3550 code = PLUS;
3551 }
3552 rtx coeff1 = gen_int_mode (factor, mode);
3553 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3554 val = gen_rtx_MULT (mode, val, coeff1);
3555 }
3556 }
3557
3558 if (shift > 0)
3559 {
3560 /* Multiply by 1 << SHIFT. */
3561 val = aarch64_force_temporary (mode, temp1, val);
3562 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3563 }
3564 else if (shift == -1)
3565 {
3566 /* Divide by 2. */
3567 val = aarch64_force_temporary (mode, temp1, val);
3568 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3569 }
3570
3571 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3572 if (src != const0_rtx)
3573 {
3574 val = aarch64_force_temporary (mode, temp1, val);
3575 val = gen_rtx_fmt_ee (code, mode, src, val);
3576 }
3577 else if (code == MINUS)
3578 {
3579 val = aarch64_force_temporary (mode, temp1, val);
3580 val = gen_rtx_NEG (mode, val);
3581 }
3582
3583 if (constant == 0 || frame_related_p)
3584 {
3585 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3586 if (frame_related_p)
3587 {
3588 RTX_FRAME_RELATED_P (insn) = true;
3589 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3590 gen_rtx_SET (dest, plus_constant (Pmode, src,
3591 poly_offset)));
3592 }
3593 src = dest;
3594 if (constant == 0)
3595 return;
3596 }
3597 else
3598 {
3599 src = aarch64_force_temporary (mode, temp1, val);
3600 temp1 = temp2;
3601 temp2 = NULL_RTX;
3602 }
3603
3604 emit_move_imm = true;
3605 }
3606
3607 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3608 frame_related_p, emit_move_imm);
3609 }
3610
3611 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3612 than a poly_int64. */
3613
3614 void
3615 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3616 rtx offset_rtx, rtx temp1, rtx temp2)
3617 {
3618 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3619 temp1, temp2, false);
3620 }
3621
3622 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3623 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3624 if TEMP1 already contains abs (DELTA). */
3625
3626 static inline void
3627 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3628 {
3629 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3630 temp1, temp2, true, emit_move_imm);
3631 }
3632
3633 /* Subtract DELTA from the stack pointer, marking the instructions
3634 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3635 if nonnull. */
3636
3637 static inline void
3638 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3639 bool emit_move_imm = true)
3640 {
3641 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3642 temp1, temp2, frame_related_p, emit_move_imm);
3643 }
3644
3645 /* Set DEST to (vec_series BASE STEP). */
3646
3647 static void
3648 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3649 {
3650 machine_mode mode = GET_MODE (dest);
3651 scalar_mode inner = GET_MODE_INNER (mode);
3652
3653 /* Each operand can be a register or an immediate in the range [-16, 15]. */
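/* For example, (vec_series 0 1) becomes "INDEX Zd.<T>, #0, #1", giving
{ 0, 1, 2, ... }. */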
3654 if (!aarch64_sve_index_immediate_p (base))
3655 base = force_reg (inner, base);
3656 if (!aarch64_sve_index_immediate_p (step))
3657 step = force_reg (inner, step);
3658
3659 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3660 }
3661
3662 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3663 register of mode MODE. Use TARGET for the result if it's nonnull
3664 and convenient.
3665
3666 The two vector modes must have the same element mode. The behavior
3667 is to duplicate architectural lane N of SRC into architectural lanes
3668 N + I * STEP of the result. On big-endian targets, architectural
3669 lane 0 of an Advanced SIMD vector is the last element of the vector
3670 in memory layout, so for big-endian targets this operation has the
3671 effect of reversing SRC before duplicating it. Callers need to
3672 account for this. */
3673
3674 rtx
3675 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3676 {
3677 machine_mode src_mode = GET_MODE (src);
3678 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3679 insn_code icode = (BYTES_BIG_ENDIAN
3680 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3681 : code_for_aarch64_vec_duplicate_vq_le (mode));
3682
3683 unsigned int i = 0;
3684 expand_operand ops[3];
3685 create_output_operand (&ops[i++], target, mode);
3686 create_input_operand (&ops[i++], src, src_mode);
3687 if (BYTES_BIG_ENDIAN)
3688 {
3689 /* Create a PARALLEL describing the reversal of SRC. */
3690 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3691 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3692 nelts_per_vq - 1, -1);
3693 create_fixed_operand (&ops[i++], sel);
3694 }
3695 expand_insn (icode, i, ops);
3696 return ops[0].value;
3697 }
3698
3699 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3700 the memory image into DEST. Return true on success. */
3701
3702 static bool
3703 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3704 {
3705 src = force_const_mem (GET_MODE (src), src);
3706 if (!src)
3707 return false;
3708
3709 /* Make sure that the address is legitimate. */
3710 if (!aarch64_sve_ld1rq_operand_p (src))
3711 {
3712 rtx addr = force_reg (Pmode, XEXP (src, 0));
3713 src = replace_equiv_address (src, addr);
3714 }
3715
3716 machine_mode mode = GET_MODE (dest);
3717 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3718 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3719 rtx ptrue = aarch64_ptrue_reg (pred_mode);
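/* For 32-bit elements this emits, roughly, "LD1RQW Zd.S, Pg/Z, [addr]",
which loads one 128-bit block from memory and replicates it to every
quadword of the destination. */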
3720 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3721 return true;
3722 }
3723
3724 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3725 SVE data mode and isn't a legitimate constant. Use TARGET for the
3726 result if convenient.
3727
3728 The returned register can have whatever mode seems most natural
3729 given the contents of SRC. */
3730
3731 static rtx
3732 aarch64_expand_sve_const_vector (rtx target, rtx src)
3733 {
3734 machine_mode mode = GET_MODE (src);
3735 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3736 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3737 scalar_mode elt_mode = GET_MODE_INNER (mode);
3738 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3739 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3740
3741 if (nelts_per_pattern == 1 && encoded_bits == 128)
3742 {
3743 /* The constant is a duplicated quadword but can't be narrowed
3744 beyond a quadword. Get the memory image of the first quadword
3745 as a 128-bit vector and try using LD1RQ to load it from memory.
3746
3747 The effect for both endiannesses is to load memory lane N into
3748 architectural lanes N + I * STEP of the result. On big-endian
3749 targets, the layout of the 128-bit vector in an Advanced SIMD
3750 register would be different from its layout in an SVE register,
3751 but this 128-bit vector is a memory value only. */
3752 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3753 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3754 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3755 return target;
3756 }
3757
3758 if (nelts_per_pattern == 1 && encoded_bits < 128)
3759 {
3760 /* The vector is a repeating sequence of 64 bits or fewer.
3761 See if we can load them using an Advanced SIMD move and then
3762 duplicate it to fill a vector. This is better than using a GPR
3763 move because it keeps everything in the same register file. */
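/* For example, a constant with 16-bit elements that repeats { 1, 2 }
repeats every 32 bits, so it can either be built as a 128-bit Advanced
SIMD constant and duplicated quadword-wise, or (on little-endian) loaded
as the 32-bit integer 0x00020001 and broadcast across the vector. */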
3764 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3765 rtx_vector_builder builder (vq_mode, npatterns, 1);
3766 for (unsigned int i = 0; i < npatterns; ++i)
3767 {
3768 /* We want memory lane N to go into architectural lane N,
3769 so reverse for big-endian targets. The DUP .Q pattern
3770 has a compensating reverse built-in. */
3771 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3772 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3773 }
3774 rtx vq_src = builder.build ();
3775 if (aarch64_simd_valid_immediate (vq_src, NULL))
3776 {
3777 vq_src = force_reg (vq_mode, vq_src);
3778 return aarch64_expand_sve_dupq (target, mode, vq_src);
3779 }
3780
3781 /* Get an integer representation of the repeating part of Advanced
3782 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3783 which for big-endian targets is lane-swapped wrt a normal
3784 Advanced SIMD vector. This means that for both endiannesses,
3785 memory lane N of SVE vector SRC corresponds to architectural
3786 lane N of a register holding VQ_SRC. This in turn means that
3787 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3788 as a single 128-bit value) and thus that memory lane 0 of SRC is
3789 in the lsb of the integer. Duplicating the integer therefore
3790 ensures that memory lane N of SRC goes into architectural lane
3791 N + I * STEP of the SVE register. */
3792 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3793 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3794 if (elt_value)
3795 {
3796 /* Pretend that we had a vector of INT_MODE to start with. */
3797 elt_mode = int_mode;
3798 mode = aarch64_full_sve_mode (int_mode).require ();
3799
3800 /* If the integer can be moved into a general register by a
3801 single instruction, do that and duplicate the result. */
3802 if (CONST_INT_P (elt_value)
3803 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3804 {
3805 elt_value = force_reg (elt_mode, elt_value);
3806 return expand_vector_broadcast (mode, elt_value);
3807 }
3808 }
3809 else if (npatterns == 1)
3810 /* We're duplicating a single value, but can't do better than
3811 force it to memory and load from there. This handles things
3812 like symbolic constants. */
3813 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3814
3815 if (elt_value)
3816 {
3817 /* Load the element from memory if we can, otherwise move it into
3818 a register and use a DUP. */
3819 rtx op = force_const_mem (elt_mode, elt_value);
3820 if (!op)
3821 op = force_reg (elt_mode, elt_value);
3822 return expand_vector_broadcast (mode, op);
3823 }
3824 }
3825
3826 /* Try using INDEX. */
3827 rtx base, step;
3828 if (const_vec_series_p (src, &base, &step))
3829 {
3830 aarch64_expand_vec_series (target, base, step);
3831 return target;
3832 }
3833
3834 /* From here on, it's better to force the whole constant to memory
3835 if we can. */
3836 if (GET_MODE_NUNITS (mode).is_constant ())
3837 return NULL_RTX;
3838
3839 /* Expand each pattern individually. */
3840 gcc_assert (npatterns > 1);
3841 rtx_vector_builder builder;
3842 auto_vec<rtx, 16> vectors (npatterns);
3843 for (unsigned int i = 0; i < npatterns; ++i)
3844 {
3845 builder.new_vector (mode, 1, nelts_per_pattern);
3846 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3847 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3848 vectors.quick_push (force_reg (mode, builder.build ()));
3849 }
3850
3851 /* Use permutes to interleave the separate vectors. */
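/* For example, with four patterns A, B, C and D, the first round
produces { A, C, A, C, ... } and { B, D, B, D, ... }, and the final ZIP1
of those two vectors gives { A, B, C, D, A, B, C, D, ... }. */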
3852 while (npatterns > 1)
3853 {
3854 npatterns /= 2;
3855 for (unsigned int i = 0; i < npatterns; ++i)
3856 {
3857 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3858 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3859 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3860 vectors[i] = tmp;
3861 }
3862 }
3863 gcc_assert (vectors[0] == target);
3864 return target;
3865 }
3866
3867 /* Use WHILE to set a predicate register of mode MODE in which the first
3868 VL bits are set and the rest are clear. Use TARGET for the register
3869 if it's nonnull and convenient. */
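/* For example, MODE == VNx4BI and VL == 3 emits, roughly,
"WHILELO Pd.S, XZR, Xn" with Xn containing 3, which sets the first
three .S lanes of the predicate. */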
3870
3871 static rtx
3872 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3873 unsigned int vl)
3874 {
3875 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3876 target = aarch64_target_reg (target, mode);
3877 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3878 return target;
3879 }
3880
3881 static rtx
3882 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3883
3884 /* BUILDER is a constant predicate in which the index of every set bit
3885 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3886 by inverting every element at a multiple of ELT_SIZE and EORing the
3887 result with an ELT_SIZE PTRUE.
3888
3889 Return a register that contains the constant on success, otherwise
3890 return null. Use TARGET as the register if it is nonnull and
3891 convenient. */
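/* For example, { 0, 1, 1, 1, ... } (everything active except the first
element) inverts to { 1, 0, 0, 0, ... }, which can be loaded with a
single PTRUE with pattern VL1; EORing that with a PTRUE ALL of the same
element size recovers the original constant. */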
3892
3893 static rtx
3894 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3895 unsigned int elt_size)
3896 {
3897 /* Invert every element at a multiple of ELT_SIZE, keeping the
3898 other bits zero. */
3899 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3900 builder.nelts_per_pattern ());
3901 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3902 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3903 inv_builder.quick_push (const1_rtx);
3904 else
3905 inv_builder.quick_push (const0_rtx);
3906 inv_builder.finalize ();
3907
3908 /* See if we can load the constant cheaply. */
3909 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3910 if (!inv)
3911 return NULL_RTX;
3912
3913 /* EOR the result with an ELT_SIZE PTRUE. */
3914 rtx mask = aarch64_ptrue_all (elt_size);
3915 mask = force_reg (VNx16BImode, mask);
3916 target = aarch64_target_reg (target, VNx16BImode);
3917 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3918 return target;
3919 }
3920
3921 /* BUILDER is a constant predicate in which the index of every set bit
3922 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3923 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3924 register on success, otherwise return null. Use TARGET as the register
3925 if nonnull and convenient. */
3926
3927 static rtx
3928 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3929 unsigned int elt_size,
3930 unsigned int permute_size)
3931 {
3932 /* We're going to split the constant into two new constants A and B,
3933 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3934 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3935
3936 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3937 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3938
3939 where _ indicates elements that will be discarded by the permute.
3940
3941 First calculate the ELT_SIZEs for A and B. */
3942 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3943 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3944 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3945 if (INTVAL (builder.elt (i)) != 0)
3946 {
3947 if (i & permute_size)
3948 b_elt_size |= i - permute_size;
3949 else
3950 a_elt_size |= i;
3951 }
3952 a_elt_size &= -a_elt_size;
3953 b_elt_size &= -b_elt_size;
3954
3955 /* Now construct the vectors themselves. */
3956 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3957 builder.nelts_per_pattern ());
3958 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3959 builder.nelts_per_pattern ());
3960 unsigned int nelts = builder.encoded_nelts ();
3961 for (unsigned int i = 0; i < nelts; ++i)
3962 if (i & (elt_size - 1))
3963 {
3964 a_builder.quick_push (const0_rtx);
3965 b_builder.quick_push (const0_rtx);
3966 }
3967 else if ((i & permute_size) == 0)
3968 {
3969 /* The A and B elements are significant. */
3970 a_builder.quick_push (builder.elt (i));
3971 b_builder.quick_push (builder.elt (i + permute_size));
3972 }
3973 else
3974 {
3975 /* The A and B elements are going to be discarded, so pick whatever
3976 is likely to give a nice constant. We are targeting element
3977 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3978 with the aim of each being a sequence of ones followed by
3979 a sequence of zeros. So:
3980
3981 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3982 duplicate the last X_ELT_SIZE element, to extend the
3983 current sequence of ones or zeros.
3984
3985 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3986 zero, so that the constant really does have X_ELT_SIZE and
3987 not a smaller size. */
3988 if (a_elt_size > permute_size)
3989 a_builder.quick_push (const0_rtx);
3990 else
3991 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3992 if (b_elt_size > permute_size)
3993 b_builder.quick_push (const0_rtx);
3994 else
3995 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3996 }
3997 a_builder.finalize ();
3998 b_builder.finalize ();
3999
4000 /* Try loading A into a register. */
4001 rtx_insn *last = get_last_insn ();
4002 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4003 if (!a)
4004 return NULL_RTX;
4005
4006 /* Try loading B into a register. */
4007 rtx b = a;
4008 if (a_builder != b_builder)
4009 {
4010 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4011 if (!b)
4012 {
4013 delete_insns_since (last);
4014 return NULL_RTX;
4015 }
4016 }
4017
4018 /* Emit the TRN1 itself. */
4019 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4020 target = aarch64_target_reg (target, mode);
4021 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4022 gen_lowpart (mode, a),
4023 gen_lowpart (mode, b)));
4024 return target;
4025 }
4026
4027 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4028 constant in BUILDER into an SVE predicate register. Return the register
4029 on success, otherwise return null. Use TARGET for the register if
4030 nonnull and convenient.
4031
4032 ALLOW_RECURSE_P is true if we can use methods that would call this
4033 function recursively. */
4034
4035 static rtx
4036 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4037 bool allow_recurse_p)
4038 {
4039 if (builder.encoded_nelts () == 1)
4040 /* A PFALSE or a PTRUE .B ALL. */
4041 return aarch64_emit_set_immediate (target, builder);
4042
4043 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4044 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4045 {
4046 /* If we can load the constant using PTRUE, use it as-is. */
4047 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4048 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4049 return aarch64_emit_set_immediate (target, builder);
4050
4051 /* Otherwise use WHILE to set the first VL bits. */
4052 return aarch64_sve_move_pred_via_while (target, mode, vl);
4053 }
4054
4055 if (!allow_recurse_p)
4056 return NULL_RTX;
4057
4058 /* Try inverting the vector in element size ELT_SIZE and then EORing
4059 the result with an ELT_SIZE PTRUE. */
4060 if (INTVAL (builder.elt (0)) == 0)
4061 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4062 elt_size))
4063 return res;
4064
4065 /* Try using TRN1 to permute two simpler constants. */
4066 for (unsigned int i = elt_size; i <= 8; i *= 2)
4067 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4068 elt_size, i))
4069 return res;
4070
4071 return NULL_RTX;
4072 }
4073
4074 /* Return an SVE predicate register that contains the VNx16BImode
4075 constant in BUILDER, without going through the move expanders.
4076
4077 The returned register can have whatever mode seems most natural
4078 given the contents of BUILDER. Use TARGET for the result if
4079 convenient. */
4080
4081 static rtx
4082 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4083 {
4084 /* Try loading the constant using pure predicate operations. */
4085 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4086 return res;
4087
4088 /* Try forcing the constant to memory. */
4089 if (builder.full_nelts ().is_constant ())
4090 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4091 {
4092 target = aarch64_target_reg (target, VNx16BImode);
4093 emit_move_insn (target, mem);
4094 return target;
4095 }
4096
4097 /* The last resort is to load the constant as an integer and then
4098 compare it against zero. Use -1 for set bits in order to increase
4099 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4100 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4101 builder.nelts_per_pattern ());
4102 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4103 int_builder.quick_push (INTVAL (builder.elt (i))
4104 ? constm1_rtx : const0_rtx);
4105 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4106 int_builder.build ());
4107 }
4108
4109 /* Set DEST to immediate IMM. */
4110
4111 void
4112 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4113 {
4114 machine_mode mode = GET_MODE (dest);
4115
4116 /* Check what type of symbol it is. */
4117 scalar_int_mode int_mode;
4118 if ((GET_CODE (imm) == SYMBOL_REF
4119 || GET_CODE (imm) == LABEL_REF
4120 || GET_CODE (imm) == CONST
4121 || GET_CODE (imm) == CONST_POLY_INT)
4122 && is_a <scalar_int_mode> (mode, &int_mode))
4123 {
4124 rtx mem;
4125 poly_int64 offset;
4126 HOST_WIDE_INT const_offset;
4127 enum aarch64_symbol_type sty;
4128
4129 /* If we have (const (plus symbol offset)), separate out the offset
4130 before we start classifying the symbol. */
4131 rtx base = strip_offset (imm, &offset);
4132
4133 /* We must always add an offset involving VL separately, rather than
4134 folding it into the relocation. */
4135 if (!offset.is_constant (&const_offset))
4136 {
4137 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4138 emit_insn (gen_rtx_SET (dest, imm));
4139 else
4140 {
4141 /* Do arithmetic on 32-bit values if the result is smaller
4142 than that. */
4143 if (partial_subreg_p (int_mode, SImode))
4144 {
4145 /* It is invalid to do symbol calculations in modes
4146 narrower than SImode. */
4147 gcc_assert (base == const0_rtx);
4148 dest = gen_lowpart (SImode, dest);
4149 int_mode = SImode;
4150 }
4151 if (base != const0_rtx)
4152 {
4153 base = aarch64_force_temporary (int_mode, dest, base);
4154 aarch64_add_offset (int_mode, dest, base, offset,
4155 NULL_RTX, NULL_RTX, false);
4156 }
4157 else
4158 aarch64_add_offset (int_mode, dest, base, offset,
4159 dest, NULL_RTX, false);
4160 }
4161 return;
4162 }
4163
4164 sty = aarch64_classify_symbol (base, const_offset);
4165 switch (sty)
4166 {
4167 case SYMBOL_FORCE_TO_MEM:
4168 if (const_offset != 0
4169 && targetm.cannot_force_const_mem (int_mode, imm))
4170 {
4171 gcc_assert (can_create_pseudo_p ());
4172 base = aarch64_force_temporary (int_mode, dest, base);
4173 aarch64_add_offset (int_mode, dest, base, const_offset,
4174 NULL_RTX, NULL_RTX, false);
4175 return;
4176 }
4177
4178 mem = force_const_mem (ptr_mode, imm);
4179 gcc_assert (mem);
4180
4181 /* If we aren't generating PC relative literals, then
4182 we need to expand the literal pool access carefully.
4183 This is something that needs to be done in a number
4184 of places, so could well live as a separate function. */
4185 if (!aarch64_pcrelative_literal_loads)
4186 {
4187 gcc_assert (can_create_pseudo_p ());
4188 base = gen_reg_rtx (ptr_mode);
4189 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4190 if (ptr_mode != Pmode)
4191 base = convert_memory_address (Pmode, base);
4192 mem = gen_rtx_MEM (ptr_mode, base);
4193 }
4194
4195 if (int_mode != ptr_mode)
4196 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4197
4198 emit_insn (gen_rtx_SET (dest, mem));
4199
4200 return;
4201
4202 case SYMBOL_SMALL_TLSGD:
4203 case SYMBOL_SMALL_TLSDESC:
4204 case SYMBOL_SMALL_TLSIE:
4205 case SYMBOL_SMALL_GOT_28K:
4206 case SYMBOL_SMALL_GOT_4G:
4207 case SYMBOL_TINY_GOT:
4208 case SYMBOL_TINY_TLSIE:
4209 if (const_offset != 0)
4210 {
4211 gcc_assert (can_create_pseudo_p ());
4212 base = aarch64_force_temporary (int_mode, dest, base);
4213 aarch64_add_offset (int_mode, dest, base, const_offset,
4214 NULL_RTX, NULL_RTX, false);
4215 return;
4216 }
4217 /* FALLTHRU */
4218
4219 case SYMBOL_SMALL_ABSOLUTE:
4220 case SYMBOL_TINY_ABSOLUTE:
4221 case SYMBOL_TLSLE12:
4222 case SYMBOL_TLSLE24:
4223 case SYMBOL_TLSLE32:
4224 case SYMBOL_TLSLE48:
4225 aarch64_load_symref_appropriately (dest, imm, sty);
4226 return;
4227
4228 default:
4229 gcc_unreachable ();
4230 }
4231 }
4232
4233 if (!CONST_INT_P (imm))
4234 {
4235 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4236 {
4237 /* Only the low bit of each .H, .S and .D element is defined,
4238 so we can set the upper bits to whatever we like. If the
4239 predicate is all-true in MODE, prefer to set all the undefined
4240 bits as well, so that we can share a single .B predicate for
4241 all modes. */
4242 if (imm == CONSTM1_RTX (mode))
4243 imm = CONSTM1_RTX (VNx16BImode);
4244
4245 /* All methods for constructing predicate modes wider than VNx16BI
4246 will set the upper bits of each element to zero. Expose this
4247 by moving such constants as a VNx16BI, so that all bits are
4248 significant and so that constants for different modes can be
4249 shared. The wider constant will still be available as a
4250 REG_EQUAL note. */
4251 rtx_vector_builder builder;
4252 if (aarch64_get_sve_pred_bits (builder, imm))
4253 {
4254 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4255 if (dest != res)
4256 emit_move_insn (dest, gen_lowpart (mode, res));
4257 return;
4258 }
4259 }
4260
4261 if (GET_CODE (imm) == HIGH
4262 || aarch64_simd_valid_immediate (imm, NULL))
4263 {
4264 emit_insn (gen_rtx_SET (dest, imm));
4265 return;
4266 }
4267
4268 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4269 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4270 {
4271 if (dest != res)
4272 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4273 return;
4274 }
4275
4276 rtx mem = force_const_mem (mode, imm);
4277 gcc_assert (mem);
4278 emit_move_insn (dest, mem);
4279 return;
4280 }
4281
4282 aarch64_internal_mov_immediate (dest, imm, true,
4283 as_a <scalar_int_mode> (mode));
4284 }
4285
4286 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4287 that is known to contain PTRUE. */
4288
4289 void
4290 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4291 {
4292 expand_operand ops[3];
4293 machine_mode mode = GET_MODE (dest);
4294 create_output_operand (&ops[0], dest, mode);
4295 create_input_operand (&ops[1], pred, GET_MODE (pred));
4296 create_input_operand (&ops[2], src, mode);
4297 temporary_volatile_ok v (true);
4298 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4299 }
4300
4301 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4302 operand is in memory. In this case we need to use the predicated LD1
4303 and ST1 instead of LDR and STR, both for correctness on big-endian
4304 targets and because LD1 and ST1 support a wider range of addressing modes.
4305 PRED_MODE is the mode of the predicate.
4306
4307 See the comment at the head of aarch64-sve.md for details about the
4308 big-endian handling. */
4309
4310 void
4311 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4312 {
4313 machine_mode mode = GET_MODE (dest);
4314 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4315 if (!register_operand (src, mode)
4316 && !register_operand (dest, mode))
4317 {
4318 rtx tmp = gen_reg_rtx (mode);
4319 if (MEM_P (src))
4320 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4321 else
4322 emit_move_insn (tmp, src);
4323 src = tmp;
4324 }
4325 aarch64_emit_sve_pred_move (dest, ptrue, src);
4326 }
4327
4328 /* Called only on big-endian targets. See whether an SVE vector move
4329 from SRC to DEST is effectively a REV[BHW] instruction, because at
4330 least one operand is a subreg of an SVE vector that has wider or
4331 narrower elements. Return true and emit the instruction if so.
4332
4333 For example:
4334
4335 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4336
4337 represents a VIEW_CONVERT between the following vectors, viewed
4338 in memory order:
4339
4340 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4341 R1: { [0], [1], [2], [3], ... }
4342
4343 The high part of lane X in R2 should therefore correspond to lane X*2
4344 of R1, but the register representations are:
4345
4346 msb lsb
4347 R2: ...... [1].high [1].low [0].high [0].low
4348 R1: ...... [3] [2] [1] [0]
4349
4350 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4351 We therefore need a reverse operation to swap the high and low values
4352 around.
4353
4354 This is purely an optimization. Without it we would spill the
4355 subreg operand to the stack in one mode and reload it in the
4356 other mode, which has the same effect as the REV. */
4357
4358 bool
4359 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4360 {
4361 gcc_assert (BYTES_BIG_ENDIAN);
4362 if (GET_CODE (dest) == SUBREG)
4363 dest = SUBREG_REG (dest);
4364 if (GET_CODE (src) == SUBREG)
4365 src = SUBREG_REG (src);
4366
4367 /* The optimization handles two single SVE REGs with different element
4368 sizes. */
4369 if (!REG_P (dest)
4370 || !REG_P (src)
4371 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4372 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4373 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4374 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4375 return false;
4376
4377 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4378 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4379 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4380 UNSPEC_REV_SUBREG);
4381 emit_insn (gen_rtx_SET (dest, unspec));
4382 return true;
4383 }
4384
4385 /* Return a copy of X with mode MODE, without changing its other
4386 attributes. Unlike gen_lowpart, this doesn't care whether the
4387 mode change is valid. */
4388
4389 static rtx
4390 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4391 {
4392 if (GET_MODE (x) == mode)
4393 return x;
4394
4395 x = shallow_copy_rtx (x);
4396 set_mode_and_regno (x, mode, REGNO (x));
4397 return x;
4398 }
4399
4400 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4401 stored in wider integer containers. */
4402
4403 static unsigned int
4404 aarch64_sve_rev_unspec (machine_mode mode)
4405 {
4406 switch (GET_MODE_UNIT_SIZE (mode))
4407 {
4408 case 1: return UNSPEC_REVB;
4409 case 2: return UNSPEC_REVH;
4410 case 4: return UNSPEC_REVW;
4411 }
4412 gcc_unreachable ();
4413 }
4414
4415 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4416 operands. */
4417
4418 void
4419 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4420 {
4421 /* Decide which REV operation we need. The mode with wider elements
4422 determines the mode of the operands and the mode with the narrower
4423 elements determines the reverse width. */
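/* For example, a move between a VNx8HI destination and a VNx16QI source
uses VNx8HI as the wider-element mode and emits "REVB Zd.H, Pg/M, Zn.H",
which swaps the two bytes within each halfword. */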
4424 machine_mode mode_with_wider_elts = GET_MODE (dest);
4425 machine_mode mode_with_narrower_elts = GET_MODE (src);
4426 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4427 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4428 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4429
4430 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4431 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4432 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4433
4434 /* Get the operands in the appropriate modes and emit the instruction. */
4435 ptrue = gen_lowpart (pred_mode, ptrue);
4436 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4437 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4438 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4439 dest, ptrue, src));
4440 }
4441
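/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */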
4442 static bool
4443 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4444 tree exp ATTRIBUTE_UNUSED)
4445 {
4446 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4447 return false;
4448
4449 return true;
4450 }
4451
4452 /* Implement TARGET_PASS_BY_REFERENCE. */
4453
4454 static bool
4455 aarch64_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
4456 {
4457 HOST_WIDE_INT size;
4458 machine_mode dummymode;
4459 int nregs;
4460
4461 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4462 if (arg.mode == BLKmode && arg.type)
4463 size = int_size_in_bytes (arg.type);
4464 else
4465 /* No frontends can create types with variable-sized modes, so we
4466 shouldn't be asked to pass or return them. */
4467 size = GET_MODE_SIZE (arg.mode).to_constant ();
4468
4469 /* Aggregates are passed by reference based on their size. */
4470 if (arg.aggregate_type_p ())
4471 size = int_size_in_bytes (arg.type);
4472
4473 /* Variable-sized arguments are always passed by reference. */
4474 if (size < 0)
4475 return true;
4476
4477 /* Can this be a candidate to be passed in fp/simd register(s)? */
4478 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4479 &dummymode, &nregs,
4480 NULL))
4481 return false;
4482
4483 /* Arguments which are variable sized or larger than 2 registers are
4484 passed by reference unless they are a homogeneous floating
4485 aggregate. */
4486 return size > 2 * UNITS_PER_WORD;
4487 }
4488
4489 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4490 static bool
4491 aarch64_return_in_msb (const_tree valtype)
4492 {
4493 machine_mode dummy_mode;
4494 int dummy_int;
4495
4496 /* Never happens in little-endian mode. */
4497 if (!BYTES_BIG_ENDIAN)
4498 return false;
4499
4500 /* Only composite types smaller than or equal to 16 bytes can
4501 be potentially returned in registers. */
4502 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4503 || int_size_in_bytes (valtype) <= 0
4504 || int_size_in_bytes (valtype) > 16)
4505 return false;
4506
4507 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4508 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4509 is always passed/returned in the least significant bits of fp/simd
4510 register(s). */
4511 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4512 &dummy_mode, &dummy_int, NULL))
4513 return false;
4514
4515 return true;
4516 }
4517
4518 /* Implement TARGET_FUNCTION_VALUE.
4519 Define how to find the value returned by a function. */
4520
4521 static rtx
4522 aarch64_function_value (const_tree type, const_tree func,
4523 bool outgoing ATTRIBUTE_UNUSED)
4524 {
4525 machine_mode mode;
4526 int unsignedp;
4527 int count;
4528 machine_mode ag_mode;
4529
4530 mode = TYPE_MODE (type);
4531 if (INTEGRAL_TYPE_P (type))
4532 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4533
4534 if (aarch64_return_in_msb (type))
4535 {
4536 HOST_WIDE_INT size = int_size_in_bytes (type);
4537
4538 if (size % UNITS_PER_WORD != 0)
4539 {
4540 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4541 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4542 }
4543 }
4544
4545 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4546 &ag_mode, &count, NULL))
4547 {
4548 if (!aarch64_composite_type_p (type, mode))
4549 {
4550 gcc_assert (count == 1 && mode == ag_mode);
4551 return gen_rtx_REG (mode, V0_REGNUM);
4552 }
4553 else
4554 {
4555 int i;
4556 rtx par;
4557
4558 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4559 for (i = 0; i < count; i++)
4560 {
4561 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4562 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4563 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4564 XVECEXP (par, 0, i) = tmp;
4565 }
4566 return par;
4567 }
4568 }
4569 else
4570 return gen_rtx_REG (mode, R0_REGNUM);
4571 }
4572
4573 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4574 Return true if REGNO is the number of a hard register in which the values
4575 of called function may come back. */
4576
4577 static bool
4578 aarch64_function_value_regno_p (const unsigned int regno)
4579 {
4580 /* Maximum of 16 bytes can be returned in the general registers. Examples
4581 of 16-byte return values are: 128-bit integers and 16-byte small
4582 structures (excluding homogeneous floating-point aggregates). */
4583 if (regno == R0_REGNUM || regno == R1_REGNUM)
4584 return true;
4585
4586 /* Up to four fp/simd registers can return a function value, e.g. a
4587 homogeneous floating-point aggregate having four members. */
4588 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4589 return TARGET_FLOAT;
4590
4591 return false;
4592 }
4593
4594 /* Implement TARGET_RETURN_IN_MEMORY.
4595
4596 If the type T of the result of a function is such that
4597 void func (T arg)
4598 would require that arg be passed as a value in a register (or set of
4599 registers) according to the parameter passing rules, then the result
4600 is returned in the same registers as would be used for such an
4601 argument. */
4602
4603 static bool
4604 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4605 {
4606 HOST_WIDE_INT size;
4607 machine_mode ag_mode;
4608 int count;
4609
4610 if (!AGGREGATE_TYPE_P (type)
4611 && TREE_CODE (type) != COMPLEX_TYPE
4612 && TREE_CODE (type) != VECTOR_TYPE)
4613 /* Simple scalar types are always returned in registers. */
4614 return false;
4615
4616 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4617 type,
4618 &ag_mode,
4619 &count,
4620 NULL))
4621 return false;
4622
4623 /* Types larger than 2 registers are returned in memory. */
4624 size = int_size_in_bytes (type);
4625 return (size < 0 || size > 2 * UNITS_PER_WORD);
4626 }
4627
4628 static bool
4629 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4630 const_tree type, int *nregs)
4631 {
4632 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4633 return aarch64_vfp_is_call_or_return_candidate (mode,
4634 type,
4635 &pcum->aapcs_vfp_rmode,
4636 nregs,
4637 NULL);
4638 }
4639
4640 /* Given MODE and TYPE of a function argument, return the alignment in
4641 bits. The idea is to suppress any stronger alignment requested by
4642 the user and opt for the natural alignment (specified in AAPCS64 \S
4643 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4644 calculated in versions of GCC prior to GCC-9. This is a helper
4645 function for local use only. */
4646
4647 static unsigned int
4648 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4649 bool *abi_break)
4650 {
4651 *abi_break = false;
4652 if (!type)
4653 return GET_MODE_ALIGNMENT (mode);
4654
4655 if (integer_zerop (TYPE_SIZE (type)))
4656 return 0;
4657
4658 gcc_assert (TYPE_MODE (type) == mode);
4659
4660 if (!AGGREGATE_TYPE_P (type))
4661 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4662
4663 if (TREE_CODE (type) == ARRAY_TYPE)
4664 return TYPE_ALIGN (TREE_TYPE (type));
4665
4666 unsigned int alignment = 0;
4667 unsigned int bitfield_alignment = 0;
4668 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4669 if (TREE_CODE (field) == FIELD_DECL)
4670 {
4671 alignment = std::max (alignment, DECL_ALIGN (field));
4672 if (DECL_BIT_FIELD_TYPE (field))
4673 bitfield_alignment
4674 = std::max (bitfield_alignment,
4675 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4676 }
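/* For example, a bit-field declared with type __int128 contributes the
128-bit alignment of its declared type here, even though the FIELD_DECL
itself may have a smaller alignment; when that exceeds the alignment of
every ordinary field, the callers emit the GCC 9.1 -Wpsabi note. */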
4677
4678 if (bitfield_alignment > alignment)
4679 {
4680 *abi_break = true;
4681 return bitfield_alignment;
4682 }
4683
4684 return alignment;
4685 }
4686
4687 /* Layout a function argument according to the AAPCS64 rules. The rule
4688 numbers refer to the corresponding rules in the AAPCS64. */
4689
4690 static void
4691 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4692 const_tree type,
4693 bool named ATTRIBUTE_UNUSED)
4694 {
4695 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4696 int ncrn, nvrn, nregs;
4697 bool allocate_ncrn, allocate_nvrn;
4698 HOST_WIDE_INT size;
4699 bool abi_break;
4700
4701 /* We need to do this once per argument. */
4702 if (pcum->aapcs_arg_processed)
4703 return;
4704
4705 pcum->aapcs_arg_processed = true;
4706
4707 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
4708 if (type)
4709 size = int_size_in_bytes (type);
4710 else
4711 /* No frontends can create types with variable-sized modes, so we
4712 shouldn't be asked to pass or return them. */
4713 size = GET_MODE_SIZE (mode).to_constant ();
4714 size = ROUND_UP (size, UNITS_PER_WORD);
4715
4716 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4717 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4718 mode,
4719 type,
4720 &nregs);
4721
4722 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
4723 The following code thus handles passing by SIMD/FP registers first. */
4724
4725 nvrn = pcum->aapcs_nvrn;
4726
4727 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4728 and homogeneous short-vector aggregates (HVA). */
4729 if (allocate_nvrn)
4730 {
4731 if (!TARGET_FLOAT)
4732 aarch64_err_no_fpadvsimd (mode);
4733
4734 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4735 {
4736 pcum->aapcs_nextnvrn = nvrn + nregs;
4737 if (!aarch64_composite_type_p (type, mode))
4738 {
4739 gcc_assert (nregs == 1);
4740 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4741 }
4742 else
4743 {
4744 rtx par;
4745 int i;
4746 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4747 for (i = 0; i < nregs; i++)
4748 {
4749 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4750 V0_REGNUM + nvrn + i);
4751 rtx offset = gen_int_mode
4752 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4753 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4754 XVECEXP (par, 0, i) = tmp;
4755 }
4756 pcum->aapcs_reg = par;
4757 }
4758 return;
4759 }
4760 else
4761 {
4762 /* C.3 NSRN is set to 8. */
4763 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4764 goto on_stack;
4765 }
4766 }
4767
4768 ncrn = pcum->aapcs_ncrn;
4769 nregs = size / UNITS_PER_WORD;
4770
4771 /* C6 - C9, though the sign and zero extension semantics are
4772 handled elsewhere. This is the case where the argument fits
4773 entirely in general registers. */
4774 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4775 {
4776 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4777
4778 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
4779 rounded up to the next even number. */
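/* For example, after a single int argument has consumed w0, a following
__int128 argument (16-byte alignment, two registers) skips x1 and is
passed in x2 and x3. */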
4780 if (nregs == 2
4781 && ncrn % 2
4782 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4783 comparison is there because for > 16 * BITS_PER_UNIT
4784 alignment nregs should be > 2 and therefore it should be
4785 passed by reference rather than value. */
4786 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4787 == 16 * BITS_PER_UNIT))
4788 {
4789 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4790 inform (input_location, "parameter passing for argument of type "
4791 "%qT changed in GCC 9.1", type);
4792 ++ncrn;
4793 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4794 }
4795
4796 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4797 A reg is still generated for it, but the caller should be smart
4798 enough not to use it. */
4799 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4800 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4801 else
4802 {
4803 rtx par;
4804 int i;
4805
4806 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4807 for (i = 0; i < nregs; i++)
4808 {
4809 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4810 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4811 GEN_INT (i * UNITS_PER_WORD));
4812 XVECEXP (par, 0, i) = tmp;
4813 }
4814 pcum->aapcs_reg = par;
4815 }
4816
4817 pcum->aapcs_nextncrn = ncrn + nregs;
4818 return;
4819 }
4820
4821 /* C.11 */
4822 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4823
4824 /* The argument is passed on the stack; record the needed number of words for
4825 this argument and align the total size if necessary. */
4826 on_stack:
4827 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4828
4829 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4830 == 16 * BITS_PER_UNIT)
4831 {
4832 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4833 if (pcum->aapcs_stack_size != new_size)
4834 {
4835 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4836 inform (input_location, "parameter passing for argument of type "
4837 "%qT changed in GCC 9.1", type);
4838 pcum->aapcs_stack_size = new_size;
4839 }
4840 }
4841 return;
4842 }
4843
4844 /* Implement TARGET_FUNCTION_ARG. */
4845
4846 static rtx
4847 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
4848 {
4849 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4850 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4851
4852 if (arg.end_marker_p ())
4853 return NULL_RTX;
4854
4855 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4856 return pcum->aapcs_reg;
4857 }
4858
4859 void
4860 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4861 const_tree fntype ATTRIBUTE_UNUSED,
4862 rtx libname ATTRIBUTE_UNUSED,
4863 const_tree fndecl ATTRIBUTE_UNUSED,
4864 unsigned n_named ATTRIBUTE_UNUSED)
4865 {
4866 pcum->aapcs_ncrn = 0;
4867 pcum->aapcs_nvrn = 0;
4868 pcum->aapcs_nextncrn = 0;
4869 pcum->aapcs_nextnvrn = 0;
4870 pcum->pcs_variant = ARM_PCS_AAPCS64;
4871 pcum->aapcs_reg = NULL_RTX;
4872 pcum->aapcs_arg_processed = false;
4873 pcum->aapcs_stack_words = 0;
4874 pcum->aapcs_stack_size = 0;
4875
4876 if (!TARGET_FLOAT
4877 && fndecl && TREE_PUBLIC (fndecl)
4878 && fntype && fntype != error_mark_node)
4879 {
4880 const_tree type = TREE_TYPE (fntype);
4881 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4882 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4883 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4884 &mode, &nregs, NULL))
4885 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4886 }
4887 return;
4888 }
4889
4890 static void
4891 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4892 const function_arg_info &arg)
4893 {
4894 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4895 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4896 {
4897 aarch64_layout_arg (pcum_v, arg.mode, arg.type, arg.named);
4898 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4899 != (pcum->aapcs_stack_words != 0));
4900 pcum->aapcs_arg_processed = false;
4901 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4902 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4903 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4904 pcum->aapcs_stack_words = 0;
4905 pcum->aapcs_reg = NULL_RTX;
4906 }
4907 }
4908
4909 bool
4910 aarch64_function_arg_regno_p (unsigned regno)
4911 {
4912 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4913 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4914 }
4915
4916 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4917 PARM_BOUNDARY bits of alignment, but will be given anything up
4918 to STACK_BOUNDARY bits if the type requires it. This makes sure
4919 that both before and after the layout of each argument, the Next
4920 Stacked Argument Address (NSAA) will have a minimum alignment of
4921 8 bytes. */
4922
4923 static unsigned int
4924 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4925 {
4926 bool abi_break;
4927 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4928 &abi_break);
4929 if (abi_break && warn_psabi)
4930 inform (input_location, "parameter passing for argument of type "
4931 "%qT changed in GCC 9.1", type);
4932
4933 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4934 }
4935
4936 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4937
4938 static fixed_size_mode
4939 aarch64_get_reg_raw_mode (int regno)
4940 {
4941 if (TARGET_SVE && FP_REGNUM_P (regno))
4942 /* Don't use the SVE part of the register for __builtin_apply and
4943 __builtin_return. The SVE registers aren't used by the normal PCS,
4944 so using them there would be a waste of time. The PCS extensions
4945 for SVE types are fundamentally incompatible with the
4946 __builtin_return/__builtin_apply interface. */
4947 return as_a <fixed_size_mode> (V16QImode);
4948 return default_get_reg_raw_mode (regno);
4949 }
4950
4951 /* Implement TARGET_FUNCTION_ARG_PADDING.
4952
4953 Small aggregate types are placed in the lowest memory address.
4954
4955 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4956
4957 static pad_direction
4958 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4959 {
4960 /* On little-endian targets, the least significant byte of every stack
4961 argument is passed at the lowest byte address of the stack slot. */
4962 if (!BYTES_BIG_ENDIAN)
4963 return PAD_UPWARD;
4964
4965 /* Otherwise, integral, floating-point and pointer types are padded downward:
4966 the least significant byte of a stack argument is passed at the highest
4967 byte address of the stack slot. */
4968 if (type
4969 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4970 || POINTER_TYPE_P (type))
4971 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4972 return PAD_DOWNWARD;
4973
4974 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4975 return PAD_UPWARD;
4976 }
4977
4978 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4979
4980 It specifies padding for the last (possibly the only)
4981 element of a block move between registers and memory. Assuming
4982 the block is in memory, padding upward means that the last
4983 element is padded after its most significant byte, while with
4984 downward padding the last element is padded at its least
4985 significant byte side.
4986
4987 Small aggregates and small complex types are always padded
4988 upwards.
4989
4990 We don't need to worry about homogeneous floating-point or
4991 short-vector aggregates; their move is not affected by the
4992 padding direction determined here. Regardless of endianness,
4993 each element of such an aggregate is put in the least
4994 significant bits of a fp/simd register.
4995
4996 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4997 register has useful data, and return the opposite if the most
4998 significant byte does. */
4999
5000 bool
5001 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5002 bool first ATTRIBUTE_UNUSED)
5003 {
5004
5005 /* Small composite types are always padded upward. */
5006 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5007 {
5008 HOST_WIDE_INT size;
5009 if (type)
5010 size = int_size_in_bytes (type);
5011 else
5012 /* No frontends can create types with variable-sized modes, so we
5013 shouldn't be asked to pass or return them. */
5014 size = GET_MODE_SIZE (mode).to_constant ();
5015 if (size < 2 * UNITS_PER_WORD)
5016 return true;
5017 }
5018
5019 /* Otherwise, use the default padding. */
5020 return !BYTES_BIG_ENDIAN;
5021 }
5022
5023 static scalar_int_mode
5024 aarch64_libgcc_cmp_return_mode (void)
5025 {
5026 return SImode;
5027 }
5028
5029 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5030
5031 /* We use the 12-bit shifted immediate arithmetic instructions so values
5032 must be a multiple of (1 << 12), i.e. 4096. */
5033 #define ARITH_FACTOR 4096
5034
5035 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5036 #error Cannot use simple address calculation for stack probing
5037 #endif
5038
5039 /* The pair of scratch registers used for stack probing. */
5040 #define PROBE_STACK_FIRST_REG R9_REGNUM
5041 #define PROBE_STACK_SECOND_REG R10_REGNUM
5042
5043 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5044 inclusive. These are offsets from the current stack pointer. */
5045
5046 static void
5047 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5048 {
5049 HOST_WIDE_INT size;
5050 if (!poly_size.is_constant (&size))
5051 {
5052 sorry ("stack probes for SVE frames");
5053 return;
5054 }
5055
5056 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5057
5058 /* See the same assertion on PROBE_INTERVAL above. */
5059 gcc_assert ((first % ARITH_FACTOR) == 0);
5060
5061 /* See if we have a constant small number of probes to generate. If so,
5062 that's the easy case. */
5063 if (size <= PROBE_INTERVAL)
5064 {
5065 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5066
5067 emit_set_insn (reg1,
5068 plus_constant (Pmode,
5069 stack_pointer_rtx, -(first + base)));
5070 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5071 }
5072
5073 /* The run-time loop is made up of 8 insns in the generic case while the
5074 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
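/* For example, with the default 4 KiB PROBE_INTERVAL, a SIZE of 10 KiB
emits probes at FIRST + 4096, FIRST + 8192 and FIRST + 10240 below the
stack pointer. */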
5075 else if (size <= 4 * PROBE_INTERVAL)
5076 {
5077 HOST_WIDE_INT i, rem;
5078
5079 emit_set_insn (reg1,
5080 plus_constant (Pmode,
5081 stack_pointer_rtx,
5082 -(first + PROBE_INTERVAL)));
5083 emit_stack_probe (reg1);
5084
5085 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5086 it exceeds SIZE. If only two probes are needed, this will not
5087 generate any code. Then probe at FIRST + SIZE. */
5088 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5089 {
5090 emit_set_insn (reg1,
5091 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5092 emit_stack_probe (reg1);
5093 }
5094
5095 rem = size - (i - PROBE_INTERVAL);
5096 if (rem > 256)
5097 {
5098 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5099
5100 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5101 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5102 }
5103 else
5104 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5105 }
5106
5107 /* Otherwise, do the same as above, but in a loop. Note that we must be
5108 extra careful with variables wrapping around because we might be at
5109 the very top (or the very bottom) of the address space and we have
5110 to be able to handle this case properly; in particular, we use an
5111 equality test for the loop condition. */
5112 else
5113 {
5114 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5115
5116 /* Step 1: round SIZE to the previous multiple of the interval. */
5117
5118 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5119
5120
5121 /* Step 2: compute initial and final value of the loop counter. */
5122
5123 /* TEST_ADDR = SP + FIRST. */
5124 emit_set_insn (reg1,
5125 plus_constant (Pmode, stack_pointer_rtx, -first));
5126
5127 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5128 HOST_WIDE_INT adjustment = - (first + rounded_size);
5129 if (! aarch64_uimm12_shift (adjustment))
5130 {
5131 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5132 true, Pmode);
5133 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5134 }
5135 else
5136 emit_set_insn (reg2,
5137 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5138
5139 /* Step 3: the loop
5140
5141 do
5142 {
5143 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5144 probe at TEST_ADDR
5145 }
5146 while (TEST_ADDR != LAST_ADDR)
5147
5148 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5149 until it is equal to ROUNDED_SIZE. */
5150
5151 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5152
5153
5154 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5155 that SIZE is equal to ROUNDED_SIZE. */
5156
5157 if (size != rounded_size)
5158 {
5159 HOST_WIDE_INT rem = size - rounded_size;
5160
5161 if (rem > 256)
5162 {
5163 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5164
5165 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5166 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5167 }
5168 else
5169 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5170 }
5171 }
5172
5173 /* Make sure nothing is scheduled before we are done. */
5174 emit_insn (gen_blockage ());
5175 }
5176
5177 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5178 absolute addresses. */
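/* With the default 4 KiB interval and no stack clash protection the
emitted loop is roughly:

.LPSRL0: sub reg1, reg1, #4096
str xzr, [reg1]
cmp reg1, reg2
b.ne .LPSRL0 */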
5179
5180 const char *
5181 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5182 {
5183 static int labelno = 0;
5184 char loop_lab[32];
5185 rtx xops[2];
5186
5187 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5188
5189 /* Loop. */
5190 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5191
5192 HOST_WIDE_INT stack_clash_probe_interval
5193 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5194
5195 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5196 xops[0] = reg1;
5197 HOST_WIDE_INT interval;
5198 if (flag_stack_clash_protection)
5199 interval = stack_clash_probe_interval;
5200 else
5201 interval = PROBE_INTERVAL;
5202
5203 gcc_assert (aarch64_uimm12_shift (interval));
5204 xops[1] = GEN_INT (interval);
5205
5206 output_asm_insn ("sub\t%0, %0, %1", xops);
5207
5208 /* If doing stack clash protection then we probe up by the ABI specified
5209 amount. We do this because we're dropping full pages at a time in the
5210 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5211 if (flag_stack_clash_protection)
5212 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5213 else
5214 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5215
5216 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5217 by this amount for each iteration. */
5218 output_asm_insn ("str\txzr, [%0, %1]", xops);
5219
5220 /* Test if TEST_ADDR == LAST_ADDR. */
5221 xops[1] = reg2;
5222 output_asm_insn ("cmp\t%0, %1", xops);
5223
5224 /* Branch. */
5225 fputs ("\tb.ne\t", asm_out_file);
5226 assemble_name_raw (asm_out_file, loop_lab);
5227 fputc ('\n', asm_out_file);
5228
5229 return "";
5230 }
5231
5232 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5233 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5234 of GUARD_SIZE. When a probe is emitted it is done at most
5235 MIN_PROBE_THRESHOLD bytes from the current BASE, and successive probes
5236 are at most MIN_PROBE_THRESHOLD bytes apart. By the end of this function
5237 BASE = BASE - ADJUSTMENT. */
5238
5239 const char *
5240 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5241 rtx min_probe_threshold, rtx guard_size)
5242 {
5243 /* This function is not allowed to use any instruction generation function
5244 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5245 so instead emit the code you want using output_asm_insn. */
5246 gcc_assert (flag_stack_clash_protection);
5247 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5248 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5249
5250 /* The minimum required allocation before the residual requires probing. */
5251 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5252
5253 /* Clamp the value down to the nearest value that can be used with a cmp. */
5254 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5255 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5256
5257 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5258 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5259
5260 static int labelno = 0;
5261 char loop_start_lab[32];
5262 char loop_end_lab[32];
5263 rtx xops[2];
5264
5265 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5266 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5267
5268 /* Emit loop start label. */
5269 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5270
5271 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5272 xops[0] = adjustment;
5273 xops[1] = probe_offset_value_rtx;
5274 output_asm_insn ("cmp\t%0, %1", xops);
5275
5276 /* Branch to end if not enough adjustment to probe. */
5277 fputs ("\tb.lt\t", asm_out_file);
5278 assemble_name_raw (asm_out_file, loop_end_lab);
5279 fputc ('\n', asm_out_file);
5280
5281 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5282 xops[0] = base;
5283 xops[1] = probe_offset_value_rtx;
5284 output_asm_insn ("sub\t%0, %0, %1", xops);
5285
5286 /* Probe at BASE. */
5287 xops[1] = const0_rtx;
5288 output_asm_insn ("str\txzr, [%0, %1]", xops);
5289
5290 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5291 xops[0] = adjustment;
5292 xops[1] = probe_offset_value_rtx;
5293 output_asm_insn ("sub\t%0, %0, %1", xops);
5294
5295 /* Branch to start if still more bytes to allocate. */
5296 fputs ("\tb\t", asm_out_file);
5297 assemble_name_raw (asm_out_file, loop_start_lab);
5298 fputc ('\n', asm_out_file);
5299
5300 /* No probe needed; exit the loop. */
5301 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5302
5303 /* BASE = BASE - ADJUSTMENT. */
5304 xops[0] = base;
5305 xops[1] = adjustment;
5306 output_asm_insn ("sub\t%0, %0, %1", xops);
5307 return "";
5308 }
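/* A rough picture of the sequence produced above, with <base>, <adj> and
   <guard> standing in for BASE, ADJUSTMENT and the clamped
   RESIDUAL_PROBE_GUARD (register names and label numbers are illustrative):

	.SVLPSPL0:
	cmp	<adj>, <guard>
	b.lt	.SVLPEND0
	sub	<base>, <base>, <guard>
	str	xzr, [<base>, 0]
	sub	<adj>, <adj>, <guard>
	b	.SVLPSPL0
	.SVLPEND0:
	sub	<base>, <base>, <adj>  */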
5309
5310 /* Determine whether a frame chain needs to be generated. */
5311 static bool
5312 aarch64_needs_frame_chain (void)
5313 {
5314 /* Force a frame chain for EH returns so the return address is at FP+8. */
5315 if (frame_pointer_needed || crtl->calls_eh_return)
5316 return true;
5317
5318 /* A leaf function cannot have calls or write LR. */
5319 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5320
5321 /* Don't use a frame chain in leaf functions if leaf frame pointers
5322 are disabled. */
5323 if (flag_omit_leaf_frame_pointer && is_leaf)
5324 return false;
5325
5326 return aarch64_use_frame_pointer;
5327 }
5328
5329 /* Mark the registers that need to be saved by the callee and calculate
5330 the size of the callee-saved registers area and frame record (both FP
5331 and LR may be omitted). */
5332 static void
5333 aarch64_layout_frame (void)
5334 {
5335 HOST_WIDE_INT offset = 0;
5336 int regno, last_fp_reg = INVALID_REGNUM;
5337 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5338
5339 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5340
5341 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5342 the mid-end is doing. */
5343 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5344
5345 #define SLOT_NOT_REQUIRED (-2)
5346 #define SLOT_REQUIRED (-1)
5347
5348 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5349 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5350
5351 /* If this is a non-leaf simd function with calls we assume that
5352 at least one of those calls is to a non-simd function and thus
5353 we must save V8 to V23 in the prologue. */
5354
5355 if (simd_function && !crtl->is_leaf)
5356 {
5357 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5358 if (FP_SIMD_SAVED_REGNUM_P (regno))
5359 df_set_regs_ever_live (regno, true);
5360 }
5361
5362 /* First mark all the registers that really need to be saved... */
5363 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5364 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5365
5366 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5367 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5368
5369 /* ... that includes the eh data registers (if needed)... */
5370 if (crtl->calls_eh_return)
5371 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5372 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5373 = SLOT_REQUIRED;
5374
5375 /* ... and any callee saved register that dataflow says is live. */
5376 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5377 if (df_regs_ever_live_p (regno)
5378 && (regno == R30_REGNUM
5379 || !call_used_or_fixed_reg_p (regno)))
5380 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5381
5382 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5383 if (df_regs_ever_live_p (regno)
5384 && (!call_used_or_fixed_reg_p (regno)
5385 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5386 {
5387 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5388 last_fp_reg = regno;
5389 }
5390
5391 if (cfun->machine->frame.emit_frame_chain)
5392 {
5393 /* FP and LR are placed in the linkage record. */
5394 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5395 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5396 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5397 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5398 offset = 2 * UNITS_PER_WORD;
5399 }
5400
5401 /* With stack-clash, LR must be saved in non-leaf functions. */
5402 gcc_assert (crtl->is_leaf
5403 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5404 != SLOT_NOT_REQUIRED));
5405
5406 /* Now assign stack slots for them. */
5407 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5408 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5409 {
5410 cfun->machine->frame.reg_offset[regno] = offset;
5411 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5412 cfun->machine->frame.wb_candidate1 = regno;
5413 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5414 cfun->machine->frame.wb_candidate2 = regno;
5415 offset += UNITS_PER_WORD;
5416 }
5417
5418 HOST_WIDE_INT max_int_offset = offset;
5419 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5420 bool has_align_gap = offset != max_int_offset;
5421
5422 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5423 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5424 {
5425 /* If there is an alignment gap between integer and fp callee-saves,
5426 allocate the last fp register to it if possible. */
5427 if (regno == last_fp_reg
5428 && has_align_gap
5429 && !simd_function
5430 && (offset & 8) == 0)
5431 {
5432 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5433 break;
5434 }
5435
5436 cfun->machine->frame.reg_offset[regno] = offset;
5437 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5438 cfun->machine->frame.wb_candidate1 = regno;
5439 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5440 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5441 cfun->machine->frame.wb_candidate2 = regno;
5442 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5443 }
5444
5445 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5446
5447 cfun->machine->frame.saved_regs_size = offset;
5448
5449 HOST_WIDE_INT varargs_and_saved_regs_size
5450 = offset + cfun->machine->frame.saved_varargs_size;
5451
5452 cfun->machine->frame.hard_fp_offset
5453 = aligned_upper_bound (varargs_and_saved_regs_size
5454 + get_frame_size (),
5455 STACK_BOUNDARY / BITS_PER_UNIT);
5456
5457 /* Both these values are already aligned. */
5458 gcc_assert (multiple_p (crtl->outgoing_args_size,
5459 STACK_BOUNDARY / BITS_PER_UNIT));
5460 cfun->machine->frame.frame_size
5461 = (cfun->machine->frame.hard_fp_offset
5462 + crtl->outgoing_args_size);
5463
5464 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5465
5466 cfun->machine->frame.initial_adjust = 0;
5467 cfun->machine->frame.final_adjust = 0;
5468 cfun->machine->frame.callee_adjust = 0;
5469 cfun->machine->frame.callee_offset = 0;
5470
5471 HOST_WIDE_INT max_push_offset = 0;
5472 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5473 max_push_offset = 512;
5474 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5475 max_push_offset = 256;
5476
5477 HOST_WIDE_INT const_size, const_fp_offset;
5478 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5479 && const_size < max_push_offset
5480 && known_eq (crtl->outgoing_args_size, 0))
5481 {
5482 /* Simple, small frame with no outgoing arguments:
5483 stp reg1, reg2, [sp, -frame_size]!
5484 stp reg3, reg4, [sp, 16] */
5485 cfun->machine->frame.callee_adjust = const_size;
5486 }
5487 else if (known_lt (crtl->outgoing_args_size
5488 + cfun->machine->frame.saved_regs_size, 512)
5489 && !(cfun->calls_alloca
5490 && known_lt (cfun->machine->frame.hard_fp_offset,
5491 max_push_offset)))
5492 {
5493 /* Frame with small outgoing arguments:
5494 sub sp, sp, frame_size
5495 stp reg1, reg2, [sp, outgoing_args_size]
5496 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5497 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5498 cfun->machine->frame.callee_offset
5499 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5500 }
5501 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5502 && const_fp_offset < max_push_offset)
5503 {
5504 /* Frame with large outgoing arguments but a small local area:
5505 stp reg1, reg2, [sp, -hard_fp_offset]!
5506 stp reg3, reg4, [sp, 16]
5507 sub sp, sp, outgoing_args_size */
5508 cfun->machine->frame.callee_adjust = const_fp_offset;
5509 cfun->machine->frame.final_adjust
5510 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5511 }
5512 else
5513 {
5514 /* Frame with large local area and outgoing arguments using frame pointer:
5515 sub sp, sp, hard_fp_offset
5516 stp x29, x30, [sp, 0]
5517 add x29, sp, 0
5518 stp reg3, reg4, [sp, 16]
5519 sub sp, sp, outgoing_args_size */
5520 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5521 cfun->machine->frame.final_adjust
5522 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5523 }
5524
5525 cfun->machine->frame.laid_out = true;
5526 }
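/* A worked example, for illustration only: a function with 40 bytes of
   locals, no outgoing arguments, an emitted frame chain and one extra
   callee-save (say x19) gets reg_offset[R29] = 0, reg_offset[R30] = 8 and
   reg_offset[x19] = 16, so saved_regs_size = 32, hard_fp_offset
   = ROUND_UP (32 + 40, 16) = 80 and frame_size = 80. Since 80 < 512 and
   there are no outgoing arguments, the first strategy above is chosen and
   callee_adjust is set to 80. */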
5527
5528 /* Return true if the register REGNO is saved on entry to
5529 the current function. */
5530
5531 static bool
5532 aarch64_register_saved_on_entry (int regno)
5533 {
5534 return cfun->machine->frame.reg_offset[regno] >= 0;
5535 }
5536
5537 /* Return the next register at or above REGNO, up to LIMIT, that the callee
5538 needs to save. */
5539
5540 static unsigned
5541 aarch64_next_callee_save (unsigned regno, unsigned limit)
5542 {
5543 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5544 regno ++;
5545 return regno;
5546 }
5547
5548 /* Push the register number REGNO of mode MODE to the stack with write-back
5549 adjusting the stack by ADJUSTMENT. */
5550
5551 static void
5552 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5553 HOST_WIDE_INT adjustment)
5554 {
5555 rtx base_rtx = stack_pointer_rtx;
5556 rtx insn, reg, mem;
5557
5558 reg = gen_rtx_REG (mode, regno);
5559 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5560 plus_constant (Pmode, base_rtx, -adjustment));
5561 mem = gen_frame_mem (mode, mem);
5562
5563 insn = emit_move_insn (mem, reg);
5564 RTX_FRAME_RELATED_P (insn) = 1;
5565 }
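/* For example (illustrative only): pushing x30 with an ADJUSTMENT of 16
   corresponds to the pre-indexed store "str x30, [sp, -16]!". */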
5566
5567 /* Generate and return an instruction to store the pair of registers
5568 REG and REG2 of mode MODE to location BASE with write-back adjusting
5569 the stack location BASE by ADJUSTMENT. */
5570
5571 static rtx
5572 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5573 HOST_WIDE_INT adjustment)
5574 {
5575 switch (mode)
5576 {
5577 case E_DImode:
5578 return gen_storewb_pairdi_di (base, base, reg, reg2,
5579 GEN_INT (-adjustment),
5580 GEN_INT (UNITS_PER_WORD - adjustment));
5581 case E_DFmode:
5582 return gen_storewb_pairdf_di (base, base, reg, reg2,
5583 GEN_INT (-adjustment),
5584 GEN_INT (UNITS_PER_WORD - adjustment));
5585 case E_TFmode:
5586 return gen_storewb_pairtf_di (base, base, reg, reg2,
5587 GEN_INT (-adjustment),
5588 GEN_INT (UNITS_PER_VREG - adjustment));
5589 default:
5590 gcc_unreachable ();
5591 }
5592 }
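/* For example (illustrative only): for E_DImode with REG = x29, REG2 = x30
   and ADJUSTMENT = 96, the generated insn corresponds to
   "stp x29, x30, [sp, -96]!" when BASE is the stack pointer. */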
5593
5594 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5595 stack pointer by ADJUSTMENT. */
5596
5597 static void
5598 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5599 {
5600 rtx_insn *insn;
5601 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5602
5603 if (regno2 == INVALID_REGNUM)
5604 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5605
5606 rtx reg1 = gen_rtx_REG (mode, regno1);
5607 rtx reg2 = gen_rtx_REG (mode, regno2);
5608
5609 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5610 reg2, adjustment));
5611 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5612 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5613 RTX_FRAME_RELATED_P (insn) = 1;
5614 }
5615
5616 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5617 adjusting it by ADJUSTMENT afterwards. */
5618
5619 static rtx
5620 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5621 HOST_WIDE_INT adjustment)
5622 {
5623 switch (mode)
5624 {
5625 case E_DImode:
5626 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5627 GEN_INT (UNITS_PER_WORD));
5628 case E_DFmode:
5629 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5630 GEN_INT (UNITS_PER_WORD));
5631 case E_TFmode:
5632 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5633 GEN_INT (UNITS_PER_VREG));
5634 default:
5635 gcc_unreachable ();
5636 }
5637 }
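/* For example (illustrative only): for E_DImode with REG = x29, REG2 = x30
   and ADJUSTMENT = 96, the generated insn corresponds to the post-indexed
   "ldp x29, x30, [sp], 96" when BASE is the stack pointer. */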
5638
5639 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5640 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5641 into CFI_OPS. */
5642
5643 static void
5644 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5645 rtx *cfi_ops)
5646 {
5647 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5648 rtx reg1 = gen_rtx_REG (mode, regno1);
5649
5650 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5651
5652 if (regno2 == INVALID_REGNUM)
5653 {
5654 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5655 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5656 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5657 }
5658 else
5659 {
5660 rtx reg2 = gen_rtx_REG (mode, regno2);
5661 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5662 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5663 reg2, adjustment));
5664 }
5665 }
5666
5667 /* Generate and return a store pair instruction of mode MODE to store
5668 register REG1 to MEM1 and register REG2 to MEM2. */
5669
5670 static rtx
5671 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5672 rtx reg2)
5673 {
5674 switch (mode)
5675 {
5676 case E_DImode:
5677 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5678
5679 case E_DFmode:
5680 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5681
5682 case E_TFmode:
5683 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5684
5685 default:
5686 gcc_unreachable ();
5687 }
5688 }
5689
5690 /* Generate and return a load pair instruction of mode MODE to load register
5691 REG1 from MEM1 and register REG2 from MEM2. */
5692
5693 static rtx
5694 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5695 rtx mem2)
5696 {
5697 switch (mode)
5698 {
5699 case E_DImode:
5700 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5701
5702 case E_DFmode:
5703 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5704
5705 case E_TFmode:
5706 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5707
5708 default:
5709 gcc_unreachable ();
5710 }
5711 }
5712
5713 /* Return TRUE if return address signing should be enabled for the current
5714 function, otherwise return FALSE. */
5715
5716 bool
5717 aarch64_return_address_signing_enabled (void)
5718 {
5719 /* This function should only be called after the frame is laid out. */
5720 gcc_assert (cfun->machine->frame.laid_out);
5721
5722 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5723 if its LR is pushed onto stack. */
5724 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5725 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5726 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5727 }
5728
5729 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5730 bool
5731 aarch64_bti_enabled (void)
5732 {
5733 return (aarch64_enable_bti == 1);
5734 }
5735
5736 /* Emit code to save the callee-saved registers from register number START
5737 to LIMIT to the stack at the location starting at offset START_OFFSET,
5738 skipping any write-back candidates if SKIP_WB is true. */
5739
5740 static void
5741 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5742 unsigned start, unsigned limit, bool skip_wb)
5743 {
5744 rtx_insn *insn;
5745 unsigned regno;
5746 unsigned regno2;
5747
5748 for (regno = aarch64_next_callee_save (start, limit);
5749 regno <= limit;
5750 regno = aarch64_next_callee_save (regno + 1, limit))
5751 {
5752 rtx reg, mem;
5753 poly_int64 offset;
5754 int offset_diff;
5755
5756 if (skip_wb
5757 && (regno == cfun->machine->frame.wb_candidate1
5758 || regno == cfun->machine->frame.wb_candidate2))
5759 continue;
5760
5761 if (cfun->machine->reg_is_wrapped_separately[regno])
5762 continue;
5763
5764 reg = gen_rtx_REG (mode, regno);
5765 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5766 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5767 offset));
5768
5769 regno2 = aarch64_next_callee_save (regno + 1, limit);
5770 offset_diff = cfun->machine->frame.reg_offset[regno2]
5771 - cfun->machine->frame.reg_offset[regno];
5772
5773 if (regno2 <= limit
5774 && !cfun->machine->reg_is_wrapped_separately[regno2]
5775 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5776 {
5777 rtx reg2 = gen_rtx_REG (mode, regno2);
5778 rtx mem2;
5779
5780 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5781 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5782 offset));
5783 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5784 reg2));
5785
5786 /* The first part of a frame-related parallel insn is
5787 always assumed to be relevant to the frame
5788 calculations; subsequent parts are only
5789 frame-related if explicitly marked. */
5790 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5791 regno = regno2;
5792 }
5793 else
5794 insn = emit_move_insn (mem, reg);
5795
5796 RTX_FRAME_RELATED_P (insn) = 1;
5797 }
5798 }
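/* For example (illustrative only): if x19 and x20 are both live callee-saves
   in consecutive slots, the loop above emits a single
   "stp x19, x20, [sp, <offset>]" rather than two separate stores, and marks
   the second element of the parallel as frame-related as described above. */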
5799
5800 /* Emit code to restore the callee registers of mode MODE from register
5801 number START up to and including LIMIT. Restore from the stack offset
5802 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5803 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5804
5805 static void
5806 aarch64_restore_callee_saves (machine_mode mode,
5807 poly_int64 start_offset, unsigned start,
5808 unsigned limit, bool skip_wb, rtx *cfi_ops)
5809 {
5810 rtx base_rtx = stack_pointer_rtx;
5811 unsigned regno;
5812 unsigned regno2;
5813 poly_int64 offset;
5814
5815 for (regno = aarch64_next_callee_save (start, limit);
5816 regno <= limit;
5817 regno = aarch64_next_callee_save (regno + 1, limit))
5818 {
5819 if (cfun->machine->reg_is_wrapped_separately[regno])
5820 continue;
5821
5822 rtx reg, mem;
5823 int offset_diff;
5824
5825 if (skip_wb
5826 && (regno == cfun->machine->frame.wb_candidate1
5827 || regno == cfun->machine->frame.wb_candidate2))
5828 continue;
5829
5830 reg = gen_rtx_REG (mode, regno);
5831 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5832 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5833
5834 regno2 = aarch64_next_callee_save (regno + 1, limit);
5835 offset_diff = cfun->machine->frame.reg_offset[regno2]
5836 - cfun->machine->frame.reg_offset[regno];
5837
5838 if (regno2 <= limit
5839 && !cfun->machine->reg_is_wrapped_separately[regno2]
5840 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5841 {
5842 rtx reg2 = gen_rtx_REG (mode, regno2);
5843 rtx mem2;
5844
5845 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5846 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5847 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5848
5849 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5850 regno = regno2;
5851 }
5852 else
5853 emit_move_insn (reg, mem);
5854 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5855 }
5856 }
5857
5858 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5859 of MODE. */
5860
5861 static inline bool
5862 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5863 {
5864 HOST_WIDE_INT multiple;
5865 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5866 && IN_RANGE (multiple, -8, 7));
5867 }
5868
5869 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5870 of MODE. */
5871
5872 static inline bool
5873 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5874 {
5875 HOST_WIDE_INT multiple;
5876 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5877 && IN_RANGE (multiple, 0, 63));
5878 }
5879
5880 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5881 of MODE. */
5882
5883 bool
5884 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5885 {
5886 HOST_WIDE_INT multiple;
5887 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5888 && IN_RANGE (multiple, -64, 63));
5889 }
5890
5891 /* Return true if OFFSET is a signed 9-bit value. */
5892
5893 bool
5894 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5895 poly_int64 offset)
5896 {
5897 HOST_WIDE_INT const_offset;
5898 return (offset.is_constant (&const_offset)
5899 && IN_RANGE (const_offset, -256, 255));
5900 }
5901
5902 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5903 of MODE. */
5904
5905 static inline bool
5906 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5907 {
5908 HOST_WIDE_INT multiple;
5909 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5910 && IN_RANGE (multiple, -256, 255));
5911 }
5912
5913 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5914 of MODE. */
5915
5916 static inline bool
5917 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5918 {
5919 HOST_WIDE_INT multiple;
5920 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5921 && IN_RANGE (multiple, 0, 4095));
5922 }
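/* For instance, for DImode (8-byte) accesses the predicate above accepts
   offsets 0, 8, ..., 32760 (4095 * 8), matching the unsigned scaled
   immediate range of a single LDR/STR. */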
5923
5924 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5925
5926 static sbitmap
5927 aarch64_get_separate_components (void)
5928 {
5929 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5930 bitmap_clear (components);
5931
5932 /* The registers we need saved to the frame. */
5933 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5934 if (aarch64_register_saved_on_entry (regno))
5935 {
5936 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5937 if (!frame_pointer_needed)
5938 offset += cfun->machine->frame.frame_size
5939 - cfun->machine->frame.hard_fp_offset;
5940 /* Check that we can access the stack slot of the register with one
5941 direct load with no adjustments needed. */
5942 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5943 bitmap_set_bit (components, regno);
5944 }
5945
5946 /* Don't mess with the hard frame pointer. */
5947 if (frame_pointer_needed)
5948 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5949
5950 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5951 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5952 /* If registers have been chosen to be stored/restored with
5953 writeback don't interfere with them to avoid having to output explicit
5954 stack adjustment instructions. */
5955 if (reg2 != INVALID_REGNUM)
5956 bitmap_clear_bit (components, reg2);
5957 if (reg1 != INVALID_REGNUM)
5958 bitmap_clear_bit (components, reg1);
5959
5960 bitmap_clear_bit (components, LR_REGNUM);
5961 bitmap_clear_bit (components, SP_REGNUM);
5962
5963 return components;
5964 }
5965
5966 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5967
5968 static sbitmap
5969 aarch64_components_for_bb (basic_block bb)
5970 {
5971 bitmap in = DF_LIVE_IN (bb);
5972 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5973 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5974 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5975
5976 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5977 bitmap_clear (components);
5978
5979 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5980 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5981 if ((!call_used_or_fixed_reg_p (regno)
5982 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5983 && (bitmap_bit_p (in, regno)
5984 || bitmap_bit_p (gen, regno)
5985 || bitmap_bit_p (kill, regno)))
5986 {
5987 unsigned regno2, offset, offset2;
5988 bitmap_set_bit (components, regno);
5989
5990 /* If there is a callee-save at an adjacent offset, add it too
5991 to increase the use of LDP/STP. */
5992 offset = cfun->machine->frame.reg_offset[regno];
5993 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5994
5995 if (regno2 <= LAST_SAVED_REGNUM)
5996 {
5997 offset2 = cfun->machine->frame.reg_offset[regno2];
5998 if ((offset & ~8) == (offset2 & ~8))
5999 bitmap_set_bit (components, regno2);
6000 }
6001 }
6002
6003 return components;
6004 }
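/* An example of the pairing heuristic above (illustrative only): if x20 lives
   at reg_offset 32, its candidate partner is x21; if x21's slot is at offset
   40, the two offsets agree once bit 3 is masked off, so x21 is added to the
   component set as well, enabling a later LDP/STP. */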
6005
6006 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6007 Nothing to do for aarch64. */
6008
6009 static void
6010 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6011 {
6012 }
6013
6014 /* Return the next set bit in BMP from START onwards. Return the total number
6015 of bits in BMP if no set bit is found at or after START. */
6016
6017 static unsigned int
6018 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6019 {
6020 unsigned int nbits = SBITMAP_SIZE (bmp);
6021 if (start == nbits)
6022 return start;
6023
6024 gcc_assert (start < nbits);
6025 for (unsigned int i = start; i < nbits; i++)
6026 if (bitmap_bit_p (bmp, i))
6027 return i;
6028
6029 return nbits;
6030 }
6031
6032 /* Do the work for aarch64_emit_prologue_components and
6033 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6034 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
6035 for these components or the epilogue sequence. That is, it determines
6036 whether we should emit stores or loads and what kind of CFA notes to attach
6037 to the insns. Otherwise the logic for the two sequences is very
6038 similar. */
6039
6040 static void
6041 aarch64_process_components (sbitmap components, bool prologue_p)
6042 {
6043 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6044 ? HARD_FRAME_POINTER_REGNUM
6045 : STACK_POINTER_REGNUM);
6046
6047 unsigned last_regno = SBITMAP_SIZE (components);
6048 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6049 rtx_insn *insn = NULL;
6050
6051 while (regno != last_regno)
6052 {
6053 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
6054 so DFmode for the vector registers is enough. For simd functions
6055 we want to save the low 128 bits. */
6056 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
6057
6058 rtx reg = gen_rtx_REG (mode, regno);
6059 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6060 if (!frame_pointer_needed)
6061 offset += cfun->machine->frame.frame_size
6062 - cfun->machine->frame.hard_fp_offset;
6063 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6064 rtx mem = gen_frame_mem (mode, addr);
6065
6066 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6067 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6068 /* No more registers to handle after REGNO.
6069 Emit a single save/restore and exit. */
6070 if (regno2 == last_regno)
6071 {
6072 insn = emit_insn (set);
6073 RTX_FRAME_RELATED_P (insn) = 1;
6074 if (prologue_p)
6075 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6076 else
6077 add_reg_note (insn, REG_CFA_RESTORE, reg);
6078 break;
6079 }
6080
6081 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6082 /* The next register is not of the same class or its offset is not
6083 mergeable with the current one into a pair. */
6084 if (!satisfies_constraint_Ump (mem)
6085 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6086 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6087 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6088 GET_MODE_SIZE (mode)))
6089 {
6090 insn = emit_insn (set);
6091 RTX_FRAME_RELATED_P (insn) = 1;
6092 if (prologue_p)
6093 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6094 else
6095 add_reg_note (insn, REG_CFA_RESTORE, reg);
6096
6097 regno = regno2;
6098 continue;
6099 }
6100
6101 /* REGNO2 can be saved/restored in a pair with REGNO. */
6102 rtx reg2 = gen_rtx_REG (mode, regno2);
6103 if (!frame_pointer_needed)
6104 offset2 += cfun->machine->frame.frame_size
6105 - cfun->machine->frame.hard_fp_offset;
6106 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6107 rtx mem2 = gen_frame_mem (mode, addr2);
6108 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6109 : gen_rtx_SET (reg2, mem2);
6110
6111 if (prologue_p)
6112 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6113 else
6114 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6115
6116 RTX_FRAME_RELATED_P (insn) = 1;
6117 if (prologue_p)
6118 {
6119 add_reg_note (insn, REG_CFA_OFFSET, set);
6120 add_reg_note (insn, REG_CFA_OFFSET, set2);
6121 }
6122 else
6123 {
6124 add_reg_note (insn, REG_CFA_RESTORE, reg);
6125 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6126 }
6127
6128 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6129 }
6130 }
6131
6132 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6133
6134 static void
6135 aarch64_emit_prologue_components (sbitmap components)
6136 {
6137 aarch64_process_components (components, true);
6138 }
6139
6140 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6141
6142 static void
6143 aarch64_emit_epilogue_components (sbitmap components)
6144 {
6145 aarch64_process_components (components, false);
6146 }
6147
6148 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6149
6150 static void
6151 aarch64_set_handled_components (sbitmap components)
6152 {
6153 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6154 if (bitmap_bit_p (components, regno))
6155 cfun->machine->reg_is_wrapped_separately[regno] = true;
6156 }
6157
6158 /* On AArch64 we have an ABI-defined safe buffer. This constant is used when
6159 determining the probe offset for alloca. */
6160
6161 static HOST_WIDE_INT
6162 aarch64_stack_clash_protection_alloca_probe_range (void)
6163 {
6164 return STACK_CLASH_CALLER_GUARD;
6165 }
6166
6167
6168 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6169 registers. If POLY_SIZE is not large enough to require a probe this function
6170 will only adjust the stack. When allocating the stack space,
6171 FRAME_RELATED_P indicates whether the allocation is frame related.
6172 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6173 arguments. If we are, we ensure that any allocation larger than the
6174 ABI-defined buffer is probed, so that the invariant of having a 1KB
6175 buffer is maintained.
6176
6177 We emit barriers after each stack adjustment to prevent optimizations from
6178 breaking the invariant that we never drop the stack more than a page. This
6179 invariant is needed to make it easier to correctly handle asynchronous
6180 events: e.g. if we were to allow the stack to be dropped by more than a page
6181 and then emit multiple probes to catch up, and a signal were taken somewhere
6182 in between, the signal handler would not know the state of the stack and
6183 could make no assumptions about which pages have been probed. */
6184
6185 static void
6186 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6187 poly_int64 poly_size,
6188 bool frame_related_p,
6189 bool final_adjustment_p)
6190 {
6191 HOST_WIDE_INT guard_size
6192 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6193 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6194 /* When doing the final adjustment for the outgoing argument size we can't
6195 assume that LR was saved at position 0. So subtract its offset from the
6196 ABI safe buffer so that we don't accidentally allow an adjustment that
6197 would result in an allocation larger than the ABI buffer without
6198 probing. */
6199 HOST_WIDE_INT min_probe_threshold
6200 = final_adjustment_p
6201 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6202 : guard_size - guard_used_by_caller;
6203
6204 poly_int64 frame_size = cfun->machine->frame.frame_size;
6205
6206 /* We should always have a positive probe threshold. */
6207 gcc_assert (min_probe_threshold > 0);
6208
6209 if (flag_stack_clash_protection && !final_adjustment_p)
6210 {
6211 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6212 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6213
6214 if (known_eq (frame_size, 0))
6215 {
6216 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6217 }
6218 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6219 && known_lt (final_adjust, guard_used_by_caller))
6220 {
6221 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6222 }
6223 }
6224
6225 /* If SIZE is not large enough to require probing, just adjust the stack and
6226 exit. */
6227 if (known_lt (poly_size, min_probe_threshold)
6228 || !flag_stack_clash_protection)
6229 {
6230 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6231 return;
6232 }
6233
6234 HOST_WIDE_INT size;
6235 /* Handle the SVE non-constant case first. */
6236 if (!poly_size.is_constant (&size))
6237 {
6238 if (dump_file)
6239 {
6240 fprintf (dump_file, "Stack clash SVE prologue: ");
6241 print_dec (poly_size, dump_file);
6242 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6243 }
6244
6245 /* First calculate the amount of bytes we're actually spilling. */
6246 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6247 poly_size, temp1, temp2, false, true);
6248
6249 rtx_insn *insn = get_last_insn ();
6250
6251 if (frame_related_p)
6252 {
6253 /* This is done to provide unwinding information for the stack
6254 adjustments we're about to do, however to prevent the optimizers
6255 from removing the R11 move and leaving the CFA note (which would be
6256 very wrong) we tie the old and new stack pointer together.
6257 The tie will expand to nothing but the optimizers will not touch
6258 the instruction. */
6259 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6260 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6261 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6262
6263 /* We want the CFA independent of the stack pointer for the
6264 duration of the loop. */
6265 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6266 RTX_FRAME_RELATED_P (insn) = 1;
6267 }
6268
6269 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6270 rtx guard_const = gen_int_mode (guard_size, Pmode);
6271
6272 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6273 stack_pointer_rtx, temp1,
6274 probe_const, guard_const));
6275
6276 /* Now reset the CFA register if needed. */
6277 if (frame_related_p)
6278 {
6279 add_reg_note (insn, REG_CFA_DEF_CFA,
6280 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6281 gen_int_mode (poly_size, Pmode)));
6282 RTX_FRAME_RELATED_P (insn) = 1;
6283 }
6284
6285 return;
6286 }
6287
6288 if (dump_file)
6289 fprintf (dump_file,
6290 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6291 " bytes, probing will be required.\n", size);
6292
6293 /* Round size to the nearest multiple of guard_size, and calculate the
6294 residual as the difference between the original size and the rounded
6295 size. */
6296 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6297 HOST_WIDE_INT residual = size - rounded_size;
6298
6299 /* We can handle a small number of allocations/probes inline. Otherwise
6300 punt to a loop. */
6301 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6302 {
6303 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6304 {
6305 aarch64_sub_sp (NULL, temp2, guard_size, true);
6306 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6307 guard_used_by_caller));
6308 emit_insn (gen_blockage ());
6309 }
6310 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6311 }
6312 else
6313 {
6314 /* Compute the ending address. */
6315 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6316 temp1, NULL, false, true);
6317 rtx_insn *insn = get_last_insn ();
6318
6319 /* For the initial allocation, we don't have a frame pointer
6320 set up, so we always need CFI notes. If we're doing the
6321 final allocation, then we may have a frame pointer, in which
6322 case it is the CFA, otherwise we need CFI notes.
6323
6324 We can determine which allocation we are doing by looking at
6325 the value of FRAME_RELATED_P since the final allocations are not
6326 frame related. */
6327 if (frame_related_p)
6328 {
6329 /* We want the CFA independent of the stack pointer for the
6330 duration of the loop. */
6331 add_reg_note (insn, REG_CFA_DEF_CFA,
6332 plus_constant (Pmode, temp1, rounded_size));
6333 RTX_FRAME_RELATED_P (insn) = 1;
6334 }
6335
6336 /* This allocates and probes the stack. Note that this re-uses some of
6337 the existing Ada stack protection code. However we are guaranteed not
6338 to enter the non-loop or residual branches of that code.
6339
6340 The non-loop part won't be entered because if our allocation amount
6341 doesn't require a loop, the case above would handle it.
6342
6343 The residual amount won't be entered because TEMP1 is a multiple of
6344 the allocation size. The residual will always be 0. As such, the only
6345 part we are actually using from that code is the loop setup. The
6346 actual probing is done in aarch64_output_probe_stack_range. */
6347 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6348 stack_pointer_rtx, temp1));
6349
6350 /* Now reset the CFA register if needed. */
6351 if (frame_related_p)
6352 {
6353 add_reg_note (insn, REG_CFA_DEF_CFA,
6354 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6355 RTX_FRAME_RELATED_P (insn) = 1;
6356 }
6357
6358 emit_insn (gen_blockage ());
6359 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6360 }
6361
6362 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6363 be probed. This maintains the requirement that each page is probed at
6364 least once. For initial probing we probe only if the allocation is
6365 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6366 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6367 GUARD_SIZE. This means that for any allocation large enough to trigger a
6368 probe here, we'll have at least one, and for any allocation that is not
6369 large enough for this code to emit anything, the page will already have been
6370 probed by the saving of FP/LR, either by this function or by any callees. If
6371 we don't have any callees then we won't have more stack adjustments and so
6372 are still safe. */
6373 if (residual)
6374 {
6375 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6376 /* If we're doing final adjustments, and we've done any full page
6377 allocations then any residual needs to be probed. */
6378 if (final_adjustment_p && rounded_size != 0)
6379 min_probe_threshold = 0;
6380 /* If doing a small final adjustment, we always probe at offset 0.
6381 This is done to avoid issues when LR is not at position 0 or when
6382 the final adjustment is smaller than the probing offset. */
6383 else if (final_adjustment_p && rounded_size == 0)
6384 residual_probe_offset = 0;
6385
6386 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6387 if (residual >= min_probe_threshold)
6388 {
6389 if (dump_file)
6390 fprintf (dump_file,
6391 "Stack clash AArch64 prologue residuals: "
6392 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6393 "\n", residual);
6394
6395 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6396 residual_probe_offset));
6397 emit_insn (gen_blockage ());
6398 }
6399 }
6400 }
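/* A hedged example of the constant-size path above: assuming the default
   64 KiB guard and the 1 KiB caller guard, a frame-related allocation of
   133120 bytes rounds down to two full guard-sized pages with a 2 KiB
   residual, giving roughly (scheduling barriers omitted)

	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #2048

   The residual is below the probing threshold, so it is not probed here; the
   saving of FP/LR provides the implicit probe for that final page. */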
6401
6402 /* Return 1 if the register is used by the epilogue. We need to say the
6403 return register is used, but only after epilogue generation is complete.
6404 Note that in the case of sibcalls, the values "used by the epilogue" are
6405 considered live at the start of the called function.
6406
6407 For SIMD functions we need to return 1 for FP registers that are saved and
6408 restored by a function but are not zero in call_used_regs. If we do not do
6409 this, optimizations may remove the restore of the register. */
6410
6411 int
6412 aarch64_epilogue_uses (int regno)
6413 {
6414 if (epilogue_completed)
6415 {
6416 if (regno == LR_REGNUM)
6417 return 1;
6418 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6419 return 1;
6420 }
6421 return 0;
6422 }
6423
6424 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6425 is saved at BASE + OFFSET. */
6426
6427 static void
6428 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6429 rtx base, poly_int64 offset)
6430 {
6431 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6432 add_reg_note (insn, REG_CFA_EXPRESSION,
6433 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6434 }
6435
6436 /* AArch64 stack frames generated by this compiler look like:
6437
6438 +-------------------------------+
6439 | |
6440 | incoming stack arguments |
6441 | |
6442 +-------------------------------+
6443 | | <-- incoming stack pointer (aligned)
6444 | callee-allocated save area |
6445 | for register varargs |
6446 | |
6447 +-------------------------------+
6448 | local variables | <-- frame_pointer_rtx
6449 | |
6450 +-------------------------------+
6451 | padding | \
6452 +-------------------------------+ |
6453 | callee-saved registers | | frame.saved_regs_size
6454 +-------------------------------+ |
6455 | LR' | |
6456 +-------------------------------+ |
6457 | FP' | / <- hard_frame_pointer_rtx (aligned)
6458 +-------------------------------+
6459 | dynamic allocation |
6460 +-------------------------------+
6461 | padding |
6462 +-------------------------------+
6463 | outgoing stack arguments | <-- arg_pointer
6464 | |
6465 +-------------------------------+
6466 | | <-- stack_pointer_rtx (aligned)
6467
6468 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6469 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6470 unchanged.
6471
6472 By default for stack-clash we assume the guard is at least 64KB, but this
6473 value is configurable to either 4KB or 64KB. We also force the guard size to
6474 be the same as the probing interval and both values are kept in sync.
6475
6476 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6477 on the guard size) of stack space without probing.
6478
6479 When probing is needed, we emit a probe at the start of the prologue
6480 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6481
6482 We have to track how much space has been allocated and the only stores
6483 to the stack we track as implicit probes are the FP/LR stores.
6484
6485 For outgoing arguments we probe if the size is larger than 1KB, such that
6486 the ABI specified buffer is maintained for the next callee.
6487
6488 The following registers are reserved during frame layout and should not be
6489 used for any other purpose:
6490
6491 - r11: Used by stack clash protection when SVE is enabled.
6492 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6493 - r14 and r15: Used for speculation tracking.
6494 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6495 - r30(LR), r29(FP): Used by standard frame layout.
6496
6497 These registers must be avoided in frame layout related code unless the
6498 explicit intention is to interact with one of the features listed above. */
6499
6500 /* Generate the prologue instructions for entry into a function.
6501 Establish the stack frame by decreasing the stack pointer with a
6502 properly calculated size and, if necessary, create a frame record
6503 filled with the values of LR and previous frame pointer. The
6504 current FP is also set up if it is in use. */
6505
6506 void
6507 aarch64_expand_prologue (void)
6508 {
6509 poly_int64 frame_size = cfun->machine->frame.frame_size;
6510 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6511 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6512 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6513 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6514 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6515 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6516 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6517 rtx_insn *insn;
6518
6519 /* Sign return address for functions. */
6520 if (aarch64_return_address_signing_enabled ())
6521 {
6522 switch (aarch64_ra_sign_key)
6523 {
6524 case AARCH64_KEY_A:
6525 insn = emit_insn (gen_paciasp ());
6526 break;
6527 case AARCH64_KEY_B:
6528 insn = emit_insn (gen_pacibsp ());
6529 break;
6530 default:
6531 gcc_unreachable ();
6532 }
6533 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6534 RTX_FRAME_RELATED_P (insn) = 1;
6535 }
6536
6537 if (flag_stack_usage_info)
6538 current_function_static_stack_size = constant_lower_bound (frame_size);
6539
6540 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6541 {
6542 if (crtl->is_leaf && !cfun->calls_alloca)
6543 {
6544 if (maybe_gt (frame_size, PROBE_INTERVAL)
6545 && maybe_gt (frame_size, get_stack_check_protect ()))
6546 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6547 (frame_size
6548 - get_stack_check_protect ()));
6549 }
6550 else if (maybe_gt (frame_size, 0))
6551 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6552 }
6553
6554 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6555 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6556
6557 /* In theory we should never have both an initial adjustment
6558 and a callee save adjustment. Verify that is the case since the
6559 code below does not handle it for -fstack-clash-protection. */
6560 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6561
6562 /* Will only probe if the initial adjustment is larger than the guard
6563 less the amount of the guard reserved for use by the caller's
6564 outgoing args. */
6565 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6566 true, false);
6567
6568 if (callee_adjust != 0)
6569 aarch64_push_regs (reg1, reg2, callee_adjust);
6570
6571 if (emit_frame_chain)
6572 {
6573 poly_int64 reg_offset = callee_adjust;
6574 if (callee_adjust == 0)
6575 {
6576 reg1 = R29_REGNUM;
6577 reg2 = R30_REGNUM;
6578 reg_offset = callee_offset;
6579 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6580 }
6581 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6582 stack_pointer_rtx, callee_offset,
6583 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6584 if (frame_pointer_needed && !frame_size.is_constant ())
6585 {
6586 /* Variable-sized frames need to describe the save slot
6587 address using DW_CFA_expression rather than DW_CFA_offset.
6588 This means that, without taking further action, the
6589 locations of the registers that we've already saved would
6590 remain based on the stack pointer even after we redefine
6591 the CFA based on the frame pointer. We therefore need new
6592 DW_CFA_expressions to re-express the save slots with addresses
6593 based on the frame pointer. */
6594 rtx_insn *insn = get_last_insn ();
6595 gcc_assert (RTX_FRAME_RELATED_P (insn));
6596
6597 /* Add an explicit CFA definition if this was previously
6598 implicit. */
6599 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6600 {
6601 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6602 callee_offset);
6603 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6604 gen_rtx_SET (hard_frame_pointer_rtx, src));
6605 }
6606
6607 /* Change the save slot expressions for the registers that
6608 we've already saved. */
6609 reg_offset -= callee_offset;
6610 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6611 reg_offset + UNITS_PER_WORD);
6612 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6613 reg_offset);
6614 }
6615 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6616 }
6617
6618 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6619 callee_adjust != 0 || emit_frame_chain);
6620 if (aarch64_simd_decl_p (cfun->decl))
6621 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6622 callee_adjust != 0 || emit_frame_chain);
6623 else
6624 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6625 callee_adjust != 0 || emit_frame_chain);
6626
6627 /* We may need to probe the final adjustment if it is larger than the guard
6628 that is assumed by the callee. */
6629 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6630 !frame_pointer_needed, true);
6631 }
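/* Putting the pieces together, a sketch (not verbatim output) of the
   prologue for the small 80-byte constant frame described after
   aarch64_layout_frame, with a frame chain and x19 saved, would be

	stp	x29, x30, [sp, -80]!
	mov	x29, sp
	str	x19, [sp, 16]

   with a "paciasp" emitted first when return-address signing is enabled. */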
6632
6633 /* Return TRUE if we can use a simple_return insn.
6634
6635 This function checks whether the callee saved stack is empty, which
6636 means no restore actions are needed. The pro_and_epilogue will use
6637 this to check whether shrink-wrapping opt is feasible. */
6638
6639 bool
6640 aarch64_use_return_insn_p (void)
6641 {
6642 if (!reload_completed)
6643 return false;
6644
6645 if (crtl->profile)
6646 return false;
6647
6648 return known_eq (cfun->machine->frame.frame_size, 0);
6649 }
6650
6651 /* Return false for non-leaf SIMD functions in order to avoid
6652 shrink-wrapping them. Shrink-wrapping would lose the necessary
6653 save/restore of FP registers. */
6654
6655 bool
6656 aarch64_use_simple_return_insn_p (void)
6657 {
6658 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6659 return false;
6660
6661 return true;
6662 }
6663
6664 /* Generate the epilogue instructions for returning from a function.
6665 This is almost exactly the reverse of the prolog sequence, except
6666 that we need to insert barriers to avoid scheduling loads that read
6667 from a deallocated stack, and we optimize the unwind records by
6668 emitting them all together if possible. */
6669 void
6670 aarch64_expand_epilogue (bool for_sibcall)
6671 {
6672 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6673 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6674 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6675 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6676 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6677 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6678 rtx cfi_ops = NULL;
6679 rtx_insn *insn;
6680 /* A stack clash protection prologue may not have left EP0_REGNUM or
6681 EP1_REGNUM in a usable state. The same is true for allocations
6682 with an SVE component, since we then need both temporary registers
6683 for each allocation. For stack clash we are in a usable state if
6684 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6685 HOST_WIDE_INT guard_size
6686 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6687 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6688
6689 /* We can re-use the registers when the allocation amount is smaller than
6690 guard_size - guard_used_by_caller because we won't be doing any probes
6691 then. In such situations the register should remain live with the correct
6692 value. */
6693 bool can_inherit_p = (initial_adjust.is_constant ()
6694 && final_adjust.is_constant ())
6695 && (!flag_stack_clash_protection
6696 || known_lt (initial_adjust,
6697 guard_size - guard_used_by_caller));
6698
6699 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6700 bool need_barrier_p
6701 = maybe_ne (get_frame_size ()
6702 + cfun->machine->frame.saved_varargs_size, 0);
6703
6704 /* Emit a barrier to prevent loads from a deallocated stack. */
6705 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6706 || cfun->calls_alloca
6707 || crtl->calls_eh_return)
6708 {
6709 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6710 need_barrier_p = false;
6711 }
6712
6713 /* Restore the stack pointer from the frame pointer if it may not
6714 be the same as the stack pointer. */
6715 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6716 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6717 if (frame_pointer_needed
6718 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6719 /* If writeback is used when restoring callee-saves, the CFA
6720 is restored on the instruction doing the writeback. */
6721 aarch64_add_offset (Pmode, stack_pointer_rtx,
6722 hard_frame_pointer_rtx, -callee_offset,
6723 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6724 else
6725 /* The case where we need to re-use the register here is very rare, so
6726 avoid the complicated condition and just always emit a move if the
6727 immediate doesn't fit. */
6728 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6729
6730 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6731 callee_adjust != 0, &cfi_ops);
6732 if (aarch64_simd_decl_p (cfun->decl))
6733 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6734 callee_adjust != 0, &cfi_ops);
6735 else
6736 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6737 callee_adjust != 0, &cfi_ops);
6738
6739 if (need_barrier_p)
6740 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6741
6742 if (callee_adjust != 0)
6743 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6744
6745 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6746 {
6747 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6748 insn = get_last_insn ();
6749 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6750 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6751 RTX_FRAME_RELATED_P (insn) = 1;
6752 cfi_ops = NULL;
6753 }
6754
6755 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6756 we restrict the emit_move optimization to leaf functions. */
6757 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6758 (!can_inherit_p || !crtl->is_leaf
6759 || df_regs_ever_live_p (EP0_REGNUM)));
6760
6761 if (cfi_ops)
6762 {
6763 /* Emit delayed restores and reset the CFA to be SP. */
6764 insn = get_last_insn ();
6765 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6766 REG_NOTES (insn) = cfi_ops;
6767 RTX_FRAME_RELATED_P (insn) = 1;
6768 }
6769
6770 /* We prefer to emit the combined return/authenticate instruction RETAA;
6771 however, there are three cases in which we must instead emit an explicit
6772 authentication instruction.
6773
6774 1) Sibcalls don't return in a normal way, so if we're about to call one
6775 we must authenticate.
6776
6777 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6778 generating code for !TARGET_ARMV8_3 we can't use it and must
6779 explicitly authenticate.
6780
6781 3) On an eh_return path we make extra stack adjustments to update the
6782 canonical frame address to be the exception handler's CFA. We want
6783 to authenticate using the CFA of the function which calls eh_return.
6784 */
6785 if (aarch64_return_address_signing_enabled ()
6786 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6787 {
6788 switch (aarch64_ra_sign_key)
6789 {
6790 case AARCH64_KEY_A:
6791 insn = emit_insn (gen_autiasp ());
6792 break;
6793 case AARCH64_KEY_B:
6794 insn = emit_insn (gen_autibsp ());
6795 break;
6796 default:
6797 gcc_unreachable ();
6798 }
6799 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6800 RTX_FRAME_RELATED_P (insn) = 1;
6801 }
6802
6803 /* Stack adjustment for exception handler. */
6804 if (crtl->calls_eh_return && !for_sibcall)
6805 {
6806 /* We need to unwind the stack by the offset computed by
6807 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6808 to be SP; letting the CFA move during this adjustment
6809 is just as correct as retaining the CFA from the body
6810 of the function. Therefore, do nothing special. */
6811 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6812 }
6813
6814 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6815 if (!for_sibcall)
6816 emit_jump_insn (ret_rtx);
6817 }
6818
6819 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6820 normally or return to a previous frame after unwinding.
6821
6822 An EH return uses a single shared return sequence. The epilogue is
6823 exactly like a normal epilogue except that it has an extra input
6824 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6825 that must be applied after the frame has been destroyed. An extra label
6826 is inserted before the epilogue which initializes this register to zero,
6827 and this is the entry point for a normal return.
6828
6829 An actual EH return updates the return address, initializes the stack
6830 adjustment and jumps directly into the epilogue (bypassing the zeroing
6831 of the adjustment). Since the return address is typically saved on the
6832 stack when a function makes a call, the saved LR must be updated outside
6833 the epilogue.
6834
6835 This poses problems as the store is generated well before the epilogue,
6836 so the offset of LR is not known yet. Also optimizations will remove the
6837 store as it appears dead, even after the epilogue is generated (as the
6838 base or offset for loading LR is different in many cases).
6839
6840 To avoid these problems this implementation forces the frame pointer
6841 in eh_return functions so that the location of LR is fixed and known early.
6842 It also marks the store volatile, so no optimization is permitted to
6843 remove the store. */
6844 rtx
6845 aarch64_eh_return_handler_rtx (void)
6846 {
6847 rtx tmp = gen_frame_mem (Pmode,
6848 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6849
6850 /* Mark the store volatile, so no optimization is permitted to remove it. */
6851 MEM_VOLATILE_P (tmp) = true;
6852 return tmp;
6853 }
6854
6855 /* Output code to add DELTA to the first argument, and then jump
6856 to FUNCTION. Used for C++ multiple inheritance. */
6857 static void
6858 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6859 HOST_WIDE_INT delta,
6860 HOST_WIDE_INT vcall_offset,
6861 tree function)
6862 {
6863 /* The this pointer is always in x0. Note that this differs from
6864 Arm where the this pointer may be bumped to r1 if r0 is required
6865 to return a pointer to an aggregate. On AArch64 a result value
6866 pointer will be in x8. */
6867 int this_regno = R0_REGNUM;
6868 rtx this_rtx, temp0, temp1, addr, funexp;
6869 rtx_insn *insn;
6870 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6871
6872 if (aarch64_bti_enabled ())
6873 emit_insn (gen_bti_c ());
6874
6875 reload_completed = 1;
6876 emit_note (NOTE_INSN_PROLOGUE_END);
6877
6878 this_rtx = gen_rtx_REG (Pmode, this_regno);
6879 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6880 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6881
6882 if (vcall_offset == 0)
6883 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6884 else
6885 {
6886 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6887
6888 addr = this_rtx;
6889 if (delta != 0)
6890 {
6891 if (delta >= -256 && delta < 256)
6892 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6893 plus_constant (Pmode, this_rtx, delta));
6894 else
6895 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6896 temp1, temp0, false);
6897 }
6898
6899 if (Pmode == ptr_mode)
6900 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6901 else
6902 aarch64_emit_move (temp0,
6903 gen_rtx_ZERO_EXTEND (Pmode,
6904 gen_rtx_MEM (ptr_mode, addr)));
6905
6906 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6907 addr = plus_constant (Pmode, temp0, vcall_offset);
6908 else
6909 {
6910 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6911 Pmode);
6912 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6913 }
6914
6915 if (Pmode == ptr_mode)
6916 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6917 else
6918 aarch64_emit_move (temp1,
6919 gen_rtx_SIGN_EXTEND (Pmode,
6920 gen_rtx_MEM (ptr_mode, addr)));
6921
6922 emit_insn (gen_add2_insn (this_rtx, temp1));
6923 }
6924
6925 /* Generate a tail call to the target function. */
6926 if (!TREE_USED (function))
6927 {
6928 assemble_external (function);
6929 TREE_USED (function) = 1;
6930 }
6931 funexp = XEXP (DECL_RTL (function), 0);
6932 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6933 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6934 SIBLING_CALL_P (insn) = 1;
6935
6936 insn = get_insns ();
6937 shorten_branches (insn);
6938
6939 assemble_start_function (thunk, fnname);
6940 final_start_function (insn, file, 1);
6941 final (insn, file, 1);
6942 final_end_function ();
6943 assemble_end_function (thunk, fnname);
6944
6945 /* Stop pretending to be a post-reload pass. */
6946 reload_completed = 0;
6947 }
6948
6949 static bool
6950 aarch64_tls_referenced_p (rtx x)
6951 {
6952 if (!TARGET_HAVE_TLS)
6953 return false;
6954 subrtx_iterator::array_type array;
6955 FOR_EACH_SUBRTX (iter, array, x, ALL)
6956 {
6957 const_rtx x = *iter;
6958 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6959 return true;
6960 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6961 TLS offsets, not real symbol references. */
6962 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6963 iter.skip_subrtxes ();
6964 }
6965 return false;
6966 }
6967
6968
6969 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6970 a left shift of 0 or 12 bits. */
6971 bool
6972 aarch64_uimm12_shift (HOST_WIDE_INT val)
6973 {
6974 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6975 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6976 );
6977 }
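/* For example, 0xfff and 0xabc000 satisfy this test (left shift of 0 and 12
   respectively), whereas 0x1001 does not, because its set bits straddle the
   two 12-bit fields.  */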
6978
6979 /* Return the largest value no greater than VAL that can be encoded as a
6980 12-bit unsigned immediate with a left shift of 0 or 12. */
6981 static HOST_WIDE_INT
6982 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6983 {
6984 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6985 handle correctly. */
6986 gcc_assert ((val & 0xffffff) == val);
6987
6988 if (((val & 0xfff) << 0) == val)
6989 return val;
6990
6991 return val & (0xfff << 12);
6992 }
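/* For example, 0x123456 is clamped to 0x123000, while 0x456 already fits in
   the low 12 bits and is returned unchanged.  */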
6993
6994 /* Return true if val is an immediate that can be loaded into a
6995 register by a MOVZ instruction. */
6996 static bool
6997 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6998 {
6999 if (GET_MODE_SIZE (mode) > 4)
7000 {
7001 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7002 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7003 return true;
7004 }
7005 else
7006 {
7007 /* Ignore sign extension. */
7008 val &= (HOST_WIDE_INT) 0xffffffff;
7009 }
7010 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7011 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7012 }
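/* For example, 0xabcd00000000 is accepted in DImode (16 significant bits at
   a 16-bit-aligned position), whereas 0x1ffff is rejected because its set
   bits cross a 16-bit boundary.  */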
7013
7014 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7015 64-bit (DImode) integer. */
7016
7017 static unsigned HOST_WIDE_INT
7018 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7019 {
7020 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7021 while (size < 64)
7022 {
7023 val &= (HOST_WIDE_INT_1U << size) - 1;
7024 val |= val << size;
7025 size *= 2;
7026 }
7027 return val;
7028 }
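/* For example, replicating the 8-bit element value 0x3c gives
   0x3c3c3c3c3c3c3c3c.  */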
7029
7030 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7031
7032 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7033 {
7034 0x0000000100000001ull,
7035 0x0001000100010001ull,
7036 0x0101010101010101ull,
7037 0x1111111111111111ull,
7038 0x5555555555555555ull,
7039 };
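/* aarch64_bitmask_imm below indexes this table by __builtin_clz (width) - 26,
   so an element width of 32 maps to index 0, 16 to index 1, ..., 2 to index 4;
   multiplying the isolated element pattern by the corresponding constant
   replicates it across all 64 bits.  */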
7040
7041
7042 /* Return true if val is a valid bitmask immediate. */
7043
7044 bool
7045 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7046 {
7047 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7048 int bits;
7049
7050 /* Check for a single sequence of one bits and return quickly if so.
7051 The special cases of all ones and all zeroes return false. */
7052 val = aarch64_replicate_bitmask_imm (val_in, mode);
7053 tmp = val + (val & -val);
7054
7055 if (tmp == (tmp & -tmp))
7056 return (val + 1) > 1;
7057
7058 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7059 if (mode == SImode)
7060 val = (val << 32) | (val & 0xffffffff);
7061
7062 /* Invert if the immediate doesn't start with a zero bit - this means we
7063 only need to search for sequences of one bits. */
7064 if (val & 1)
7065 val = ~val;
7066
7067 /* Find the first set bit and set tmp to val with the first sequence of one
7068 bits removed. Return success if there is a single sequence of ones. */
7069 first_one = val & -val;
7070 tmp = val & (val + first_one);
7071
7072 if (tmp == 0)
7073 return true;
7074
7075 /* Find the next set bit and compute the difference in bit position. */
7076 next_one = tmp & -tmp;
7077 bits = clz_hwi (first_one) - clz_hwi (next_one);
7078 mask = val ^ tmp;
7079
7080 /* Check the bit position difference is a power of 2, and that the first
7081 sequence of one bits fits within 'bits' bits. */
7082 if ((mask >> bits) != 0 || bits != (bits & -bits))
7083 return false;
7084
7085 /* Check the sequence of one bits is repeated 64/bits times. */
7086 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7087 }
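/* For example, 0x00ff00ff00ff00ff is accepted (a run of eight ones repeated
   in every 16-bit element), whereas 0x0000000000000101 is rejected because
   the 8-bit repetition it would require does not hold across all 64 bits.  */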
7088
7089 /* Create a mask of ones covering the range from the lowest to the highest
7090 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7091
7092 unsigned HOST_WIDE_INT
7093 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7094 {
7095 int lowest_bit_set = ctz_hwi (val_in);
7096 int highest_bit_set = floor_log2 (val_in);
7097 gcc_assert (val_in != 0);
7098
7099 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7100 (HOST_WIDE_INT_1U << lowest_bit_set));
7101 }
7102
7103 /* Create a constant in which all bits outside the range from the lowest to
7104 the highest bit set in VAL_IN are set to 1. */
7105
7106 unsigned HOST_WIDE_INT
7107 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7108 {
7109 return val_in | ~aarch64_and_split_imm1 (val_in);
7110 }
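/* For example, for VAL_IN == 0xe70 the two halves are imm1 == 0xff0 and
   imm2 == 0xfffffffffffffe7f, and imm1 & imm2 == VAL_IN, so an AND with
   VAL_IN can be performed as an AND with imm1 followed by an AND with imm2,
   each of which is itself a bitmask immediate.  */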
7111
7112 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7113
7114 bool
7115 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7116 {
7117 scalar_int_mode int_mode;
7118 if (!is_a <scalar_int_mode> (mode, &int_mode))
7119 return false;
7120
7121 if (aarch64_bitmask_imm (val_in, int_mode))
7122 return false;
7123
7124 if (aarch64_move_imm (val_in, int_mode))
7125 return false;
7126
7127 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7128
7129 return aarch64_bitmask_imm (imm2, int_mode);
7130 }
7131
7132 /* Return true if val is an immediate that can be loaded into a
7133 register in a single instruction. */
7134 bool
7135 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7136 {
7137 scalar_int_mode int_mode;
7138 if (!is_a <scalar_int_mode> (mode, &int_mode))
7139 return false;
7140
7141 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7142 return true;
7143 return aarch64_bitmask_imm (val, int_mode);
7144 }
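/* In other words, a constant is a single-instruction immediate if it can be
   loaded by MOVZ, by MOVN (the inverted MOVZ check above), or by the MOV
   alias of ORR with a bitmask immediate.  */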
7145
7146 static bool
7147 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7148 {
7149 rtx base, offset;
7150
7151 if (GET_CODE (x) == HIGH)
7152 return true;
7153
7154 /* There's no way to calculate VL-based values using relocations. */
7155 subrtx_iterator::array_type array;
7156 FOR_EACH_SUBRTX (iter, array, x, ALL)
7157 if (GET_CODE (*iter) == CONST_POLY_INT)
7158 return true;
7159
7160 split_const (x, &base, &offset);
7161 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7162 {
7163 if (aarch64_classify_symbol (base, INTVAL (offset))
7164 != SYMBOL_FORCE_TO_MEM)
7165 return true;
7166 else
7167 /* Avoid generating a 64-bit relocation in ILP32; leave
7168 to aarch64_expand_mov_immediate to handle it properly. */
7169 return mode != ptr_mode;
7170 }
7171
7172 return aarch64_tls_referenced_p (x);
7173 }
7174
7175 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7176 The expansion of a table switch is quite expensive due to the number
7177 of instructions, the table lookup and the hard-to-predict indirect jump.
7178 When optimizing for speed and -O3 is enabled, use the per-core tuning if
7179 set, otherwise use tables for > 16 cases as a trade-off between size and
7180 performance. When optimizing for size, use the default setting. */
7181
7182 static unsigned int
7183 aarch64_case_values_threshold (void)
7184 {
7185 /* Use the specified limit for the number of cases before using jump
7186 tables at higher optimization levels. */
7187 if (optimize > 2
7188 && selected_cpu->tune->max_case_values != 0)
7189 return selected_cpu->tune->max_case_values;
7190 else
7191 return optimize_size ? default_case_values_threshold () : 17;
7192 }
7193
7194 /* Return true if register REGNO is a valid index register.
7195 STRICT_P is true if REG_OK_STRICT is in effect. */
7196
7197 bool
7198 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7199 {
7200 if (!HARD_REGISTER_NUM_P (regno))
7201 {
7202 if (!strict_p)
7203 return true;
7204
7205 if (!reg_renumber)
7206 return false;
7207
7208 regno = reg_renumber[regno];
7209 }
7210 return GP_REGNUM_P (regno);
7211 }
7212
7213 /* Return true if register REGNO is a valid base register.
7214 STRICT_P is true if REG_OK_STRICT is in effect. */
7215
7216 bool
7217 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7218 {
7219 if (!HARD_REGISTER_NUM_P (regno))
7220 {
7221 if (!strict_p)
7222 return true;
7223
7224 if (!reg_renumber)
7225 return false;
7226
7227 regno = reg_renumber[regno];
7228 }
7229
7230 /* The fake registers will be eliminated to either the stack or
7231 hard frame pointer, both of which are usually valid base registers.
7232 Reload deals with the cases where the eliminated form isn't valid. */
7233 return (GP_REGNUM_P (regno)
7234 || regno == SP_REGNUM
7235 || regno == FRAME_POINTER_REGNUM
7236 || regno == ARG_POINTER_REGNUM);
7237 }
7238
7239 /* Return true if X is a valid base register.
7240 STRICT_P is true if REG_OK_STRICT is in effect. */
7241
7242 static bool
7243 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7244 {
7245 if (!strict_p
7246 && GET_CODE (x) == SUBREG
7247 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7248 x = SUBREG_REG (x);
7249
7250 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7251 }
7252
7253 /* Return true if address offset is a valid index. If it is, fill in INFO
7254 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7255
7256 static bool
7257 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7258 machine_mode mode, bool strict_p)
7259 {
7260 enum aarch64_address_type type;
7261 rtx index;
7262 int shift;
7263
7264 /* (reg:P) */
7265 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7266 && GET_MODE (x) == Pmode)
7267 {
7268 type = ADDRESS_REG_REG;
7269 index = x;
7270 shift = 0;
7271 }
7272 /* (sign_extend:DI (reg:SI)) */
7273 else if ((GET_CODE (x) == SIGN_EXTEND
7274 || GET_CODE (x) == ZERO_EXTEND)
7275 && GET_MODE (x) == DImode
7276 && GET_MODE (XEXP (x, 0)) == SImode)
7277 {
7278 type = (GET_CODE (x) == SIGN_EXTEND)
7279 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7280 index = XEXP (x, 0);
7281 shift = 0;
7282 }
7283 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7284 else if (GET_CODE (x) == MULT
7285 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7286 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7287 && GET_MODE (XEXP (x, 0)) == DImode
7288 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7289 && CONST_INT_P (XEXP (x, 1)))
7290 {
7291 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7292 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7293 index = XEXP (XEXP (x, 0), 0);
7294 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7295 }
7296 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7297 else if (GET_CODE (x) == ASHIFT
7298 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7299 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7300 && GET_MODE (XEXP (x, 0)) == DImode
7301 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7302 && CONST_INT_P (XEXP (x, 1)))
7303 {
7304 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7305 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7306 index = XEXP (XEXP (x, 0), 0);
7307 shift = INTVAL (XEXP (x, 1));
7308 }
7309 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7310 else if ((GET_CODE (x) == SIGN_EXTRACT
7311 || GET_CODE (x) == ZERO_EXTRACT)
7312 && GET_MODE (x) == DImode
7313 && GET_CODE (XEXP (x, 0)) == MULT
7314 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7315 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7316 {
7317 type = (GET_CODE (x) == SIGN_EXTRACT)
7318 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7319 index = XEXP (XEXP (x, 0), 0);
7320 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7321 if (INTVAL (XEXP (x, 1)) != 32 + shift
7322 || INTVAL (XEXP (x, 2)) != 0)
7323 shift = -1;
7324 }
7325 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7326 (const_int 0xffffffff<<shift)) */
7327 else if (GET_CODE (x) == AND
7328 && GET_MODE (x) == DImode
7329 && GET_CODE (XEXP (x, 0)) == MULT
7330 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7331 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7332 && CONST_INT_P (XEXP (x, 1)))
7333 {
7334 type = ADDRESS_REG_UXTW;
7335 index = XEXP (XEXP (x, 0), 0);
7336 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7337 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7338 shift = -1;
7339 }
7340 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7341 else if ((GET_CODE (x) == SIGN_EXTRACT
7342 || GET_CODE (x) == ZERO_EXTRACT)
7343 && GET_MODE (x) == DImode
7344 && GET_CODE (XEXP (x, 0)) == ASHIFT
7345 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7346 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7347 {
7348 type = (GET_CODE (x) == SIGN_EXTRACT)
7349 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7350 index = XEXP (XEXP (x, 0), 0);
7351 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7352 if (INTVAL (XEXP (x, 1)) != 32 + shift
7353 || INTVAL (XEXP (x, 2)) != 0)
7354 shift = -1;
7355 }
7356 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7357 (const_int 0xffffffff<<shift)) */
7358 else if (GET_CODE (x) == AND
7359 && GET_MODE (x) == DImode
7360 && GET_CODE (XEXP (x, 0)) == ASHIFT
7361 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7362 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7363 && CONST_INT_P (XEXP (x, 1)))
7364 {
7365 type = ADDRESS_REG_UXTW;
7366 index = XEXP (XEXP (x, 0), 0);
7367 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7368 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7369 shift = -1;
7370 }
7371 /* (mult:P (reg:P) (const_int scale)) */
7372 else if (GET_CODE (x) == MULT
7373 && GET_MODE (x) == Pmode
7374 && GET_MODE (XEXP (x, 0)) == Pmode
7375 && CONST_INT_P (XEXP (x, 1)))
7376 {
7377 type = ADDRESS_REG_REG;
7378 index = XEXP (x, 0);
7379 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7380 }
7381 /* (ashift:P (reg:P) (const_int shift)) */
7382 else if (GET_CODE (x) == ASHIFT
7383 && GET_MODE (x) == Pmode
7384 && GET_MODE (XEXP (x, 0)) == Pmode
7385 && CONST_INT_P (XEXP (x, 1)))
7386 {
7387 type = ADDRESS_REG_REG;
7388 index = XEXP (x, 0);
7389 shift = INTVAL (XEXP (x, 1));
7390 }
7391 else
7392 return false;
7393
7394 if (!strict_p
7395 && GET_CODE (index) == SUBREG
7396 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7397 index = SUBREG_REG (index);
7398
7399 if (aarch64_sve_data_mode_p (mode))
7400 {
7401 if (type != ADDRESS_REG_REG
7402 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7403 return false;
7404 }
7405 else
7406 {
7407 if (shift != 0
7408 && !(IN_RANGE (shift, 1, 3)
7409 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7410 return false;
7411 }
7412
7413 if (REG_P (index)
7414 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7415 {
7416 info->type = type;
7417 info->offset = index;
7418 info->shift = shift;
7419 return true;
7420 }
7421
7422 return false;
7423 }
7424
7425 /* Return true if MODE is one of the modes for which we
7426 support LDP/STP operations. */
7427
7428 static bool
7429 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7430 {
7431 return mode == SImode || mode == DImode
7432 || mode == SFmode || mode == DFmode
7433 || (aarch64_vector_mode_supported_p (mode)
7434 && (known_eq (GET_MODE_SIZE (mode), 8)
7435 || (known_eq (GET_MODE_SIZE (mode), 16)
7436 && (aarch64_tune_params.extra_tuning_flags
7437 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7438 }
7439
7440 /* Return true if REGNO is a virtual pointer register, or an eliminable
7441 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7442 include stack_pointer or hard_frame_pointer. */
7443 static bool
7444 virt_or_elim_regno_p (unsigned regno)
7445 {
7446 return ((regno >= FIRST_VIRTUAL_REGISTER
7447 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7448 || regno == FRAME_POINTER_REGNUM
7449 || regno == ARG_POINTER_REGNUM);
7450 }
7451
7452 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7453 If it is, fill in INFO appropriately. STRICT_P is true if
7454 REG_OK_STRICT is in effect. */
7455
7456 bool
7457 aarch64_classify_address (struct aarch64_address_info *info,
7458 rtx x, machine_mode mode, bool strict_p,
7459 aarch64_addr_query_type type)
7460 {
7461 enum rtx_code code = GET_CODE (x);
7462 rtx op0, op1;
7463 poly_int64 offset;
7464
7465 HOST_WIDE_INT const_size;
7466
7467 /* On BE, we use load/store pair for all large int mode load/stores.
7468 TI/TFmode may also use a load/store pair. */
7469 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7470 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7471 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7472 || type == ADDR_QUERY_LDP_STP_N
7473 || mode == TImode
7474 || mode == TFmode
7475 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7476
7477 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
7478 size of the memory being loaded/stored, while the mode used to form the
7479 address is half of that. */
7480 if (type == ADDR_QUERY_LDP_STP_N
7481 && known_eq (GET_MODE_SIZE (mode), 16))
7482 mode = DFmode;
7483
7484 bool allow_reg_index_p = (!load_store_pair_p
7485 && (known_lt (GET_MODE_SIZE (mode), 16)
7486 || vec_flags == VEC_ADVSIMD
7487 || vec_flags & VEC_SVE_DATA));
7488
7489 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7490 [Rn, #offset, MUL VL]. */
7491 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7492 && (code != REG && code != PLUS))
7493 return false;
7494
7495 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7496 REG addressing. */
7497 if (advsimd_struct_p
7498 && !BYTES_BIG_ENDIAN
7499 && (code != POST_INC && code != REG))
7500 return false;
7501
7502 gcc_checking_assert (GET_MODE (x) == VOIDmode
7503 || SCALAR_INT_MODE_P (GET_MODE (x)));
7504
7505 switch (code)
7506 {
7507 case REG:
7508 case SUBREG:
7509 info->type = ADDRESS_REG_IMM;
7510 info->base = x;
7511 info->offset = const0_rtx;
7512 info->const_offset = 0;
7513 return aarch64_base_register_rtx_p (x, strict_p);
7514
7515 case PLUS:
7516 op0 = XEXP (x, 0);
7517 op1 = XEXP (x, 1);
7518
7519 if (! strict_p
7520 && REG_P (op0)
7521 && virt_or_elim_regno_p (REGNO (op0))
7522 && poly_int_rtx_p (op1, &offset))
7523 {
7524 info->type = ADDRESS_REG_IMM;
7525 info->base = op0;
7526 info->offset = op1;
7527 info->const_offset = offset;
7528
7529 return true;
7530 }
7531
7532 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7533 && aarch64_base_register_rtx_p (op0, strict_p)
7534 && poly_int_rtx_p (op1, &offset))
7535 {
7536 info->type = ADDRESS_REG_IMM;
7537 info->base = op0;
7538 info->offset = op1;
7539 info->const_offset = offset;
7540
7541 /* TImode and TFmode values are allowed in both pairs of X
7542 registers and individual Q registers. The available
7543 address modes are:
7544 X,X: 7-bit signed scaled offset
7545 Q: 9-bit signed offset
7546 We conservatively require an offset representable in either mode.
7547 When performing the check for pairs of X registers i.e. LDP/STP
7548 pass down DImode since that is the natural size of the LDP/STP
7549 instruction memory accesses. */
7550 if (mode == TImode || mode == TFmode)
7551 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7552 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7553 || offset_12bit_unsigned_scaled_p (mode, offset)));
7554
7555 /* A 7-bit offset check because OImode will emit an ldp/stp
7556 instruction (only big endian will get here).
7557 For ldp/stp instructions, the offset is scaled for the size of a
7558 single element of the pair. */
7559 if (mode == OImode)
7560 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7561
7562 /* A 7-bit pair offset check plus a 9/12-bit offset check because CImode
7563 will emit an ldp/stp plus an ldr/str (only big endian will get here). */
7564 if (mode == CImode)
7565 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7566 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7567 offset + 32)
7568 || offset_12bit_unsigned_scaled_p (V16QImode,
7569 offset + 32)));
7570
7571 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7572 instructions (only big endian will get here). */
7573 if (mode == XImode)
7574 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7575 && aarch64_offset_7bit_signed_scaled_p (TImode,
7576 offset + 32));
7577
7578 /* Make "m" use the LD1 offset range for SVE data modes, so
7579 that pre-RTL optimizers like ivopts will work to that range
7580 instead of the wider LDR/STR range. */
7581 if (vec_flags == VEC_SVE_DATA)
7582 return (type == ADDR_QUERY_M
7583 ? offset_4bit_signed_scaled_p (mode, offset)
7584 : offset_9bit_signed_scaled_p (mode, offset));
7585
7586 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7587 {
7588 poly_int64 end_offset = (offset
7589 + GET_MODE_SIZE (mode)
7590 - BYTES_PER_SVE_VECTOR);
7591 return (type == ADDR_QUERY_M
7592 ? offset_4bit_signed_scaled_p (mode, offset)
7593 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7594 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7595 end_offset)));
7596 }
7597
7598 if (vec_flags == VEC_SVE_PRED)
7599 return offset_9bit_signed_scaled_p (mode, offset);
7600
7601 if (load_store_pair_p)
7602 return ((known_eq (GET_MODE_SIZE (mode), 4)
7603 || known_eq (GET_MODE_SIZE (mode), 8)
7604 || known_eq (GET_MODE_SIZE (mode), 16))
7605 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7606 else
7607 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7608 || offset_12bit_unsigned_scaled_p (mode, offset));
7609 }
7610
7611 if (allow_reg_index_p)
7612 {
7613 /* Look for base + (scaled/extended) index register. */
7614 if (aarch64_base_register_rtx_p (op0, strict_p)
7615 && aarch64_classify_index (info, op1, mode, strict_p))
7616 {
7617 info->base = op0;
7618 return true;
7619 }
7620 if (aarch64_base_register_rtx_p (op1, strict_p)
7621 && aarch64_classify_index (info, op0, mode, strict_p))
7622 {
7623 info->base = op1;
7624 return true;
7625 }
7626 }
7627
7628 return false;
7629
7630 case POST_INC:
7631 case POST_DEC:
7632 case PRE_INC:
7633 case PRE_DEC:
7634 info->type = ADDRESS_REG_WB;
7635 info->base = XEXP (x, 0);
7636 info->offset = NULL_RTX;
7637 return aarch64_base_register_rtx_p (info->base, strict_p);
7638
7639 case POST_MODIFY:
7640 case PRE_MODIFY:
7641 info->type = ADDRESS_REG_WB;
7642 info->base = XEXP (x, 0);
7643 if (GET_CODE (XEXP (x, 1)) == PLUS
7644 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7645 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7646 && aarch64_base_register_rtx_p (info->base, strict_p))
7647 {
7648 info->offset = XEXP (XEXP (x, 1), 1);
7649 info->const_offset = offset;
7650
7651 /* TImode and TFmode values are allowed in both pairs of X
7652 registers and individual Q registers. The available
7653 address modes are:
7654 X,X: 7-bit signed scaled offset
7655 Q: 9-bit signed offset
7656 We conservatively require an offset representable in either mode.
7657 */
7658 if (mode == TImode || mode == TFmode)
7659 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7660 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7661
7662 if (load_store_pair_p)
7663 return ((known_eq (GET_MODE_SIZE (mode), 4)
7664 || known_eq (GET_MODE_SIZE (mode), 8)
7665 || known_eq (GET_MODE_SIZE (mode), 16))
7666 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7667 else
7668 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7669 }
7670 return false;
7671
7672 case CONST:
7673 case SYMBOL_REF:
7674 case LABEL_REF:
7675 /* load literal: pc-relative constant pool entry. Only supported
7676 for SI mode or larger. */
7677 info->type = ADDRESS_SYMBOLIC;
7678
7679 if (!load_store_pair_p
7680 && GET_MODE_SIZE (mode).is_constant (&const_size)
7681 && const_size >= 4)
7682 {
7683 rtx sym, addend;
7684
7685 split_const (x, &sym, &addend);
7686 return ((GET_CODE (sym) == LABEL_REF
7687 || (GET_CODE (sym) == SYMBOL_REF
7688 && CONSTANT_POOL_ADDRESS_P (sym)
7689 && aarch64_pcrelative_literal_loads)));
7690 }
7691 return false;
7692
7693 case LO_SUM:
7694 info->type = ADDRESS_LO_SUM;
7695 info->base = XEXP (x, 0);
7696 info->offset = XEXP (x, 1);
7697 if (allow_reg_index_p
7698 && aarch64_base_register_rtx_p (info->base, strict_p))
7699 {
7700 rtx sym, offs;
7701 split_const (info->offset, &sym, &offs);
7702 if (GET_CODE (sym) == SYMBOL_REF
7703 && (aarch64_classify_symbol (sym, INTVAL (offs))
7704 == SYMBOL_SMALL_ABSOLUTE))
7705 {
7706 /* The symbol and offset must be aligned to the access size. */
7707 unsigned int align;
7708
7709 if (CONSTANT_POOL_ADDRESS_P (sym))
7710 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7711 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7712 {
7713 tree exp = SYMBOL_REF_DECL (sym);
7714 align = TYPE_ALIGN (TREE_TYPE (exp));
7715 align = aarch64_constant_alignment (exp, align);
7716 }
7717 else if (SYMBOL_REF_DECL (sym))
7718 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7719 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7720 && SYMBOL_REF_BLOCK (sym) != NULL)
7721 align = SYMBOL_REF_BLOCK (sym)->alignment;
7722 else
7723 align = BITS_PER_UNIT;
7724
7725 poly_int64 ref_size = GET_MODE_SIZE (mode);
7726 if (known_eq (ref_size, 0))
7727 ref_size = GET_MODE_SIZE (DImode);
7728
7729 return (multiple_p (INTVAL (offs), ref_size)
7730 && multiple_p (align / BITS_PER_UNIT, ref_size));
7731 }
7732 }
7733 return false;
7734
7735 default:
7736 return false;
7737 }
7738 }
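/* Examples of address forms recognized above, for a DImode access:
   [x0] and [x0, #16] classify as ADDRESS_REG_IMM;
   [x0, x1, lsl #3] and [x0, w1, sxtw #3] as ADDRESS_REG_REG/ADDRESS_REG_SXTW;
   [x0], #16 and [x0, #16]! as ADDRESS_REG_WB;
   and (lo_sum x0 sym) as ADDRESS_LO_SUM.  */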
7739
7740 /* Return true if the address X is valid for a PRFM instruction.
7741 STRICT_P is true if we should do strict checking with
7742 aarch64_classify_address. */
7743
7744 bool
7745 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7746 {
7747 struct aarch64_address_info addr;
7748
7749 /* PRFM accepts the same addresses as DImode... */
7750 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7751 if (!res)
7752 return false;
7753
7754 /* ... except writeback forms. */
7755 return addr.type != ADDRESS_REG_WB;
7756 }
7757
7758 bool
7759 aarch64_symbolic_address_p (rtx x)
7760 {
7761 rtx offset;
7762
7763 split_const (x, &x, &offset);
7764 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7765 }
7766
7767 /* Classify the base of symbolic expression X. */
7768
7769 enum aarch64_symbol_type
7770 aarch64_classify_symbolic_expression (rtx x)
7771 {
7772 rtx offset;
7773
7774 split_const (x, &x, &offset);
7775 return aarch64_classify_symbol (x, INTVAL (offset));
7776 }
7777
7778
7779 /* Return TRUE if X is a legitimate address for accessing memory in
7780 mode MODE. */
7781 static bool
7782 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7783 {
7784 struct aarch64_address_info addr;
7785
7786 return aarch64_classify_address (&addr, x, mode, strict_p);
7787 }
7788
7789 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7790 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7791 bool
7792 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7793 aarch64_addr_query_type type)
7794 {
7795 struct aarch64_address_info addr;
7796
7797 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7798 }
7799
7800 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7801
7802 static bool
7803 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7804 poly_int64 orig_offset,
7805 machine_mode mode)
7806 {
7807 HOST_WIDE_INT size;
7808 if (GET_MODE_SIZE (mode).is_constant (&size))
7809 {
7810 HOST_WIDE_INT const_offset, second_offset;
7811
7812 /* A general SVE offset is A * VQ + B. Remove the A component from
7813 coefficient 0 in order to get the constant B. */
7814 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7815
7816 /* Split an out-of-range address displacement into a base and
7817 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7818 range otherwise to increase opportunities for sharing the base
7819 address of different sizes. Unaligned accesses use the signed
7820 9-bit range, TImode/TFmode use the intersection of signed
7821 scaled 7-bit and signed 9-bit offset. */
7822 if (mode == TImode || mode == TFmode)
7823 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7824 else if ((const_offset & (size - 1)) != 0)
7825 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7826 else
7827 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7828
7829 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7830 return false;
7831
7832 /* Split the offset into second_offset and the rest. */
7833 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7834 *offset2 = gen_int_mode (second_offset, Pmode);
7835 return true;
7836 }
7837 else
7838 {
7839 /* Get the mode we should use as the basis of the range. For structure
7840 modes this is the mode of one vector. */
7841 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7842 machine_mode step_mode
7843 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7844
7845 /* Get the "mul vl" multiplier we'd like to use. */
7846 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7847 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7848 if (vec_flags & VEC_SVE_DATA)
7849 /* LDR supports a 9-bit range, but the move patterns for
7850 structure modes require all vectors to be in range of the
7851 same base. The simplest way of accommodating that while still
7852 promoting reuse of anchor points between different modes is
7853 to use an 8-bit range unconditionally. */
7854 vnum = ((vnum + 128) & 255) - 128;
7855 else
7856 /* Predicates are only handled singly, so we might as well use
7857 the full range. */
7858 vnum = ((vnum + 256) & 511) - 256;
7859 if (vnum == 0)
7860 return false;
7861
7862 /* Convert the "mul vl" multiplier into a byte offset. */
7863 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7864 if (known_eq (second_offset, orig_offset))
7865 return false;
7866
7867 /* Split the offset into second_offset and the rest. */
7868 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7869 *offset2 = gen_int_mode (second_offset, Pmode);
7870 return true;
7871 }
7872 }
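/* A rough worked example for the constant-size path: a DImode access at
   offset 0x4010 splits into an anchor adjustment of 0x4000 (*offset1) and a
   residual offset of 0x10 (*offset2), which fits the scaled 12-bit
   addressing range.  */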
7873
7874 /* Return the binary representation of floating point constant VALUE in INTVAL.
7875 If the value cannot be converted, return false without setting INTVAL.
7876 The conversion is done in the mode of VALUE. */
7877 bool
7878 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7879 {
7880
7881 /* We make a general exception for 0. */
7882 if (aarch64_float_const_zero_rtx_p (value))
7883 {
7884 *intval = 0;
7885 return true;
7886 }
7887
7888 scalar_float_mode mode;
7889 if (GET_CODE (value) != CONST_DOUBLE
7890 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7891 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7892 /* Only support up to DF mode. */
7893 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7894 return false;
7895
7896 unsigned HOST_WIDE_INT ival = 0;
7897
7898 long res[2];
7899 real_to_target (res,
7900 CONST_DOUBLE_REAL_VALUE (value),
7901 REAL_MODE_FORMAT (mode));
7902
7903 if (mode == DFmode)
7904 {
7905 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7906 ival = zext_hwi (res[order], 32);
7907 ival |= (zext_hwi (res[1 - order], 32) << 32);
7908 }
7909 else
7910 ival = zext_hwi (res[0], 32);
7911
7912 *intval = ival;
7913 return true;
7914 }
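/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000.  */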
7915
7916 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7917 single MOV(+MOVK) followed by an FMOV. */
7918 bool
7919 aarch64_float_const_rtx_p (rtx x)
7920 {
7921 machine_mode mode = GET_MODE (x);
7922 if (mode == VOIDmode)
7923 return false;
7924
7925 /* Determine whether it's cheaper to materialize float constants as
7926 mov/movk pairs than to load them with an adrp/ldr pair. */
7927 unsigned HOST_WIDE_INT ival;
7928
7929 if (GET_CODE (x) == CONST_DOUBLE
7930 && SCALAR_FLOAT_MODE_P (mode)
7931 && aarch64_reinterpret_float_as_int (x, &ival))
7932 {
7933 scalar_int_mode imode = (mode == HFmode
7934 ? SImode
7935 : int_mode_for_mode (mode).require ());
7936 int num_instr = aarch64_internal_mov_immediate
7937 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7938 return num_instr < 3;
7939 }
7940
7941 return false;
7942 }
7943
7944 /* Return TRUE if rtx X is the immediate constant 0.0. */
7945 bool
7946 aarch64_float_const_zero_rtx_p (rtx x)
7947 {
7948 if (GET_MODE (x) == VOIDmode)
7949 return false;
7950
7951 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7952 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7953 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7954 }
7955
7956 /* Return TRUE if rtx X is an immediate constant that fits in a single
7957 MOVI immediate operation. */
7958 bool
7959 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7960 {
7961 if (!TARGET_SIMD)
7962 return false;
7963
7964 machine_mode vmode;
7965 scalar_int_mode imode;
7966 unsigned HOST_WIDE_INT ival;
7967
7968 if (GET_CODE (x) == CONST_DOUBLE
7969 && SCALAR_FLOAT_MODE_P (mode))
7970 {
7971 if (!aarch64_reinterpret_float_as_int (x, &ival))
7972 return false;
7973
7974 /* We make a general exception for 0. */
7975 if (aarch64_float_const_zero_rtx_p (x))
7976 return true;
7977
7978 imode = int_mode_for_mode (mode).require ();
7979 }
7980 else if (GET_CODE (x) == CONST_INT
7981 && is_a <scalar_int_mode> (mode, &imode))
7982 ival = INTVAL (x);
7983 else
7984 return false;
7985
7986 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
7987 a 128-bit vector mode. */
7988 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7989
7990 vmode = aarch64_simd_container_mode (imode, width);
7991 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7992
7993 return aarch64_simd_valid_immediate (v_op, NULL);
7994 }
7995
7996
7997 /* Return the fixed registers used for condition codes. */
7998
7999 static bool
8000 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8001 {
8002 *p1 = CC_REGNUM;
8003 *p2 = INVALID_REGNUM;
8004 return true;
8005 }
8006
8007 /* This function is used by the call expanders of the machine description.
8008 RESULT is the register in which the result is returned. It's NULL for
8009 "call" and "sibcall".
8010 MEM is the location of the function call.
8011 SIBCALL indicates whether this function call is a normal call or a sibling call.
8012 It will generate a different pattern accordingly. */
8013
8014 void
8015 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
8016 {
8017 rtx call, callee, tmp;
8018 rtvec vec;
8019 machine_mode mode;
8020
8021 gcc_assert (MEM_P (mem));
8022 callee = XEXP (mem, 0);
8023 mode = GET_MODE (callee);
8024 gcc_assert (mode == Pmode);
8025
8026 /* Decide if we should generate indirect calls by loading the
8027 address of the callee into a register before performing
8028 the branch-and-link. */
8029 if (SYMBOL_REF_P (callee)
8030 ? (aarch64_is_long_call_p (callee)
8031 || aarch64_is_noplt_call_p (callee))
8032 : !REG_P (callee))
8033 XEXP (mem, 0) = force_reg (mode, callee);
8034
8035 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8036
8037 if (result != NULL_RTX)
8038 call = gen_rtx_SET (result, call);
8039
8040 if (sibcall)
8041 tmp = ret_rtx;
8042 else
8043 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8044
8045 vec = gen_rtvec (2, call, tmp);
8046 call = gen_rtx_PARALLEL (VOIDmode, vec);
8047
8048 aarch64_emit_call_insn (call);
8049 }
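/* The emitted pattern is therefore of the form
   (parallel [(call (mem ...) (const_int 0)) (clobber (reg LR))])
   for a normal call, with the clobber replaced by (return) for a sibcall,
   and with the call wrapped in a (set result ...) when a value is
   returned.  */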
8050
8051 /* Emit call insn with PAT and do aarch64-specific handling. */
8052
8053 void
8054 aarch64_emit_call_insn (rtx pat)
8055 {
8056 rtx insn = emit_call_insn (pat);
8057
8058 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8059 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8060 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8061 }
8062
8063 machine_mode
8064 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8065 {
8066 machine_mode mode_x = GET_MODE (x);
8067 rtx_code code_x = GET_CODE (x);
8068
8069 /* All floating point compares return CCFP if it is an equality
8070 comparison, and CCFPE otherwise. */
8071 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8072 {
8073 switch (code)
8074 {
8075 case EQ:
8076 case NE:
8077 case UNORDERED:
8078 case ORDERED:
8079 case UNLT:
8080 case UNLE:
8081 case UNGT:
8082 case UNGE:
8083 case UNEQ:
8084 return CCFPmode;
8085
8086 case LT:
8087 case LE:
8088 case GT:
8089 case GE:
8090 case LTGT:
8091 return CCFPEmode;
8092
8093 default:
8094 gcc_unreachable ();
8095 }
8096 }
8097
8098 /* Equality comparisons of short modes against zero can be performed
8099 using the TST instruction with the appropriate bitmask. */
8100 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8101 && (code == EQ || code == NE)
8102 && (mode_x == HImode || mode_x == QImode))
8103 return CC_NZmode;
8104
8105 /* Similarly, comparisons of zero_extends from shorter modes can
8106 be performed using an ANDS with an immediate mask. */
8107 if (y == const0_rtx && code_x == ZERO_EXTEND
8108 && (mode_x == SImode || mode_x == DImode)
8109 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8110 && (code == EQ || code == NE))
8111 return CC_NZmode;
8112
8113 if ((mode_x == SImode || mode_x == DImode)
8114 && y == const0_rtx
8115 && (code == EQ || code == NE || code == LT || code == GE)
8116 && (code_x == PLUS || code_x == MINUS || code_x == AND
8117 || code_x == NEG
8118 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8119 && CONST_INT_P (XEXP (x, 2)))))
8120 return CC_NZmode;
8121
8122 /* A compare with a shifted operand. Because of canonicalization,
8123 the comparison will have to be swapped when we emit the assembly
8124 code. */
8125 if ((mode_x == SImode || mode_x == DImode)
8126 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8127 && (code_x == ASHIFT || code_x == ASHIFTRT
8128 || code_x == LSHIFTRT
8129 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8130 return CC_SWPmode;
8131
8132 /* Similarly for a negated operand, but we can only do this for
8133 equalities. */
8134 if ((mode_x == SImode || mode_x == DImode)
8135 && (REG_P (y) || GET_CODE (y) == SUBREG)
8136 && (code == EQ || code == NE)
8137 && code_x == NEG)
8138 return CC_Zmode;
8139
8140 /* A test for unsigned overflow from an addition. */
8141 if ((mode_x == DImode || mode_x == TImode)
8142 && (code == LTU || code == GEU)
8143 && code_x == PLUS
8144 && rtx_equal_p (XEXP (x, 0), y))
8145 return CC_Cmode;
8146
8147 /* A test for unsigned overflow from an add with carry. */
8148 if ((mode_x == DImode || mode_x == TImode)
8149 && (code == LTU || code == GEU)
8150 && code_x == PLUS
8151 && CONST_SCALAR_INT_P (y)
8152 && (rtx_mode_t (y, mode_x)
8153 == (wi::shwi (1, mode_x)
8154 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8155 return CC_ADCmode;
8156
8157 /* A test for signed overflow. */
8158 if ((mode_x == DImode || mode_x == TImode)
8159 && code == NE
8160 && code_x == PLUS
8161 && GET_CODE (y) == SIGN_EXTEND)
8162 return CC_Vmode;
8163
8164 /* For everything else, return CCmode. */
8165 return CCmode;
8166 }
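/* For example, comparing (ashift:DI x (const_int 3)) with a register selects
   CC_SWPmode, while an equality comparison of a QImode register with zero
   selects CC_NZmode.  */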
8167
8168 static int
8169 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8170
8171 int
8172 aarch64_get_condition_code (rtx x)
8173 {
8174 machine_mode mode = GET_MODE (XEXP (x, 0));
8175 enum rtx_code comp_code = GET_CODE (x);
8176
8177 if (GET_MODE_CLASS (mode) != MODE_CC)
8178 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8179 return aarch64_get_condition_code_1 (mode, comp_code);
8180 }
8181
8182 static int
8183 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8184 {
8185 switch (mode)
8186 {
8187 case E_CCFPmode:
8188 case E_CCFPEmode:
8189 switch (comp_code)
8190 {
8191 case GE: return AARCH64_GE;
8192 case GT: return AARCH64_GT;
8193 case LE: return AARCH64_LS;
8194 case LT: return AARCH64_MI;
8195 case NE: return AARCH64_NE;
8196 case EQ: return AARCH64_EQ;
8197 case ORDERED: return AARCH64_VC;
8198 case UNORDERED: return AARCH64_VS;
8199 case UNLT: return AARCH64_LT;
8200 case UNLE: return AARCH64_LE;
8201 case UNGT: return AARCH64_HI;
8202 case UNGE: return AARCH64_PL;
8203 default: return -1;
8204 }
8205 break;
8206
8207 case E_CCmode:
8208 switch (comp_code)
8209 {
8210 case NE: return AARCH64_NE;
8211 case EQ: return AARCH64_EQ;
8212 case GE: return AARCH64_GE;
8213 case GT: return AARCH64_GT;
8214 case LE: return AARCH64_LE;
8215 case LT: return AARCH64_LT;
8216 case GEU: return AARCH64_CS;
8217 case GTU: return AARCH64_HI;
8218 case LEU: return AARCH64_LS;
8219 case LTU: return AARCH64_CC;
8220 default: return -1;
8221 }
8222 break;
8223
8224 case E_CC_SWPmode:
8225 switch (comp_code)
8226 {
8227 case NE: return AARCH64_NE;
8228 case EQ: return AARCH64_EQ;
8229 case GE: return AARCH64_LE;
8230 case GT: return AARCH64_LT;
8231 case LE: return AARCH64_GE;
8232 case LT: return AARCH64_GT;
8233 case GEU: return AARCH64_LS;
8234 case GTU: return AARCH64_CC;
8235 case LEU: return AARCH64_CS;
8236 case LTU: return AARCH64_HI;
8237 default: return -1;
8238 }
8239 break;
8240
8241 case E_CC_NZCmode:
8242 switch (comp_code)
8243 {
8244 case NE: return AARCH64_NE; /* = any */
8245 case EQ: return AARCH64_EQ; /* = none */
8246 case GE: return AARCH64_PL; /* = nfrst */
8247 case LT: return AARCH64_MI; /* = first */
8248 case GEU: return AARCH64_CS; /* = nlast */
8249 case GTU: return AARCH64_HI; /* = pmore */
8250 case LEU: return AARCH64_LS; /* = plast */
8251 case LTU: return AARCH64_CC; /* = last */
8252 default: return -1;
8253 }
8254 break;
8255
8256 case E_CC_NZmode:
8257 switch (comp_code)
8258 {
8259 case NE: return AARCH64_NE;
8260 case EQ: return AARCH64_EQ;
8261 case GE: return AARCH64_PL;
8262 case LT: return AARCH64_MI;
8263 default: return -1;
8264 }
8265 break;
8266
8267 case E_CC_Zmode:
8268 switch (comp_code)
8269 {
8270 case NE: return AARCH64_NE;
8271 case EQ: return AARCH64_EQ;
8272 default: return -1;
8273 }
8274 break;
8275
8276 case E_CC_Cmode:
8277 switch (comp_code)
8278 {
8279 case LTU: return AARCH64_CS;
8280 case GEU: return AARCH64_CC;
8281 default: return -1;
8282 }
8283 break;
8284
8285 case E_CC_ADCmode:
8286 switch (comp_code)
8287 {
8288 case GEU: return AARCH64_CS;
8289 case LTU: return AARCH64_CC;
8290 default: return -1;
8291 }
8292 break;
8293
8294 case E_CC_Vmode:
8295 switch (comp_code)
8296 {
8297 case NE: return AARCH64_VS;
8298 case EQ: return AARCH64_VC;
8299 default: return -1;
8300 }
8301 break;
8302
8303 default:
8304 return -1;
8305 }
8306
8307 return -1;
8308 }
8309
8310 bool
8311 aarch64_const_vec_all_same_in_range_p (rtx x,
8312 HOST_WIDE_INT minval,
8313 HOST_WIDE_INT maxval)
8314 {
8315 rtx elt;
8316 return (const_vec_duplicate_p (x, &elt)
8317 && CONST_INT_P (elt)
8318 && IN_RANGE (INTVAL (elt), minval, maxval));
8319 }
8320
8321 bool
8322 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8323 {
8324 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8325 }
8326
8327 /* Return true if VEC is a constant in which every element is in the range
8328 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8329
8330 static bool
8331 aarch64_const_vec_all_in_range_p (rtx vec,
8332 HOST_WIDE_INT minval,
8333 HOST_WIDE_INT maxval)
8334 {
8335 if (GET_CODE (vec) != CONST_VECTOR
8336 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8337 return false;
8338
8339 int nunits;
8340 if (!CONST_VECTOR_STEPPED_P (vec))
8341 nunits = const_vector_encoded_nelts (vec);
8342 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8343 return false;
8344
8345 for (int i = 0; i < nunits; i++)
8346 {
8347 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8348 if (!CONST_INT_P (vec_elem)
8349 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8350 return false;
8351 }
8352 return true;
8353 }
8354
8355 /* N Z C V. */
8356 #define AARCH64_CC_V 1
8357 #define AARCH64_CC_C (1 << 1)
8358 #define AARCH64_CC_Z (1 << 2)
8359 #define AARCH64_CC_N (1 << 3)
8360
8361 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8362 static const int aarch64_nzcv_codes[] =
8363 {
8364 0, /* EQ, Z == 1. */
8365 AARCH64_CC_Z, /* NE, Z == 0. */
8366 0, /* CS, C == 1. */
8367 AARCH64_CC_C, /* CC, C == 0. */
8368 0, /* MI, N == 1. */
8369 AARCH64_CC_N, /* PL, N == 0. */
8370 0, /* VS, V == 1. */
8371 AARCH64_CC_V, /* VC, V == 0. */
8372 0, /* HI, C == 1 && Z == 0. */
8373 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8374 AARCH64_CC_V, /* GE, N == V. */
8375 0, /* LT, N != V. */
8376 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8377 0, /* LE, !(Z == 0 && N == V). */
8378 0, /* AL, Any. */
8379 0 /* NV, Any. */
8380 };
8381
8382 /* Print floating-point vector immediate operand X to F, negating it
8383 first if NEGATE is true. Return true on success, false if it isn't
8384 a constant we can handle. */
8385
8386 static bool
8387 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8388 {
8389 rtx elt;
8390
8391 if (!const_vec_duplicate_p (x, &elt))
8392 return false;
8393
8394 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8395 if (negate)
8396 r = real_value_negate (&r);
8397
8398 /* Handle the SVE single-bit immediates specially, since they have a
8399 fixed form in the assembly syntax. */
8400 if (real_equal (&r, &dconst0))
8401 asm_fprintf (f, "0.0");
8402 else if (real_equal (&r, &dconst2))
8403 asm_fprintf (f, "2.0");
8404 else if (real_equal (&r, &dconst1))
8405 asm_fprintf (f, "1.0");
8406 else if (real_equal (&r, &dconsthalf))
8407 asm_fprintf (f, "0.5");
8408 else
8409 {
8410 const int buf_size = 20;
8411 char float_buf[buf_size] = {'\0'};
8412 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8413 1, GET_MODE (elt));
8414 asm_fprintf (f, "%s", float_buf);
8415 }
8416
8417 return true;
8418 }
8419
8420 /* Return the equivalent letter for size. */
8421 static char
8422 sizetochar (int size)
8423 {
8424 switch (size)
8425 {
8426 case 64: return 'd';
8427 case 32: return 's';
8428 case 16: return 'h';
8429 case 8 : return 'b';
8430 default: gcc_unreachable ();
8431 }
8432 }
8433
8434 /* Print operand X to file F in a target specific manner according to CODE.
8435 The acceptable formatting commands given by CODE are:
8436 'c': An integer or symbol address without a preceding #
8437 sign.
8438 'C': Take the duplicated element in a vector constant
8439 and print it in hex.
8440 'D': Take the duplicated element in a vector constant
8441 and print it as an unsigned integer, in decimal.
8442 'e': Print the sign/zero-extend size as a character 8->b,
8443 16->h, 32->w. Can also be used for masks:
8444 0xff->b, 0xffff->h, 0xffffffff->w.
8445 'I': If the operand is a duplicated vector constant,
8446 replace it with the duplicated scalar. If the
8447 operand is then a floating-point constant, replace
8448 it with the integer bit representation. Print the
8449 transformed constant as a signed decimal number.
8450 'p': Prints N such that 2^N == X (X must be power of 2 and
8451 const int).
8452 'P': Print the number of non-zero bits in X (a const_int).
8453 'H': Print the higher numbered register of a pair (TImode)
8454 of regs.
8455 'm': Print a condition (eq, ne, etc).
8456 'M': Same as 'm', but invert condition.
8457 'N': Take the duplicated element in a vector constant
8458 and print the negative of it in decimal.
8459 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8460 'S/T/U/V': Print a FP/SIMD register name for a register list.
8461 The register printed is the FP/SIMD register name
8462 of X + 0/1/2/3 for S/T/U/V.
8463 'R': Print a scalar Integer/FP/SIMD register name + 1.
8464 'X': Print bottom 16 bits of integer constant in hex.
8465 'w/x': Print a general register name or the zero register
8466 (32-bit or 64-bit).
8467 '0': Print a normal operand, if it's a general register,
8468 then we assume DImode.
8469 'k': Print NZCV for conditional compare instructions.
8470 'A': Output address constant representing the first
8471 argument of X, specifying a relocation offset
8472 if appropriate.
8473 'L': Output constant address specified by X
8474 with a relocation offset if appropriate.
8475 'G': Prints address of X, specifying a PC relative
8476 relocation mode if appropriate.
8477 'y': Output address of LDP or STP - this is used for
8478 some LDP/STPs which don't use a PARALLEL in their
8479 pattern (so the mode needs to be adjusted).
8480 'z': Output address of a typical LDP or STP. */
8481
8482 static void
8483 aarch64_print_operand (FILE *f, rtx x, int code)
8484 {
8485 rtx elt;
8486 switch (code)
8487 {
8488 case 'c':
8489 switch (GET_CODE (x))
8490 {
8491 case CONST_INT:
8492 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8493 break;
8494
8495 case SYMBOL_REF:
8496 output_addr_const (f, x);
8497 break;
8498
8499 case CONST:
8500 if (GET_CODE (XEXP (x, 0)) == PLUS
8501 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8502 {
8503 output_addr_const (f, x);
8504 break;
8505 }
8506 /* Fall through. */
8507
8508 default:
8509 output_operand_lossage ("unsupported operand for code '%c'", code);
8510 }
8511 break;
8512
8513 case 'e':
8514 {
8515 x = unwrap_const_vec_duplicate (x);
8516 if (!CONST_INT_P (x))
8517 {
8518 output_operand_lossage ("invalid operand for '%%%c'", code);
8519 return;
8520 }
8521
8522 HOST_WIDE_INT val = INTVAL (x);
8523 if ((val & ~7) == 8 || val == 0xff)
8524 fputc ('b', f);
8525 else if ((val & ~7) == 16 || val == 0xffff)
8526 fputc ('h', f);
8527 else if ((val & ~7) == 32 || val == 0xffffffff)
8528 fputc ('w', f);
8529 else
8530 {
8531 output_operand_lossage ("invalid operand for '%%%c'", code);
8532 return;
8533 }
8534 }
8535 break;
8536
8537 case 'p':
8538 {
8539 int n;
8540
8541 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8542 {
8543 output_operand_lossage ("invalid operand for '%%%c'", code);
8544 return;
8545 }
8546
8547 asm_fprintf (f, "%d", n);
8548 }
8549 break;
8550
8551 case 'P':
8552 if (!CONST_INT_P (x))
8553 {
8554 output_operand_lossage ("invalid operand for '%%%c'", code);
8555 return;
8556 }
8557
8558 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8559 break;
8560
8561 case 'H':
8562 if (x == const0_rtx)
8563 {
8564 asm_fprintf (f, "xzr");
8565 break;
8566 }
8567
8568 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8569 {
8570 output_operand_lossage ("invalid operand for '%%%c'", code);
8571 return;
8572 }
8573
8574 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8575 break;
8576
8577 case 'I':
8578 {
8579 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8580 if (CONST_INT_P (x))
8581 asm_fprintf (f, "%wd", INTVAL (x));
8582 else
8583 {
8584 output_operand_lossage ("invalid operand for '%%%c'", code);
8585 return;
8586 }
8587 break;
8588 }
8589
8590 case 'M':
8591 case 'm':
8592 {
8593 int cond_code;
8594 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8595 if (x == const_true_rtx)
8596 {
8597 if (code == 'M')
8598 fputs ("nv", f);
8599 return;
8600 }
8601
8602 if (!COMPARISON_P (x))
8603 {
8604 output_operand_lossage ("invalid operand for '%%%c'", code);
8605 return;
8606 }
8607
8608 cond_code = aarch64_get_condition_code (x);
8609 gcc_assert (cond_code >= 0);
8610 if (code == 'M')
8611 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8612 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8613 fputs (aarch64_sve_condition_codes[cond_code], f);
8614 else
8615 fputs (aarch64_condition_codes[cond_code], f);
8616 }
8617 break;
8618
8619 case 'N':
8620 if (!const_vec_duplicate_p (x, &elt))
8621 {
8622 output_operand_lossage ("invalid vector constant");
8623 return;
8624 }
8625
8626 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8627 asm_fprintf (f, "%wd", -INTVAL (elt));
8628 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8629 && aarch64_print_vector_float_operand (f, x, true))
8630 ;
8631 else
8632 {
8633 output_operand_lossage ("invalid vector constant");
8634 return;
8635 }
8636 break;
8637
8638 case 'b':
8639 case 'h':
8640 case 's':
8641 case 'd':
8642 case 'q':
8643 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8644 {
8645 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8646 return;
8647 }
8648 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8649 break;
8650
8651 case 'S':
8652 case 'T':
8653 case 'U':
8654 case 'V':
8655 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8656 {
8657 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8658 return;
8659 }
8660 asm_fprintf (f, "%c%d",
8661 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8662 REGNO (x) - V0_REGNUM + (code - 'S'));
8663 break;
8664
8665 case 'R':
8666 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
8667 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8668 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8669 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
8670 else
8671 output_operand_lossage ("incompatible register operand for '%%%c'",
8672 code);
8673 break;
8674
8675 case 'X':
8676 if (!CONST_INT_P (x))
8677 {
8678 output_operand_lossage ("invalid operand for '%%%c'", code);
8679 return;
8680 }
8681 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8682 break;
8683
8684 case 'C':
8685 {
8686 /* Print a replicated constant in hex. */
8687 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8688 {
8689 output_operand_lossage ("invalid operand for '%%%c'", code);
8690 return;
8691 }
8692 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8693 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8694 }
8695 break;
8696
8697 case 'D':
8698 {
8699 /* Print a replicated constant in decimal, treating it as
8700 unsigned. */
8701 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8702 {
8703 output_operand_lossage ("invalid operand for '%%%c'", code);
8704 return;
8705 }
8706 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8707 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8708 }
8709 break;
8710
8711 case 'w':
8712 case 'x':
8713 if (x == const0_rtx
8714 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8715 {
8716 asm_fprintf (f, "%czr", code);
8717 break;
8718 }
8719
8720 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8721 {
8722 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8723 break;
8724 }
8725
8726 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8727 {
8728 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8729 break;
8730 }
8731
8732 /* Fall through */
8733
8734 case 0:
8735 if (x == NULL)
8736 {
8737 output_operand_lossage ("missing operand");
8738 return;
8739 }
8740
8741 switch (GET_CODE (x))
8742 {
8743 case REG:
8744 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8745 {
8746 if (REG_NREGS (x) == 1)
8747 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8748 else
8749 {
8750 char suffix
8751 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8752 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8753 REGNO (x) - V0_REGNUM, suffix,
8754 END_REGNO (x) - V0_REGNUM - 1, suffix);
8755 }
8756 }
8757 else
8758 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8759 break;
8760
8761 case MEM:
8762 output_address (GET_MODE (x), XEXP (x, 0));
8763 break;
8764
8765 case LABEL_REF:
8766 case SYMBOL_REF:
8767 output_addr_const (asm_out_file, x);
8768 break;
8769
8770 case CONST_INT:
8771 asm_fprintf (f, "%wd", INTVAL (x));
8772 break;
8773
8774 case CONST:
8775 if (!VECTOR_MODE_P (GET_MODE (x)))
8776 {
8777 output_addr_const (asm_out_file, x);
8778 break;
8779 }
8780 /* fall through */
8781
8782 case CONST_VECTOR:
8783 if (!const_vec_duplicate_p (x, &elt))
8784 {
8785 output_operand_lossage ("invalid vector constant");
8786 return;
8787 }
8788
8789 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8790 asm_fprintf (f, "%wd", INTVAL (elt));
8791 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8792 && aarch64_print_vector_float_operand (f, x, false))
8793 ;
8794 else
8795 {
8796 output_operand_lossage ("invalid vector constant");
8797 return;
8798 }
8799 break;
8800
8801 case CONST_DOUBLE:
8802 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8803 be getting CONST_DOUBLEs holding integers. */
8804 gcc_assert (GET_MODE (x) != VOIDmode);
8805 if (aarch64_float_const_zero_rtx_p (x))
8806 {
8807 fputc ('0', f);
8808 break;
8809 }
8810 else if (aarch64_float_const_representable_p (x))
8811 {
8812 #define buf_size 20
8813 char float_buf[buf_size] = {'\0'};
8814 real_to_decimal_for_mode (float_buf,
8815 CONST_DOUBLE_REAL_VALUE (x),
8816 buf_size, buf_size,
8817 1, GET_MODE (x));
8818 asm_fprintf (asm_out_file, "%s", float_buf);
8819 break;
8820 #undef buf_size
8821 }
8822 output_operand_lossage ("invalid constant");
8823 return;
8824 default:
8825 output_operand_lossage ("invalid operand");
8826 return;
8827 }
8828 break;
8829
8830 case 'A':
8831 if (GET_CODE (x) == HIGH)
8832 x = XEXP (x, 0);
8833
8834 switch (aarch64_classify_symbolic_expression (x))
8835 {
8836 case SYMBOL_SMALL_GOT_4G:
8837 asm_fprintf (asm_out_file, ":got:");
8838 break;
8839
8840 case SYMBOL_SMALL_TLSGD:
8841 asm_fprintf (asm_out_file, ":tlsgd:");
8842 break;
8843
8844 case SYMBOL_SMALL_TLSDESC:
8845 asm_fprintf (asm_out_file, ":tlsdesc:");
8846 break;
8847
8848 case SYMBOL_SMALL_TLSIE:
8849 asm_fprintf (asm_out_file, ":gottprel:");
8850 break;
8851
8852 case SYMBOL_TLSLE24:
8853 asm_fprintf (asm_out_file, ":tprel:");
8854 break;
8855
8856 case SYMBOL_TINY_GOT:
8857 gcc_unreachable ();
8858 break;
8859
8860 default:
8861 break;
8862 }
8863 output_addr_const (asm_out_file, x);
8864 break;
8865
8866 case 'L':
8867 switch (aarch64_classify_symbolic_expression (x))
8868 {
8869 case SYMBOL_SMALL_GOT_4G:
8870 asm_fprintf (asm_out_file, ":lo12:");
8871 break;
8872
8873 case SYMBOL_SMALL_TLSGD:
8874 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8875 break;
8876
8877 case SYMBOL_SMALL_TLSDESC:
8878 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8879 break;
8880
8881 case SYMBOL_SMALL_TLSIE:
8882 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8883 break;
8884
8885 case SYMBOL_TLSLE12:
8886 asm_fprintf (asm_out_file, ":tprel_lo12:");
8887 break;
8888
8889 case SYMBOL_TLSLE24:
8890 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8891 break;
8892
8893 case SYMBOL_TINY_GOT:
8894 asm_fprintf (asm_out_file, ":got:");
8895 break;
8896
8897 case SYMBOL_TINY_TLSIE:
8898 asm_fprintf (asm_out_file, ":gottprel:");
8899 break;
8900
8901 default:
8902 break;
8903 }
8904 output_addr_const (asm_out_file, x);
8905 break;
8906
8907 case 'G':
8908 switch (aarch64_classify_symbolic_expression (x))
8909 {
8910 case SYMBOL_TLSLE24:
8911 asm_fprintf (asm_out_file, ":tprel_hi12:");
8912 break;
8913 default:
8914 break;
8915 }
8916 output_addr_const (asm_out_file, x);
8917 break;
8918
8919 case 'k':
8920 {
8921 HOST_WIDE_INT cond_code;
8922
8923 if (!CONST_INT_P (x))
8924 {
8925 output_operand_lossage ("invalid operand for '%%%c'", code);
8926 return;
8927 }
8928
8929 cond_code = INTVAL (x);
8930 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8931 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8932 }
8933 break;
8934
8935 case 'y':
8936 case 'z':
8937 {
8938 machine_mode mode = GET_MODE (x);
8939
8940 if (GET_CODE (x) != MEM
8941 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8942 {
8943 output_operand_lossage ("invalid operand for '%%%c'", code);
8944 return;
8945 }
8946
8947 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8948 code == 'y'
8949 ? ADDR_QUERY_LDP_STP_N
8950 : ADDR_QUERY_LDP_STP))
8951 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8952 }
8953 break;
8954
8955 default:
8956 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8957 return;
8958 }
8959 }
8960
8961 /* Print address 'x' of a memory access with mode 'mode'.  'type' is the
8962 aarch64_addr_query_type context passed to aarch64_classify_address; it
8963 distinguishes normal memory accesses from LDP/STP and other special forms. */
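/* Illustrative examples of the strings produced below: a register plus
   immediate offset prints as "[x0, 16]", an SVE vector-indexed offset as
   "[x0, #2, mul vl]", a scaled register offset as "[x0, x1, lsl 3]" and a
   LO_SUM address as "[x0, #:lo12:symbol]".  */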
8964 static bool
8965 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8966 aarch64_addr_query_type type)
8967 {
8968 struct aarch64_address_info addr;
8969 unsigned int size;
8970
8971 /* Check all addresses are Pmode - including ILP32. */
8972 if (GET_MODE (x) != Pmode
8973 && (!CONST_INT_P (x)
8974 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8975 {
8976 output_operand_lossage ("invalid address mode");
8977 return false;
8978 }
8979
8980 if (aarch64_classify_address (&addr, x, mode, true, type))
8981 switch (addr.type)
8982 {
8983 case ADDRESS_REG_IMM:
8984 if (known_eq (addr.const_offset, 0))
8985 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8986 else if (aarch64_sve_data_mode_p (mode))
8987 {
8988 HOST_WIDE_INT vnum
8989 = exact_div (addr.const_offset,
8990 BYTES_PER_SVE_VECTOR).to_constant ();
8991 asm_fprintf (f, "[%s, #%wd, mul vl]",
8992 reg_names[REGNO (addr.base)], vnum);
8993 }
8994 else if (aarch64_sve_pred_mode_p (mode))
8995 {
8996 HOST_WIDE_INT vnum
8997 = exact_div (addr.const_offset,
8998 BYTES_PER_SVE_PRED).to_constant ();
8999 asm_fprintf (f, "[%s, #%wd, mul vl]",
9000 reg_names[REGNO (addr.base)], vnum);
9001 }
9002 else
9003 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
9004 INTVAL (addr.offset));
9005 return true;
9006
9007 case ADDRESS_REG_REG:
9008 if (addr.shift == 0)
9009 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9010 reg_names [REGNO (addr.offset)]);
9011 else
9012 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9013 reg_names [REGNO (addr.offset)], addr.shift);
9014 return true;
9015
9016 case ADDRESS_REG_UXTW:
9017 if (addr.shift == 0)
9018 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9019 REGNO (addr.offset) - R0_REGNUM);
9020 else
9021 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9022 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9023 return true;
9024
9025 case ADDRESS_REG_SXTW:
9026 if (addr.shift == 0)
9027 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9028 REGNO (addr.offset) - R0_REGNUM);
9029 else
9030 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9031 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9032 return true;
9033
9034 case ADDRESS_REG_WB:
9035 /* Writeback is only supported for fixed-width modes. */
9036 size = GET_MODE_SIZE (mode).to_constant ();
9037 switch (GET_CODE (x))
9038 {
9039 case PRE_INC:
9040 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9041 return true;
9042 case POST_INC:
9043 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9044 return true;
9045 case PRE_DEC:
9046 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9047 return true;
9048 case POST_DEC:
9049 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9050 return true;
9051 case PRE_MODIFY:
9052 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9053 INTVAL (addr.offset));
9054 return true;
9055 case POST_MODIFY:
9056 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9057 INTVAL (addr.offset));
9058 return true;
9059 default:
9060 break;
9061 }
9062 break;
9063
9064 case ADDRESS_LO_SUM:
9065 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9066 output_addr_const (f, addr.offset);
9067 asm_fprintf (f, "]");
9068 return true;
9069
9070 case ADDRESS_SYMBOLIC:
9071 output_addr_const (f, x);
9072 return true;
9073 }
9074
9075 return false;
9076 }
9077
9078 /* Print address 'x' of a memory access with mode 'mode'. */
9079 static void
9080 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9081 {
9082 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9083 output_addr_const (f, x);
9084 }
9085
9086 bool
9087 aarch64_label_mentioned_p (rtx x)
9088 {
9089 const char *fmt;
9090 int i;
9091
9092 if (GET_CODE (x) == LABEL_REF)
9093 return true;
9094
9095 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9096 referencing instruction, but they are constant offsets, not
9097 symbols. */
9098 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9099 return false;
9100
9101 fmt = GET_RTX_FORMAT (GET_CODE (x));
9102 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9103 {
9104 if (fmt[i] == 'E')
9105 {
9106 int j;
9107
9108 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9109 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9110 return 1;
9111 }
9112 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9113 return 1;
9114 }
9115
9116 return 0;
9117 }
9118
9119 /* Implement REGNO_REG_CLASS. */
9120
9121 enum reg_class
9122 aarch64_regno_regclass (unsigned regno)
9123 {
9124 if (GP_REGNUM_P (regno))
9125 return GENERAL_REGS;
9126
9127 if (regno == SP_REGNUM)
9128 return STACK_REG;
9129
9130 if (regno == FRAME_POINTER_REGNUM
9131 || regno == ARG_POINTER_REGNUM)
9132 return POINTER_REGS;
9133
9134 if (FP_REGNUM_P (regno))
9135 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9136 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9137
9138 if (PR_REGNUM_P (regno))
9139 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9140
9141 return NO_REGS;
9142 }
9143
9144 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9145 If OFFSET is out of range, return an offset of an anchor point
9146 that is in range. Return 0 otherwise. */
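/* Worked example (illustrative): for a 4-byte access at offset 0x12340 the
   code below returns the anchor 0x10000; the remaining offset 0x2340 is
   4-byte aligned and fits the unsigned scaled 12-bit range of a plain
   LDR/STR immediate.  */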
9147
9148 static HOST_WIDE_INT
9149 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9150 machine_mode mode)
9151 {
9152 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9153 if (size > 16)
9154 return (offset + 0x400) & ~0x7f0;
9155
9156 /* For offsets that aren't a multiple of the access size, the limit is
9157 -256...255. */
9158 if (offset & (size - 1))
9159 {
9160 /* BLKmode typically uses LDP of X-registers. */
9161 if (mode == BLKmode)
9162 return (offset + 512) & ~0x3ff;
9163 return (offset + 0x100) & ~0x1ff;
9164 }
9165
9166 /* Small negative offsets are supported. */
9167 if (IN_RANGE (offset, -256, 0))
9168 return 0;
9169
9170 if (mode == TImode || mode == TFmode)
9171 return (offset + 0x100) & ~0x1ff;
9172
9173 /* Use a 12-bit offset scaled by the access size. */
9174 return offset & (~0xfff * size);
9175 }
9176
9177 static rtx
9178 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9179 {
9180 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9181 where mask is selected by alignment and size of the offset.
9182 We try to pick as large a range for the offset as possible to
9183 maximize the chance of a CSE. However, for aligned addresses
9184 we limit the range to 4k so that structures with different sized
9185 elements are likely to use the same base. We need to be careful
9186 not to split a CONST for some forms of address expression, otherwise
9187 it will generate sub-optimal code. */
9188
9189 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9190 {
9191 rtx base = XEXP (x, 0);
9192 rtx offset_rtx = XEXP (x, 1);
9193 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9194
9195 if (GET_CODE (base) == PLUS)
9196 {
9197 rtx op0 = XEXP (base, 0);
9198 rtx op1 = XEXP (base, 1);
9199
9200 /* Force any scaling into a temp for CSE. */
9201 op0 = force_reg (Pmode, op0);
9202 op1 = force_reg (Pmode, op1);
9203
9204 /* Let the pointer register be in op0. */
9205 if (REG_POINTER (op1))
9206 std::swap (op0, op1);
9207
9208 /* If the pointer is virtual or frame related, then we know that
9209 virtual register instantiation or register elimination is going
9210 to apply a second constant. We want the two constants folded
9211 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9212 if (virt_or_elim_regno_p (REGNO (op0)))
9213 {
9214 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9215 NULL_RTX, true, OPTAB_DIRECT);
9216 return gen_rtx_PLUS (Pmode, base, op1);
9217 }
9218
9219 /* Otherwise, in order to encourage CSE (and thence loop strength
9220 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST.  */
9221 base = expand_binop (Pmode, add_optab, op0, op1,
9222 NULL_RTX, true, OPTAB_DIRECT);
9223 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9224 }
9225
9226 HOST_WIDE_INT size;
9227 if (GET_MODE_SIZE (mode).is_constant (&size))
9228 {
9229 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9230 mode);
9231 if (base_offset != 0)
9232 {
9233 base = plus_constant (Pmode, base, base_offset);
9234 base = force_operand (base, NULL_RTX);
9235 return plus_constant (Pmode, base, offset - base_offset);
9236 }
9237 }
9238 }
9239
9240 return x;
9241 }
9242
9243 static reg_class_t
9244 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9245 reg_class_t rclass,
9246 machine_mode mode,
9247 secondary_reload_info *sri)
9248 {
9249 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9250 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9251 comment at the head of aarch64-sve.md for more details about the
9252 big-endian handling. */
9253 if (BYTES_BIG_ENDIAN
9254 && reg_class_subset_p (rclass, FP_REGS)
9255 && !((REG_P (x) && HARD_REGISTER_P (x))
9256 || aarch64_simd_valid_immediate (x, NULL))
9257 && aarch64_sve_data_mode_p (mode))
9258 {
9259 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9260 return NO_REGS;
9261 }
9262
9263 /* If we have to disable direct literal pool loads and stores because the
9264 function is too big, then we need a scratch register. */
9265 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9266 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9267 || targetm.vector_mode_supported_p (GET_MODE (x)))
9268 && !aarch64_pcrelative_literal_loads)
9269 {
9270 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9271 return NO_REGS;
9272 }
9273
9274 /* Without the TARGET_SIMD instructions we cannot move a Q register
9275 to a Q register directly. We need a scratch. */
9276 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9277 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9278 && reg_class_subset_p (rclass, FP_REGS))
9279 {
9280 sri->icode = code_for_aarch64_reload_mov (mode);
9281 return NO_REGS;
9282 }
9283
9284 /* A TFmode or TImode memory access should be handled via FP_REGS,
9285 because AArch64 has richer addressing modes for LDR/STR instructions
9286 than for LDP/STP instructions. */
9287 if (TARGET_FLOAT && rclass == GENERAL_REGS
9288 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9289 return FP_REGS;
9290
9291 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9292 return GENERAL_REGS;
9293
9294 return NO_REGS;
9295 }
9296
9297 static bool
9298 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9299 {
9300 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9301
9302 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9303 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9304 if (frame_pointer_needed)
9305 return to == HARD_FRAME_POINTER_REGNUM;
9306 return true;
9307 }
9308
9309 poly_int64
9310 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9311 {
9312 if (to == HARD_FRAME_POINTER_REGNUM)
9313 {
9314 if (from == ARG_POINTER_REGNUM)
9315 return cfun->machine->frame.hard_fp_offset;
9316
9317 if (from == FRAME_POINTER_REGNUM)
9318 return cfun->machine->frame.hard_fp_offset
9319 - cfun->machine->frame.locals_offset;
9320 }
9321
9322 if (to == STACK_POINTER_REGNUM)
9323 {
9324 if (from == FRAME_POINTER_REGNUM)
9325 return cfun->machine->frame.frame_size
9326 - cfun->machine->frame.locals_offset;
9327 }
9328
9329 return cfun->machine->frame.frame_size;
9330 }
9331
9332 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9333 previous frame. */
9334
9335 rtx
9336 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9337 {
9338 if (count != 0)
9339 return const0_rtx;
9340 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9341 }
9342
9343
9344 static void
9345 aarch64_asm_trampoline_template (FILE *f)
9346 {
9347 int offset1 = 16;
9348 int offset2 = 20;
9349
9350 if (aarch64_bti_enabled ())
9351 {
9352 asm_fprintf (f, "\thint\t34 // bti c\n");
9353 offset1 -= 4;
9354 offset2 -= 4;
9355 }
9356
9357 if (TARGET_ILP32)
9358 {
9359 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9360 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9361 offset1);
9362 }
9363 else
9364 {
9365 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9366 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9367 offset2);
9368 }
9369 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9370
9371 /* The trampoline needs an extra padding instruction. If BTI is
9372 enabled, the padding instruction is replaced by the BTI instruction at
9373 the beginning. */
9374 if (!aarch64_bti_enabled ())
9375 assemble_aligned_integer (4, const0_rtx);
9376
9377 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9378 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9379 }
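/* Illustrative layout (a sketch, assuming IP1_REGNUM is x17 and
   STATIC_CHAIN_REGNUM is x18, as on standard AArch64): without BTI the LP64
   template printed above is roughly

	ldr	x17, .+16	// load the function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding
	.dword	0		// function address slot
	.dword	0		// static chain slot

   with the two pointer slots filled in by aarch64_trampoline_init below.  */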
9380
9381 static void
9382 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9383 {
9384 rtx fnaddr, mem, a_tramp;
9385 const int tramp_code_sz = 16;
9386
9387 /* We don't need to copy the trailing D-words; we fill those in below. */
9388 emit_block_move (m_tramp, assemble_trampoline_template (),
9389 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9390 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9391 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9392 if (GET_MODE (fnaddr) != ptr_mode)
9393 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9394 emit_move_insn (mem, fnaddr);
9395
9396 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9397 emit_move_insn (mem, chain_value);
9398
9399 /* XXX We should really define a "clear_cache" pattern and use
9400 gen_clear_cache(). */
9401 a_tramp = XEXP (m_tramp, 0);
9402 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9403 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9404 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9405 ptr_mode);
9406 }
9407
9408 static unsigned char
9409 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9410 {
9411 /* ??? Logically we should only need to provide a value when
9412 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9413 can hold MODE, but at the moment we need to handle all modes.
9414 Just ignore any runtime parts for registers that can't store them. */
9415 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9416 unsigned int nregs;
9417 switch (regclass)
9418 {
9419 case TAILCALL_ADDR_REGS:
9420 case POINTER_REGS:
9421 case GENERAL_REGS:
9422 case ALL_REGS:
9423 case POINTER_AND_FP_REGS:
9424 case FP_REGS:
9425 case FP_LO_REGS:
9426 case FP_LO8_REGS:
9427 if (aarch64_sve_data_mode_p (mode)
9428 && constant_multiple_p (GET_MODE_SIZE (mode),
9429 BYTES_PER_SVE_VECTOR, &nregs))
9430 return nregs;
9431 return (aarch64_vector_data_mode_p (mode)
9432 ? CEIL (lowest_size, UNITS_PER_VREG)
9433 : CEIL (lowest_size, UNITS_PER_WORD));
9434 case STACK_REG:
9435 case PR_REGS:
9436 case PR_LO_REGS:
9437 case PR_HI_REGS:
9438 return 1;
9439
9440 case NO_REGS:
9441 return 0;
9442
9443 default:
9444 break;
9445 }
9446 gcc_unreachable ();
9447 }
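/* Illustrative examples: a V4SImode value in FP_REGS takes one vector
   register (CEIL (16, UNITS_PER_VREG) == 1), whereas a TImode value in
   GENERAL_REGS takes two X-registers (CEIL (16, UNITS_PER_WORD) == 2).  */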
9448
9449 static reg_class_t
9450 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9451 {
9452 if (regclass == POINTER_REGS)
9453 return GENERAL_REGS;
9454
9455 if (regclass == STACK_REG)
9456 {
9457 if (REG_P(x)
9458 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9459 return regclass;
9460
9461 return NO_REGS;
9462 }
9463
9464 /* Register elimination can result in a request for
9465 SP+constant->FP_REGS. We cannot support such operations, which
9466 use SP as source and an FP_REG as destination, so reject them
9467 right now. */
9468 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9469 {
9470 rtx lhs = XEXP (x, 0);
9471
9472 /* Look through a possible SUBREG introduced by ILP32. */
9473 if (GET_CODE (lhs) == SUBREG)
9474 lhs = SUBREG_REG (lhs);
9475
9476 gcc_assert (REG_P (lhs));
9477 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9478 POINTER_REGS));
9479 return NO_REGS;
9480 }
9481
9482 return regclass;
9483 }
9484
9485 void
9486 aarch64_asm_output_labelref (FILE* f, const char *name)
9487 {
9488 asm_fprintf (f, "%U%s", name);
9489 }
9490
9491 static void
9492 aarch64_elf_asm_constructor (rtx symbol, int priority)
9493 {
9494 if (priority == DEFAULT_INIT_PRIORITY)
9495 default_ctor_section_asm_out_constructor (symbol, priority);
9496 else
9497 {
9498 section *s;
9499 /* Although the priority is known to be in the range [0, 65535], so 18
9500 bytes would be enough, the compiler might not know that. To avoid a
9501 -Wformat-truncation false positive, use a larger size. */
9502 char buf[23];
9503 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9504 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9505 switch_to_section (s);
9506 assemble_align (POINTER_SIZE);
9507 assemble_aligned_integer (POINTER_BYTES, symbol);
9508 }
9509 }
9510
9511 static void
9512 aarch64_elf_asm_destructor (rtx symbol, int priority)
9513 {
9514 if (priority == DEFAULT_INIT_PRIORITY)
9515 default_dtor_section_asm_out_destructor (symbol, priority);
9516 else
9517 {
9518 section *s;
9519 /* Although the priority is known to be in the range [0, 65535], so 18
9520 bytes would be enough, the compiler might not know that. To avoid a
9521 -Wformat-truncation false positive, use a larger size. */
9522 char buf[23];
9523 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9524 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9525 switch_to_section (s);
9526 assemble_align (POINTER_SIZE);
9527 assemble_aligned_integer (POINTER_BYTES, symbol);
9528 }
9529 }
9530
9531 const char*
9532 aarch64_output_casesi (rtx *operands)
9533 {
9534 char buf[100];
9535 char label[100];
9536 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9537 int index;
9538 static const char *const patterns[4][2] =
9539 {
9540 {
9541 "ldrb\t%w3, [%0,%w1,uxtw]",
9542 "add\t%3, %4, %w3, sxtb #2"
9543 },
9544 {
9545 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9546 "add\t%3, %4, %w3, sxth #2"
9547 },
9548 {
9549 "ldr\t%w3, [%0,%w1,uxtw #2]",
9550 "add\t%3, %4, %w3, sxtw #2"
9551 },
9552 /* We assume that DImode is only generated when not optimizing and
9553 that we don't really need 64-bit address offsets. That would
9554 imply an object file with 8GB of code in a single function! */
9555 {
9556 "ldr\t%w3, [%0,%w1,uxtw #2]",
9557 "add\t%3, %4, %w3, sxtw #2"
9558 }
9559 };
9560
9561 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9562
9563 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9564 index = exact_log2 (GET_MODE_SIZE (mode));
9565
9566 gcc_assert (index >= 0 && index <= 3);
9567
9568 /* Need to implement table size reduction, by changing the code below. */
9569 output_asm_insn (patterns[index][0], operands);
9570 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9571 snprintf (buf, sizeof (buf),
9572 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9573 output_asm_insn (buf, operands);
9574 output_asm_insn (patterns[index][1], operands);
9575 output_asm_insn ("br\t%3", operands);
9576 assemble_label (asm_out_file, label);
9577 return "";
9578 }
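/* Illustrative output (a sketch using the operand numbers as register
   numbers): for a 4-byte dispatch table the sequence printed above is
   roughly

	ldr	w3, [x0, w1, uxtw #2]	// load the offset for case w1
	adr	x4, .LrtxN		// anchor label emitted below
	add	x3, x4, w3, sxtw #2
	br	x3
   .LrtxN:

   where x0 is the table address and w1 the case index.  */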
9579
9580
9581 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9582 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9583 operator. */
9584
9585 int
9586 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9587 {
9588 if (shift >= 0 && shift <= 3)
9589 {
9590 int size;
9591 for (size = 8; size <= 32; size *= 2)
9592 {
9593 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9594 if (mask == bits << shift)
9595 return size;
9596 }
9597 }
9598 return 0;
9599 }
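/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, matching a UXTB-style
   operand shifted left by one, and aarch64_uxt_size (0, 0xffff) returns 16
   (UXTH).  Any other mask/shift combination yields 0.  */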
9600
9601 /* Constant pools are per-function only when PC-relative
9602 literal loads are enabled or we are using the large memory
9603 model. */
9604
9605 static inline bool
9606 aarch64_can_use_per_function_literal_pools_p (void)
9607 {
9608 return (aarch64_pcrelative_literal_loads
9609 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9610 }
9611
9612 static bool
9613 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9614 {
9615 /* We can't use blocks for constants when we're using a per-function
9616 constant pool. */
9617 return !aarch64_can_use_per_function_literal_pools_p ();
9618 }
9619
9620 /* Select appropriate section for constants depending
9621 on where we place literal pools. */
9622
9623 static section *
9624 aarch64_select_rtx_section (machine_mode mode,
9625 rtx x,
9626 unsigned HOST_WIDE_INT align)
9627 {
9628 if (aarch64_can_use_per_function_literal_pools_p ())
9629 return function_section (current_function_decl);
9630
9631 return default_elf_select_rtx_section (mode, x, align);
9632 }
9633
9634 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9635 void
9636 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9637 HOST_WIDE_INT offset)
9638 {
9639 /* When using per-function literal pools, we must ensure that any code
9640 section is aligned to the minimal instruction length, lest we get
9641 errors from the assembler re "unaligned instructions". */
9642 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9643 ASM_OUTPUT_ALIGN (f, 2);
9644 }
9645
9646 /* Costs. */
9647
9648 /* Helper function for rtx cost calculation. Strip a shift expression
9649 from X. Returns the inner operand if successful, or the original
9650 expression on failure. */
9651 static rtx
9652 aarch64_strip_shift (rtx x)
9653 {
9654 rtx op = x;
9655
9656 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9657 we can convert both to ROR during final output. */
9658 if ((GET_CODE (op) == ASHIFT
9659 || GET_CODE (op) == ASHIFTRT
9660 || GET_CODE (op) == LSHIFTRT
9661 || GET_CODE (op) == ROTATERT
9662 || GET_CODE (op) == ROTATE)
9663 && CONST_INT_P (XEXP (op, 1)))
9664 return XEXP (op, 0);
9665
9666 if (GET_CODE (op) == MULT
9667 && CONST_INT_P (XEXP (op, 1))
9668 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9669 return XEXP (op, 0);
9670
9671 return x;
9672 }
9673
9674 /* Helper function for rtx cost calculation. Strip an extend
9675 expression from X. Returns the inner operand if successful, or the
9676 original expression on failure. We deal with a number of possible
9677 canonicalization variations here. If STRIP_SHIFT is true, then
9678 we can strip off a shift also. */
9679 static rtx
9680 aarch64_strip_extend (rtx x, bool strip_shift)
9681 {
9682 scalar_int_mode mode;
9683 rtx op = x;
9684
9685 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9686 return op;
9687
9688 /* Zero and sign extraction of a widened value. */
9689 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9690 && XEXP (op, 2) == const0_rtx
9691 && GET_CODE (XEXP (op, 0)) == MULT
9692 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9693 XEXP (op, 1)))
9694 return XEXP (XEXP (op, 0), 0);
9695
9696 /* It can also be represented (for zero-extend) as an AND with an
9697 immediate. */
9698 if (GET_CODE (op) == AND
9699 && GET_CODE (XEXP (op, 0)) == MULT
9700 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9701 && CONST_INT_P (XEXP (op, 1))
9702 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9703 INTVAL (XEXP (op, 1))) != 0)
9704 return XEXP (XEXP (op, 0), 0);
9705
9706 /* Now handle extended register, as this may also have an optional
9707 left shift by 1..4. */
9708 if (strip_shift
9709 && GET_CODE (op) == ASHIFT
9710 && CONST_INT_P (XEXP (op, 1))
9711 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9712 op = XEXP (op, 0);
9713
9714 if (GET_CODE (op) == ZERO_EXTEND
9715 || GET_CODE (op) == SIGN_EXTEND)
9716 op = XEXP (op, 0);
9717
9718 if (op != x)
9719 return op;
9720
9721 return x;
9722 }
9723
9724 /* Return true iff CODE is a shift supported in combination
9725 with arithmetic instructions. */
9726
9727 static bool
9728 aarch64_shift_p (enum rtx_code code)
9729 {
9730 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9731 }
9732
9733
9734 /* Return true iff X is a cheap shift without a sign extend. */
9735
9736 static bool
9737 aarch64_cheap_mult_shift_p (rtx x)
9738 {
9739 rtx op0, op1;
9740
9741 op0 = XEXP (x, 0);
9742 op1 = XEXP (x, 1);
9743
9744 if (!(aarch64_tune_params.extra_tuning_flags
9745 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9746 return false;
9747
9748 if (GET_CODE (op0) == SIGN_EXTEND)
9749 return false;
9750
9751 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9752 && UINTVAL (op1) <= 4)
9753 return true;
9754
9755 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9756 return false;
9757
9758 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9759
9760 if (l2 > 0 && l2 <= 4)
9761 return true;
9762
9763 return false;
9764 }
9765
9766 /* Helper function for rtx cost calculation. Calculate the cost of
9767 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9768 Return the calculated cost of the expression, recursing manually into
9769 operands where needed. */
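/* For instance, when this is called for (mult x (const_int 4)) inside a
   PLUS, the multiply is treated as a shift by two and, when optimizing for
   speed, is typically costed as an ADD with shift-by-immediate
   (alu.arith_shift) plus the cost of X.  */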
9770
9771 static int
9772 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9773 {
9774 rtx op0, op1;
9775 const struct cpu_cost_table *extra_cost
9776 = aarch64_tune_params.insn_extra_cost;
9777 int cost = 0;
9778 bool compound_p = (outer == PLUS || outer == MINUS);
9779 machine_mode mode = GET_MODE (x);
9780
9781 gcc_checking_assert (code == MULT);
9782
9783 op0 = XEXP (x, 0);
9784 op1 = XEXP (x, 1);
9785
9786 if (VECTOR_MODE_P (mode))
9787 mode = GET_MODE_INNER (mode);
9788
9789 /* Integer multiply/fma. */
9790 if (GET_MODE_CLASS (mode) == MODE_INT)
9791 {
9792 /* The multiply will be canonicalized as a shift, cost it as such. */
9793 if (aarch64_shift_p (GET_CODE (x))
9794 || (CONST_INT_P (op1)
9795 && exact_log2 (INTVAL (op1)) > 0))
9796 {
9797 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9798 || GET_CODE (op0) == SIGN_EXTEND;
9799 if (speed)
9800 {
9801 if (compound_p)
9802 {
9803 /* If the shift is considered cheap,
9804 then don't add any cost. */
9805 if (aarch64_cheap_mult_shift_p (x))
9806 ;
9807 else if (REG_P (op1))
9808 /* ARITH + shift-by-register. */
9809 cost += extra_cost->alu.arith_shift_reg;
9810 else if (is_extend)
9811 /* ARITH + extended register. We don't have a cost field
9812 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9813 cost += extra_cost->alu.extend_arith;
9814 else
9815 /* ARITH + shift-by-immediate. */
9816 cost += extra_cost->alu.arith_shift;
9817 }
9818 else
9819 /* LSL (immediate). */
9820 cost += extra_cost->alu.shift;
9821
9822 }
9823 /* Strip extends as we will have costed them in the case above. */
9824 if (is_extend)
9825 op0 = aarch64_strip_extend (op0, true);
9826
9827 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9828
9829 return cost;
9830 }
9831
9832 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9833 compound and let the below cases handle it. After all, MNEG is a
9834 special-case alias of MSUB. */
9835 if (GET_CODE (op0) == NEG)
9836 {
9837 op0 = XEXP (op0, 0);
9838 compound_p = true;
9839 }
9840
9841 /* Integer multiplies or FMAs have zero/sign extending variants. */
9842 if ((GET_CODE (op0) == ZERO_EXTEND
9843 && GET_CODE (op1) == ZERO_EXTEND)
9844 || (GET_CODE (op0) == SIGN_EXTEND
9845 && GET_CODE (op1) == SIGN_EXTEND))
9846 {
9847 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9848 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9849
9850 if (speed)
9851 {
9852 if (compound_p)
9853 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9854 cost += extra_cost->mult[0].extend_add;
9855 else
9856 /* MUL/SMULL/UMULL. */
9857 cost += extra_cost->mult[0].extend;
9858 }
9859
9860 return cost;
9861 }
9862
9863 /* This is either an integer multiply or a MADD. In both cases
9864 we want to recurse and cost the operands. */
9865 cost += rtx_cost (op0, mode, MULT, 0, speed);
9866 cost += rtx_cost (op1, mode, MULT, 1, speed);
9867
9868 if (speed)
9869 {
9870 if (compound_p)
9871 /* MADD/MSUB. */
9872 cost += extra_cost->mult[mode == DImode].add;
9873 else
9874 /* MUL. */
9875 cost += extra_cost->mult[mode == DImode].simple;
9876 }
9877
9878 return cost;
9879 }
9880 else
9881 {
9882 if (speed)
9883 {
9884 /* Floating-point FMA/FMUL can also support negations of the
9885 operands, unless the rounding mode is upward or downward in
9886 which case FNMUL is different from FMUL with operand negation. */
9887 bool neg0 = GET_CODE (op0) == NEG;
9888 bool neg1 = GET_CODE (op1) == NEG;
9889 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9890 {
9891 if (neg0)
9892 op0 = XEXP (op0, 0);
9893 if (neg1)
9894 op1 = XEXP (op1, 0);
9895 }
9896
9897 if (compound_p)
9898 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9899 cost += extra_cost->fp[mode == DFmode].fma;
9900 else
9901 /* FMUL/FNMUL. */
9902 cost += extra_cost->fp[mode == DFmode].mult;
9903 }
9904
9905 cost += rtx_cost (op0, mode, MULT, 0, speed);
9906 cost += rtx_cost (op1, mode, MULT, 1, speed);
9907 return cost;
9908 }
9909 }
9910
9911 static int
9912 aarch64_address_cost (rtx x,
9913 machine_mode mode,
9914 addr_space_t as ATTRIBUTE_UNUSED,
9915 bool speed)
9916 {
9917 enum rtx_code c = GET_CODE (x);
9918 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9919 struct aarch64_address_info info;
9920 int cost = 0;
9921 info.shift = 0;
9922
9923 if (!aarch64_classify_address (&info, x, mode, false))
9924 {
9925 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9926 {
9927 /* This is a CONST or SYMBOL ref which will be split
9928 in a different way depending on the code model in use.
9929 Cost it through the generic infrastructure. */
9930 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9931 /* Divide through by the cost of one instruction to
9932 bring it to the same units as the address costs. */
9933 cost_symbol_ref /= COSTS_N_INSNS (1);
9934 /* The cost is then the cost of preparing the address,
9935 followed by an immediate (possibly 0) offset. */
9936 return cost_symbol_ref + addr_cost->imm_offset;
9937 }
9938 else
9939 {
9940 /* This is most likely a jump table from a case
9941 statement. */
9942 return addr_cost->register_offset;
9943 }
9944 }
9945
9946 switch (info.type)
9947 {
9948 case ADDRESS_LO_SUM:
9949 case ADDRESS_SYMBOLIC:
9950 case ADDRESS_REG_IMM:
9951 cost += addr_cost->imm_offset;
9952 break;
9953
9954 case ADDRESS_REG_WB:
9955 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9956 cost += addr_cost->pre_modify;
9957 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9958 cost += addr_cost->post_modify;
9959 else
9960 gcc_unreachable ();
9961
9962 break;
9963
9964 case ADDRESS_REG_REG:
9965 cost += addr_cost->register_offset;
9966 break;
9967
9968 case ADDRESS_REG_SXTW:
9969 cost += addr_cost->register_sextend;
9970 break;
9971
9972 case ADDRESS_REG_UXTW:
9973 cost += addr_cost->register_zextend;
9974 break;
9975
9976 default:
9977 gcc_unreachable ();
9978 }
9979
9980
9981 if (info.shift > 0)
9982 {
9983 /* For the sake of calculating the cost of the shifted register
9984 component, we can treat same sized modes in the same way. */
9985 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9986 cost += addr_cost->addr_scale_costs.hi;
9987 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9988 cost += addr_cost->addr_scale_costs.si;
9989 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9990 cost += addr_cost->addr_scale_costs.di;
9991 else
9992 /* We can't tell, or this is a 128-bit vector. */
9993 cost += addr_cost->addr_scale_costs.ti;
9994 }
9995
9996 return cost;
9997 }
9998
9999 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10000 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
10001 to be well predicted. */
10002
10003 int
10004 aarch64_branch_cost (bool speed_p, bool predictable_p)
10005 {
10006 /* When optimizing for speed, use the cost of unpredictable branches. */
10007 const struct cpu_branch_cost *branch_costs =
10008 aarch64_tune_params.branch_costs;
10009
10010 if (!speed_p || predictable_p)
10011 return branch_costs->predictable;
10012 else
10013 return branch_costs->unpredictable;
10014 }
10015
10016 /* Return true if the RTX X in mode MODE is a zero or sign extract
10017 usable in an ADD or SUB (extended register) instruction. */
10018 static bool
10019 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10020 {
10021 /* Catch add with a sign extract.
10022 This is add_<optab><mode>_multp2. */
10023 if (GET_CODE (x) == SIGN_EXTRACT
10024 || GET_CODE (x) == ZERO_EXTRACT)
10025 {
10026 rtx op0 = XEXP (x, 0);
10027 rtx op1 = XEXP (x, 1);
10028 rtx op2 = XEXP (x, 2);
10029
10030 if (GET_CODE (op0) == MULT
10031 && CONST_INT_P (op1)
10032 && op2 == const0_rtx
10033 && CONST_INT_P (XEXP (op0, 1))
10034 && aarch64_is_extend_from_extract (mode,
10035 XEXP (op0, 1),
10036 op1))
10037 {
10038 return true;
10039 }
10040 }
10041 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10042 No shift. */
10043 else if (GET_CODE (x) == SIGN_EXTEND
10044 || GET_CODE (x) == ZERO_EXTEND)
10045 return REG_P (XEXP (x, 0));
10046
10047 return false;
10048 }
10049
10050 static bool
10051 aarch64_frint_unspec_p (unsigned int u)
10052 {
10053 switch (u)
10054 {
10055 case UNSPEC_FRINTZ:
10056 case UNSPEC_FRINTP:
10057 case UNSPEC_FRINTM:
10058 case UNSPEC_FRINTA:
10059 case UNSPEC_FRINTN:
10060 case UNSPEC_FRINTX:
10061 case UNSPEC_FRINTI:
10062 return true;
10063
10064 default:
10065 return false;
10066 }
10067 }
10068
10069 /* Return true iff X is an rtx that will match an extr instruction
10070 i.e. as described in the *extr<mode>5_insn family of patterns.
10071 OP0 and OP1 will be set to the operands of the shifts involved
10072 on success and will be NULL_RTX otherwise. */
10073
10074 static bool
10075 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10076 {
10077 rtx op0, op1;
10078 scalar_int_mode mode;
10079 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10080 return false;
10081
10082 *res_op0 = NULL_RTX;
10083 *res_op1 = NULL_RTX;
10084
10085 if (GET_CODE (x) != IOR)
10086 return false;
10087
10088 op0 = XEXP (x, 0);
10089 op1 = XEXP (x, 1);
10090
10091 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10092 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10093 {
10094 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10095 if (GET_CODE (op1) == ASHIFT)
10096 std::swap (op0, op1);
10097
10098 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10099 return false;
10100
10101 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10102 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10103
10104 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10105 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10106 {
10107 *res_op0 = XEXP (op0, 0);
10108 *res_op1 = XEXP (op1, 0);
10109 return true;
10110 }
10111 }
10112
10113 return false;
10114 }
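/* Example (illustrative): for
   (ior:DI (ashift:DI x (const_int 16)) (lshiftrt:DI y (const_int 48)))
   the shift amounts sum to 64, so this returns true with *RES_OP0 = x and
   *RES_OP1 = y, corresponding to an EXTR with an lsb of 48.  */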
10115
10116 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10117 storing it in *COST. Result is true if the total cost of the operation
10118 has now been calculated. */
10119 static bool
10120 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10121 {
10122 rtx inner;
10123 rtx comparator;
10124 enum rtx_code cmpcode;
10125
10126 if (COMPARISON_P (op0))
10127 {
10128 inner = XEXP (op0, 0);
10129 comparator = XEXP (op0, 1);
10130 cmpcode = GET_CODE (op0);
10131 }
10132 else
10133 {
10134 inner = op0;
10135 comparator = const0_rtx;
10136 cmpcode = NE;
10137 }
10138
10139 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10140 {
10141 /* Conditional branch. */
10142 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10143 return true;
10144 else
10145 {
10146 if (cmpcode == NE || cmpcode == EQ)
10147 {
10148 if (comparator == const0_rtx)
10149 {
10150 /* TBZ/TBNZ/CBZ/CBNZ. */
10151 if (GET_CODE (inner) == ZERO_EXTRACT)
10152 /* TBZ/TBNZ. */
10153 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10154 ZERO_EXTRACT, 0, speed);
10155 else
10156 /* CBZ/CBNZ. */
10157 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10158
10159 return true;
10160 }
10161 }
10162 else if (cmpcode == LT || cmpcode == GE)
10163 {
10164 /* TBZ/TBNZ. */
10165 if (comparator == const0_rtx)
10166 return true;
10167 }
10168 }
10169 }
10170 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10171 {
10172 /* CCMP. */
10173 if (GET_CODE (op1) == COMPARE)
10174 {
10175 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10176 if (XEXP (op1, 1) == const0_rtx)
10177 *cost += 1;
10178 if (speed)
10179 {
10180 machine_mode mode = GET_MODE (XEXP (op1, 0));
10181 const struct cpu_cost_table *extra_cost
10182 = aarch64_tune_params.insn_extra_cost;
10183
10184 if (GET_MODE_CLASS (mode) == MODE_INT)
10185 *cost += extra_cost->alu.arith;
10186 else
10187 *cost += extra_cost->fp[mode == DFmode].compare;
10188 }
10189 return true;
10190 }
10191
10192 /* It's a conditional operation based on the status flags,
10193 so it must be some flavor of CSEL. */
10194
10195 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10196 if (GET_CODE (op1) == NEG
10197 || GET_CODE (op1) == NOT
10198 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10199 op1 = XEXP (op1, 0);
10200 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10201 {
10202 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10203 op1 = XEXP (op1, 0);
10204 op2 = XEXP (op2, 0);
10205 }
10206
10207 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10208 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10209 return true;
10210 }
10211
10212 /* We don't know what this is, cost all operands. */
10213 return false;
10214 }
10215
10216 /* Check whether X is a bitfield operation of the form shift + extend that
10217 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10218 operand to which the bitfield operation is applied. Otherwise return
10219 NULL_RTX. */
10220
10221 static rtx
10222 aarch64_extend_bitfield_pattern_p (rtx x)
10223 {
10224 rtx_code outer_code = GET_CODE (x);
10225 machine_mode outer_mode = GET_MODE (x);
10226
10227 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10228 && outer_mode != SImode && outer_mode != DImode)
10229 return NULL_RTX;
10230
10231 rtx inner = XEXP (x, 0);
10232 rtx_code inner_code = GET_CODE (inner);
10233 machine_mode inner_mode = GET_MODE (inner);
10234 rtx op = NULL_RTX;
10235
10236 switch (inner_code)
10237 {
10238 case ASHIFT:
10239 if (CONST_INT_P (XEXP (inner, 1))
10240 && (inner_mode == QImode || inner_mode == HImode))
10241 op = XEXP (inner, 0);
10242 break;
10243 case LSHIFTRT:
10244 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10245 && (inner_mode == QImode || inner_mode == HImode))
10246 op = XEXP (inner, 0);
10247 break;
10248 case ASHIFTRT:
10249 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10250 && (inner_mode == QImode || inner_mode == HImode))
10251 op = XEXP (inner, 0);
10252 break;
10253 default:
10254 break;
10255 }
10256
10257 return op;
10258 }
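/* For example, (zero_extend:SI (lshiftrt:HI r (const_int 3))) returns R,
   since the shift plus zero-extension maps onto a single UBFX.  */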
10259
10260 /* Return true if the mask and a shift amount from an RTX of the form
10261 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10262 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10263
10264 bool
10265 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10266 rtx shft_amnt)
10267 {
10268 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10269 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10270 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10271 && (INTVAL (mask)
10272 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10273 }
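/* Example (illustrative): in SImode a mask of 0xff0 with a shift amount of 4
   is accepted, since (0xff0 >> 4) + 1 is a power of two and no mask bits
   fall below the shift; this corresponds to a UBFIZ of an 8-bit field at
   position 4.  */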
10274
10275 /* Return true if the masks and a shift amount from an RTX of the form
10276 ((x & MASK1) | ((y << SHFT_AMNT) & MASK2)) are valid to combine into
10277 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10278
10279 bool
10280 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10281 unsigned HOST_WIDE_INT mask1,
10282 unsigned HOST_WIDE_INT shft_amnt,
10283 unsigned HOST_WIDE_INT mask2)
10284 {
10285 unsigned HOST_WIDE_INT t;
10286
10287 /* Verify that there is no overlap in what bits are set in the two masks. */
10288 if (mask1 != ~mask2)
10289 return false;
10290
10291 /* Verify that mask2 is not all zeros or ones. */
10292 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10293 return false;
10294
10295 /* The shift amount should always be less than the mode size. */
10296 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10297
10298 /* Verify that the mask being shifted is contiguous and would be in the
10299 least significant bits after shifting by shft_amnt. */
10300 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10301 return (t == (t & -t));
10302 }
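/* Example (illustrative): in SImode, SHFT_AMNT == 8, MASK2 == 0xff00 and
   MASK1 == ~0xff00 satisfy all of the checks above; the combination
   corresponds to a BFI inserting an 8-bit field at bit position 8.  */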
10303
10304 /* Calculate the cost of calculating X, storing it in *COST. Result
10305 is true if the total cost of the operation has now been calculated. */
10306 static bool
10307 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10308 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10309 {
10310 rtx op0, op1, op2;
10311 const struct cpu_cost_table *extra_cost
10312 = aarch64_tune_params.insn_extra_cost;
10313 int code = GET_CODE (x);
10314 scalar_int_mode int_mode;
10315
10316 /* By default, assume that everything has equivalent cost to the
10317 cheapest instruction. Any additional costs are applied as a delta
10318 above this default. */
10319 *cost = COSTS_N_INSNS (1);
10320
10321 switch (code)
10322 {
10323 case SET:
10324 /* The cost depends entirely on the operands to SET. */
10325 *cost = 0;
10326 op0 = SET_DEST (x);
10327 op1 = SET_SRC (x);
10328
10329 switch (GET_CODE (op0))
10330 {
10331 case MEM:
10332 if (speed)
10333 {
10334 rtx address = XEXP (op0, 0);
10335 if (VECTOR_MODE_P (mode))
10336 *cost += extra_cost->ldst.storev;
10337 else if (GET_MODE_CLASS (mode) == MODE_INT)
10338 *cost += extra_cost->ldst.store;
10339 else if (mode == SFmode)
10340 *cost += extra_cost->ldst.storef;
10341 else if (mode == DFmode)
10342 *cost += extra_cost->ldst.stored;
10343
10344 *cost +=
10345 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10346 0, speed));
10347 }
10348
10349 *cost += rtx_cost (op1, mode, SET, 1, speed);
10350 return true;
10351
10352 case SUBREG:
10353 if (! REG_P (SUBREG_REG (op0)))
10354 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10355
10356 /* Fall through. */
10357 case REG:
10358 /* The cost is one per vector-register copied. */
10359 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10360 {
10361 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10362 *cost = COSTS_N_INSNS (nregs);
10363 }
10364 /* const0_rtx is in general free, but we will use an
10365 instruction to set a register to 0. */
10366 else if (REG_P (op1) || op1 == const0_rtx)
10367 {
10368 /* The cost is 1 per register copied. */
10369 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10370 *cost = COSTS_N_INSNS (nregs);
10371 }
10372 else
10373 /* Cost is just the cost of the RHS of the set. */
10374 *cost += rtx_cost (op1, mode, SET, 1, speed);
10375 return true;
10376
10377 case ZERO_EXTRACT:
10378 case SIGN_EXTRACT:
10379 /* Bit-field insertion. Strip any redundant widening of
10380 the RHS to meet the width of the target. */
10381 if (GET_CODE (op1) == SUBREG)
10382 op1 = SUBREG_REG (op1);
10383 if ((GET_CODE (op1) == ZERO_EXTEND
10384 || GET_CODE (op1) == SIGN_EXTEND)
10385 && CONST_INT_P (XEXP (op0, 1))
10386 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10387 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10388 op1 = XEXP (op1, 0);
10389
10390 if (CONST_INT_P (op1))
10391 {
10392 /* MOV immediate is assumed to always be cheap. */
10393 *cost = COSTS_N_INSNS (1);
10394 }
10395 else
10396 {
10397 /* BFM. */
10398 if (speed)
10399 *cost += extra_cost->alu.bfi;
10400 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10401 }
10402
10403 return true;
10404
10405 default:
10406 /* We can't make sense of this, assume default cost. */
10407 *cost = COSTS_N_INSNS (1);
10408 return false;
10409 }
10410 return false;
10411
10412 case CONST_INT:
10413 /* If an instruction can incorporate a constant within the
10414 instruction, the instruction's expression avoids calling
10415 rtx_cost() on the constant. If rtx_cost() is called on a
10416 constant, then it is usually because the constant must be
10417 moved into a register by one or more instructions.
10418
10419 The exception is constant 0, which can be expressed
10420 as XZR/WZR and is therefore free. The exception to this is
10421 if we have (set (reg) (const0_rtx)) in which case we must cost
10422 the move. However, we can catch that when we cost the SET, so
10423 we don't need to consider that here. */
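/* Illustrative example: a constant such as 0x12345678 typically needs a MOV
   plus one MOVK, so aarch64_internal_mov_immediate returns 2 and the
   constant is costed as COSTS_N_INSNS (2).  */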
10424 if (x == const0_rtx)
10425 *cost = 0;
10426 else
10427 {
10428 /* To an approximation, the cost of building any other constant is
10429 proportional to the number of instructions required to build that
10430 constant. This is true whether we are compiling for SPEED or
10431 otherwise. */
10432 if (!is_a <scalar_int_mode> (mode, &int_mode))
10433 int_mode = word_mode;
10434 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10435 (NULL_RTX, x, false, int_mode));
10436 }
10437 return true;
10438
10439 case CONST_DOUBLE:
10440
10441 /* First determine number of instructions to do the move
10442 as an integer constant. */
10443 if (!aarch64_float_const_representable_p (x)
10444 && !aarch64_can_const_movi_rtx_p (x, mode)
10445 && aarch64_float_const_rtx_p (x))
10446 {
10447 unsigned HOST_WIDE_INT ival;
10448 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10449 gcc_assert (succeed);
10450
10451 scalar_int_mode imode = (mode == HFmode
10452 ? SImode
10453 : int_mode_for_mode (mode).require ());
10454 int ncost = aarch64_internal_mov_immediate
10455 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10456 *cost += COSTS_N_INSNS (ncost);
10457 return true;
10458 }
10459
10460 if (speed)
10461 {
10462 /* mov[df,sf]_aarch64. */
10463 if (aarch64_float_const_representable_p (x))
10464 /* FMOV (scalar immediate). */
10465 *cost += extra_cost->fp[mode == DFmode].fpconst;
10466 else if (!aarch64_float_const_zero_rtx_p (x))
10467 {
10468 /* This will be a load from memory. */
10469 if (mode == DFmode)
10470 *cost += extra_cost->ldst.loadd;
10471 else
10472 *cost += extra_cost->ldst.loadf;
10473 }
10474 else
10475 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10476 or MOV v0.s[0], wzr - neither of which is modeled by the
10477 cost tables. Just use the default cost. */
10478 {
10479 }
10480 }
10481
10482 return true;
10483
10484 case MEM:
10485 if (speed)
10486 {
10487 /* For loads we want the base cost of a load, plus an
10488 approximation for the additional cost of the addressing
10489 mode. */
10490 rtx address = XEXP (x, 0);
10491 if (VECTOR_MODE_P (mode))
10492 *cost += extra_cost->ldst.loadv;
10493 else if (GET_MODE_CLASS (mode) == MODE_INT)
10494 *cost += extra_cost->ldst.load;
10495 else if (mode == SFmode)
10496 *cost += extra_cost->ldst.loadf;
10497 else if (mode == DFmode)
10498 *cost += extra_cost->ldst.loadd;
10499
10500 *cost +=
10501 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10502 0, speed));
10503 }
10504
10505 return true;
10506
10507 case NEG:
10508 op0 = XEXP (x, 0);
10509
10510 if (VECTOR_MODE_P (mode))
10511 {
10512 if (speed)
10513 {
10514 /* FNEG. */
10515 *cost += extra_cost->vect.alu;
10516 }
10517 return false;
10518 }
10519
10520 if (GET_MODE_CLASS (mode) == MODE_INT)
10521 {
10522 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10523 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10524 {
10525 /* CSETM. */
10526 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10527 return true;
10528 }
10529
10530 /* Cost this as SUB wzr, X. */
10531 op0 = CONST0_RTX (mode);
10532 op1 = XEXP (x, 0);
10533 goto cost_minus;
10534 }
10535
10536 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10537 {
10538 /* Support (neg(fma...)) as a single instruction only if
10539 the sign of zeros is unimportant. This matches the decision
10540 made in aarch64.md. */
10541 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10542 {
10543 /* FNMADD. */
10544 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10545 return true;
10546 }
10547 if (GET_CODE (op0) == MULT)
10548 {
10549 /* FNMUL. */
10550 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10551 return true;
10552 }
10553 if (speed)
10554 /* FNEG. */
10555 *cost += extra_cost->fp[mode == DFmode].neg;
10556 return false;
10557 }
10558
10559 return false;
10560
10561 case CLRSB:
10562 case CLZ:
10563 if (speed)
10564 {
10565 if (VECTOR_MODE_P (mode))
10566 *cost += extra_cost->vect.alu;
10567 else
10568 *cost += extra_cost->alu.clz;
10569 }
10570
10571 return false;
10572
10573 case COMPARE:
10574 op0 = XEXP (x, 0);
10575 op1 = XEXP (x, 1);
10576
10577 if (op1 == const0_rtx
10578 && GET_CODE (op0) == AND)
10579 {
10580 x = op0;
10581 mode = GET_MODE (op0);
10582 goto cost_logic;
10583 }
10584
10585 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10586 {
10587 /* TODO: A write to the CC flags possibly costs extra; this
10588 needs encoding in the cost tables. */
10589
10590 mode = GET_MODE (op0);
10591 /* ANDS. */
10592 if (GET_CODE (op0) == AND)
10593 {
10594 x = op0;
10595 goto cost_logic;
10596 }
10597
10598 if (GET_CODE (op0) == PLUS)
10599 {
10600 /* ADDS (and CMN alias). */
10601 x = op0;
10602 goto cost_plus;
10603 }
10604
10605 if (GET_CODE (op0) == MINUS)
10606 {
10607 /* SUBS. */
10608 x = op0;
10609 goto cost_minus;
10610 }
10611
10612 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10613 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10614 && CONST_INT_P (XEXP (op0, 2)))
10615 {
10616 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10617 Handle it here directly rather than going to cost_logic
10618 since we know the immediate generated for the TST is valid
10619 so we can avoid creating an intermediate rtx for it only
10620 for costing purposes. */
10621 if (speed)
10622 *cost += extra_cost->alu.logical;
10623
10624 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10625 ZERO_EXTRACT, 0, speed);
10626 return true;
10627 }
10628
10629 if (GET_CODE (op1) == NEG)
10630 {
10631 /* CMN. */
10632 if (speed)
10633 *cost += extra_cost->alu.arith;
10634
10635 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10636 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10637 return true;
10638 }
10639
10640 /* CMP.
10641
10642 Compare can freely swap the order of operands, and
10643 canonicalization puts the more complex operation first.
10644 But the integer MINUS logic expects the shift/extend
10645 operation in op1. */
10646 if (! (REG_P (op0)
10647 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10648 {
10649 op0 = XEXP (x, 1);
10650 op1 = XEXP (x, 0);
10651 }
10652 goto cost_minus;
10653 }
10654
10655 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10656 {
10657 /* FCMP. */
10658 if (speed)
10659 *cost += extra_cost->fp[mode == DFmode].compare;
10660
10661 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10662 {
10663 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10664 /* FCMP supports constant 0.0 for no extra cost. */
10665 return true;
10666 }
10667 return false;
10668 }
10669
10670 if (VECTOR_MODE_P (mode))
10671 {
10672 /* Vector compare. */
10673 if (speed)
10674 *cost += extra_cost->vect.alu;
10675
10676 if (aarch64_float_const_zero_rtx_p (op1))
10677 {
10678 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10679 cost. */
10680 return true;
10681 }
10682 return false;
10683 }
10684 return false;
10685
10686 case MINUS:
10687 {
10688 op0 = XEXP (x, 0);
10689 op1 = XEXP (x, 1);
10690
10691 cost_minus:
10692 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10693
10694 /* Detect valid immediates. */
10695 if ((GET_MODE_CLASS (mode) == MODE_INT
10696 || (GET_MODE_CLASS (mode) == MODE_CC
10697 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10698 && CONST_INT_P (op1)
10699 && aarch64_uimm12_shift (INTVAL (op1)))
10700 {
10701 if (speed)
10702 /* SUB(S) (immediate). */
10703 *cost += extra_cost->alu.arith;
10704 return true;
10705 }
10706
10707 /* Look for SUB (extended register). */
10708 if (is_a <scalar_int_mode> (mode, &int_mode)
10709 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10710 {
10711 if (speed)
10712 *cost += extra_cost->alu.extend_arith;
10713
10714 op1 = aarch64_strip_extend (op1, true);
10715 *cost += rtx_cost (op1, VOIDmode,
10716 (enum rtx_code) GET_CODE (op1), 0, speed);
10717 return true;
10718 }
10719
10720 rtx new_op1 = aarch64_strip_extend (op1, false);
10721
10722 /* Cost this as an FMA-alike operation. */
10723 if ((GET_CODE (new_op1) == MULT
10724 || aarch64_shift_p (GET_CODE (new_op1)))
10725 && code != COMPARE)
10726 {
10727 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10728 (enum rtx_code) code,
10729 speed);
10730 return true;
10731 }
10732
10733 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10734
10735 if (speed)
10736 {
10737 if (VECTOR_MODE_P (mode))
10738 {
10739 /* Vector SUB. */
10740 *cost += extra_cost->vect.alu;
10741 }
10742 else if (GET_MODE_CLASS (mode) == MODE_INT)
10743 {
10744 /* SUB(S). */
10745 *cost += extra_cost->alu.arith;
10746 }
10747 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10748 {
10749 /* FSUB. */
10750 *cost += extra_cost->fp[mode == DFmode].addsub;
10751 }
10752 }
10753 return true;
10754 }
10755
10756 case PLUS:
10757 {
10758 rtx new_op0;
10759
10760 op0 = XEXP (x, 0);
10761 op1 = XEXP (x, 1);
10762
10763 cost_plus:
10764 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10765 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10766 {
10767 /* CSINC. */
10768 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10769 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10770 return true;
10771 }
10772
10773 if (GET_MODE_CLASS (mode) == MODE_INT
10774 && (aarch64_plus_immediate (op1, mode)
10775 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10776 {
10777 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10778
10779 if (speed)
10780 /* ADD (immediate). */
10781 *cost += extra_cost->alu.arith;
10782 return true;
10783 }
10784
10785 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10786
10787 /* Look for ADD (extended register). */
10788 if (is_a <scalar_int_mode> (mode, &int_mode)
10789 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10790 {
10791 if (speed)
10792 *cost += extra_cost->alu.extend_arith;
10793
10794 op0 = aarch64_strip_extend (op0, true);
10795 *cost += rtx_cost (op0, VOIDmode,
10796 (enum rtx_code) GET_CODE (op0), 0, speed);
10797 return true;
10798 }
10799
10800 /* Strip any extend, leave shifts behind as we will
10801 cost them through mult_cost. */
10802 new_op0 = aarch64_strip_extend (op0, false);
10803
10804 if (GET_CODE (new_op0) == MULT
10805 || aarch64_shift_p (GET_CODE (new_op0)))
10806 {
10807 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10808 speed);
10809 return true;
10810 }
10811
10812 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10813
10814 if (speed)
10815 {
10816 if (VECTOR_MODE_P (mode))
10817 {
10818 /* Vector ADD. */
10819 *cost += extra_cost->vect.alu;
10820 }
10821 else if (GET_MODE_CLASS (mode) == MODE_INT)
10822 {
10823 /* ADD. */
10824 *cost += extra_cost->alu.arith;
10825 }
10826 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10827 {
10828 /* FADD. */
10829 *cost += extra_cost->fp[mode == DFmode].addsub;
10830 }
10831 }
10832 return true;
10833 }
10834
10835 case BSWAP:
10836 *cost = COSTS_N_INSNS (1);
10837
10838 if (speed)
10839 {
10840 if (VECTOR_MODE_P (mode))
10841 *cost += extra_cost->vect.alu;
10842 else
10843 *cost += extra_cost->alu.rev;
10844 }
10845 return false;
10846
10847 case IOR:
10848 if (aarch_rev16_p (x))
10849 {
10850 *cost = COSTS_N_INSNS (1);
10851
10852 if (speed)
10853 {
10854 if (VECTOR_MODE_P (mode))
10855 *cost += extra_cost->vect.alu;
10856 else
10857 *cost += extra_cost->alu.rev;
10858 }
10859 return true;
10860 }
10861
10862 if (aarch64_extr_rtx_p (x, &op0, &op1))
10863 {
10864 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10865 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10866 if (speed)
10867 *cost += extra_cost->alu.shift;
10868
10869 return true;
10870 }
10871 /* Fall through. */
10872 case XOR:
10873 case AND:
10874 cost_logic:
10875 op0 = XEXP (x, 0);
10876 op1 = XEXP (x, 1);
10877
10878 if (VECTOR_MODE_P (mode))
10879 {
10880 if (speed)
10881 *cost += extra_cost->vect.alu;
10882 return true;
10883 }
10884
10885 if (code == AND
10886 && GET_CODE (op0) == MULT
10887 && CONST_INT_P (XEXP (op0, 1))
10888 && CONST_INT_P (op1)
10889 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10890 INTVAL (op1)) != 0)
10891 {
10892 /* This is a UBFM/SBFM. */
10893 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10894 if (speed)
10895 *cost += extra_cost->alu.bfx;
10896 return true;
10897 }
10898
10899 if (is_int_mode (mode, &int_mode))
10900 {
10901 if (CONST_INT_P (op1))
10902 {
10903 /* We have a mask + shift version of a UBFIZ
10904 i.e. the *andim_ashift<mode>_bfiz pattern. */
10905 if (GET_CODE (op0) == ASHIFT
10906 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10907 XEXP (op0, 1)))
10908 {
10909 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10910 (enum rtx_code) code, 0, speed);
10911 if (speed)
10912 *cost += extra_cost->alu.bfx;
10913
10914 return true;
10915 }
10916 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10917 {
10918 /* We possibly get the immediate for free; this is not
10919 modelled. */
10920 *cost += rtx_cost (op0, int_mode,
10921 (enum rtx_code) code, 0, speed);
10922 if (speed)
10923 *cost += extra_cost->alu.logical;
10924
10925 return true;
10926 }
10927 }
10928 else
10929 {
10930 rtx new_op0 = op0;
10931
10932 /* Handle ORN, EON, or BIC. */
10933 if (GET_CODE (op0) == NOT)
10934 op0 = XEXP (op0, 0);
10935
10936 new_op0 = aarch64_strip_shift (op0);
10937
10938 /* If we had a shift on op0 then this is a logical-shift-
10939 by-register/immediate operation. Otherwise, this is just
10940 a logical operation. */
10941 if (speed)
10942 {
10943 if (new_op0 != op0)
10944 {
10945 /* Shift by immediate. */
10946 if (CONST_INT_P (XEXP (op0, 1)))
10947 *cost += extra_cost->alu.log_shift;
10948 else
10949 *cost += extra_cost->alu.log_shift_reg;
10950 }
10951 else
10952 *cost += extra_cost->alu.logical;
10953 }
10954
10955 /* In both cases we want to cost both operands. */
10956 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10957 0, speed);
10958 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10959 1, speed);
10960
10961 return true;
10962 }
10963 }
10964 return false;
10965
10966 case NOT:
10967 x = XEXP (x, 0);
10968 op0 = aarch64_strip_shift (x);
10969
10970 if (VECTOR_MODE_P (mode))
10971 {
10972 /* Vector NOT. */
10973 *cost += extra_cost->vect.alu;
10974 return false;
10975 }
10976
10977 /* MVN-shifted-reg. */
10978 if (op0 != x)
10979 {
10980 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10981
10982 if (speed)
10983 *cost += extra_cost->alu.log_shift;
10984
10985 return true;
10986 }
10987 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10988 Handle the second form here taking care that 'a' in the above can
10989 be a shift. */
10990 else if (GET_CODE (op0) == XOR)
10991 {
10992 rtx newop0 = XEXP (op0, 0);
10993 rtx newop1 = XEXP (op0, 1);
10994 rtx op0_stripped = aarch64_strip_shift (newop0);
10995
10996 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10997 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10998
10999 if (speed)
11000 {
11001 if (op0_stripped != newop0)
11002 *cost += extra_cost->alu.log_shift;
11003 else
11004 *cost += extra_cost->alu.logical;
11005 }
11006
11007 return true;
11008 }
11009 /* MVN. */
11010 if (speed)
11011 *cost += extra_cost->alu.logical;
11012
11013 return false;
11014
11015 case ZERO_EXTEND:
11016
11017 op0 = XEXP (x, 0);
11018 /* If a value is written in SI mode, then zero extended to DI
11019 mode, the operation will in general be free as a write to
11020 a 'w' register implicitly zeroes the upper bits of an 'x'
11021 register. However, if this is
11022
11023 (set (reg) (zero_extend (reg)))
11024
11025 we must cost the explicit register move. */
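/* Illustrative note (not part of the original source): for instance,
"add w0, w1, w2" already zeroes bits 63:32 of x0, so a pattern such as
(zero_extend:DI (plus:SI ...)) costs no more than the inner PLUS,
whereas a bare (set (reg:DI) (zero_extend:DI (reg:SI))) must still pay
for the explicit move (typically a MOV to the 'w' register).  */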
11026 if (mode == DImode
11027 && GET_MODE (op0) == SImode
11028 && outer == SET)
11029 {
11030 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11031
11032 /* If OP_COST is non-zero, then the cost of the zero extend
11033 is effectively the cost of the inner operation. Otherwise
11034 we have a MOV instruction and we take the cost from the MOV
11035 itself. This is true independently of whether we are
11036 optimizing for space or time. */
11037 if (op_cost)
11038 *cost = op_cost;
11039
11040 return true;
11041 }
11042 else if (MEM_P (op0))
11043 {
11044 /* All loads can zero extend to any size for free. */
11045 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11046 return true;
11047 }
11048
11049 op0 = aarch64_extend_bitfield_pattern_p (x);
11050 if (op0)
11051 {
11052 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11053 if (speed)
11054 *cost += extra_cost->alu.bfx;
11055 return true;
11056 }
11057
11058 if (speed)
11059 {
11060 if (VECTOR_MODE_P (mode))
11061 {
11062 /* UMOV. */
11063 *cost += extra_cost->vect.alu;
11064 }
11065 else
11066 {
11067 /* We generate an AND instead of UXTB/UXTH. */
11068 *cost += extra_cost->alu.logical;
11069 }
11070 }
11071 return false;
11072
11073 case SIGN_EXTEND:
11074 if (MEM_P (XEXP (x, 0)))
11075 {
11076 /* LDRSH. */
11077 if (speed)
11078 {
11079 rtx address = XEXP (XEXP (x, 0), 0);
11080 *cost += extra_cost->ldst.load_sign_extend;
11081
11082 *cost +=
11083 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11084 0, speed));
11085 }
11086 return true;
11087 }
11088
11089 op0 = aarch64_extend_bitfield_pattern_p (x);
11090 if (op0)
11091 {
11092 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11093 if (speed)
11094 *cost += extra_cost->alu.bfx;
11095 return true;
11096 }
11097
11098 if (speed)
11099 {
11100 if (VECTOR_MODE_P (mode))
11101 *cost += extra_cost->vect.alu;
11102 else
11103 *cost += extra_cost->alu.extend;
11104 }
11105 return false;
11106
11107 case ASHIFT:
11108 op0 = XEXP (x, 0);
11109 op1 = XEXP (x, 1);
11110
11111 if (CONST_INT_P (op1))
11112 {
11113 if (speed)
11114 {
11115 if (VECTOR_MODE_P (mode))
11116 {
11117 /* Vector shift (immediate). */
11118 *cost += extra_cost->vect.alu;
11119 }
11120 else
11121 {
11122 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11123 aliases. */
11124 *cost += extra_cost->alu.shift;
11125 }
11126 }
11127
11128 /* We can incorporate zero/sign extend for free. */
11129 if (GET_CODE (op0) == ZERO_EXTEND
11130 || GET_CODE (op0) == SIGN_EXTEND)
11131 op0 = XEXP (op0, 0);
11132
11133 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11134 return true;
11135 }
11136 else
11137 {
11138 if (VECTOR_MODE_P (mode))
11139 {
11140 if (speed)
11141 /* Vector shift (register). */
11142 *cost += extra_cost->vect.alu;
11143 }
11144 else
11145 {
11146 if (speed)
11147 /* LSLV. */
11148 *cost += extra_cost->alu.shift_reg;
11149
11150 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11151 && CONST_INT_P (XEXP (op1, 1))
11152 && known_eq (INTVAL (XEXP (op1, 1)),
11153 GET_MODE_BITSIZE (mode) - 1))
11154 {
11155 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11156 /* We already demanded XEXP (op1, 0) to be REG_P, so
11157 don't recurse into it. */
11158 return true;
11159 }
11160 }
11161 return false; /* All arguments need to be in registers. */
11162 }
11163
11164 case ROTATE:
11165 case ROTATERT:
11166 case LSHIFTRT:
11167 case ASHIFTRT:
11168 op0 = XEXP (x, 0);
11169 op1 = XEXP (x, 1);
11170
11171 if (CONST_INT_P (op1))
11172 {
11173 /* ASR (immediate) and friends. */
11174 if (speed)
11175 {
11176 if (VECTOR_MODE_P (mode))
11177 *cost += extra_cost->vect.alu;
11178 else
11179 *cost += extra_cost->alu.shift;
11180 }
11181
11182 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11183 return true;
11184 }
11185 else
11186 {
11187 if (VECTOR_MODE_P (mode))
11188 {
11189 if (speed)
11190 /* Vector shift (register). */
11191 *cost += extra_cost->vect.alu;
11192 }
11193 else
11194 {
11195 if (speed)
11196 /* ASR (register) and friends. */
11197 *cost += extra_cost->alu.shift_reg;
11198
11199 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11200 && CONST_INT_P (XEXP (op1, 1))
11201 && known_eq (INTVAL (XEXP (op1, 1)),
11202 GET_MODE_BITSIZE (mode) - 1))
11203 {
11204 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11205 /* We already demanded XEXP (op1, 0) to be REG_P, so
11206 don't recurse into it. */
11207 return true;
11208 }
11209 }
11210 return false; /* All arguments need to be in registers. */
11211 }
11212
11213 case SYMBOL_REF:
11214
11215 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11216 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11217 {
11218 /* LDR. */
11219 if (speed)
11220 *cost += extra_cost->ldst.load;
11221 }
11222 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11223 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11224 {
11225 /* ADRP, followed by ADD. */
11226 *cost += COSTS_N_INSNS (1);
11227 if (speed)
11228 *cost += 2 * extra_cost->alu.arith;
11229 }
11230 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11231 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11232 {
11233 /* ADR. */
11234 if (speed)
11235 *cost += extra_cost->alu.arith;
11236 }
11237
11238 if (flag_pic)
11239 {
11240 /* One extra load instruction, after accessing the GOT. */
11241 *cost += COSTS_N_INSNS (1);
11242 if (speed)
11243 *cost += extra_cost->ldst.load;
11244 }
11245 return true;
11246
11247 case HIGH:
11248 case LO_SUM:
11249 /* ADRP/ADD (immediate). */
11250 if (speed)
11251 *cost += extra_cost->alu.arith;
11252 return true;
11253
11254 case ZERO_EXTRACT:
11255 case SIGN_EXTRACT:
11256 /* UBFX/SBFX. */
11257 if (speed)
11258 {
11259 if (VECTOR_MODE_P (mode))
11260 *cost += extra_cost->vect.alu;
11261 else
11262 *cost += extra_cost->alu.bfx;
11263 }
11264
11265 /* We can trust that the immediates used will be correct (there
11266 are no by-register forms), so we need only cost op0. */
11267 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11268 return true;
11269
11270 case MULT:
11271 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11272 /* aarch64_rtx_mult_cost always handles recursion to its
11273 operands. */
11274 return true;
11275
11276 case MOD:
11277 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11278 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
11279 an unconditional negate. This case should only ever be reached through
11280 the set_smod_pow2_cheap check in expmed.c. */
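/* Rough sketch of the expansion being costed (not from the original
source); for "w0 % 4" the generated sequence is approximately:

     negs  w1, w0
     and   w0, w0, #3
     and   w1, w1, #3
     csneg w0, w0, w1, mi

i.e. four instructions, matching the COSTS_N_INSNS (4) below.  */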
11281 if (CONST_INT_P (XEXP (x, 1))
11282 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11283 && (mode == SImode || mode == DImode))
11284 {
11285 /* We expand to 4 instructions. Reset the baseline. */
11286 *cost = COSTS_N_INSNS (4);
11287
11288 if (speed)
11289 *cost += 2 * extra_cost->alu.logical
11290 + 2 * extra_cost->alu.arith;
11291
11292 return true;
11293 }
11294
11295 /* Fall-through. */
11296 case UMOD:
11297 if (speed)
11298 {
11299 /* Slightly prefer UMOD over SMOD. */
11300 if (VECTOR_MODE_P (mode))
11301 *cost += extra_cost->vect.alu;
11302 else if (GET_MODE_CLASS (mode) == MODE_INT)
11303 *cost += (extra_cost->mult[mode == DImode].add
11304 + extra_cost->mult[mode == DImode].idiv
11305 + (code == MOD ? 1 : 0));
11306 }
11307 return false; /* All arguments need to be in registers. */
11308
11309 case DIV:
11310 case UDIV:
11311 case SQRT:
11312 if (speed)
11313 {
11314 if (VECTOR_MODE_P (mode))
11315 *cost += extra_cost->vect.alu;
11316 else if (GET_MODE_CLASS (mode) == MODE_INT)
11317 /* There is no integer SQRT, so only DIV and UDIV can get
11318 here. */
11319 *cost += (extra_cost->mult[mode == DImode].idiv
11320 /* Slightly prefer UDIV over SDIV. */
11321 + (code == DIV ? 1 : 0));
11322 else
11323 *cost += extra_cost->fp[mode == DFmode].div;
11324 }
11325 return false; /* All arguments need to be in registers. */
11326
11327 case IF_THEN_ELSE:
11328 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11329 XEXP (x, 2), cost, speed);
11330
11331 case EQ:
11332 case NE:
11333 case GT:
11334 case GTU:
11335 case LT:
11336 case LTU:
11337 case GE:
11338 case GEU:
11339 case LE:
11340 case LEU:
11341
11342 return false; /* All arguments must be in registers. */
11343
11344 case FMA:
11345 op0 = XEXP (x, 0);
11346 op1 = XEXP (x, 1);
11347 op2 = XEXP (x, 2);
11348
11349 if (speed)
11350 {
11351 if (VECTOR_MODE_P (mode))
11352 *cost += extra_cost->vect.alu;
11353 else
11354 *cost += extra_cost->fp[mode == DFmode].fma;
11355 }
11356
11357 /* FMSUB, FNMADD, and FNMSUB are free. */
11358 if (GET_CODE (op0) == NEG)
11359 op0 = XEXP (op0, 0);
11360
11361 if (GET_CODE (op2) == NEG)
11362 op2 = XEXP (op2, 0);
11363
11364 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11365 and the by-element operand as operand 0. */
11366 if (GET_CODE (op1) == NEG)
11367 op1 = XEXP (op1, 0);
11368
11369 /* Catch vector-by-element operations. The by-element operand can
11370 either be (vec_duplicate (vec_select (x))) or just
11371 (vec_select (x)), depending on whether we are multiplying by
11372 a vector or a scalar.
11373
11374 Canonicalization is not very good in these cases, FMA4 will put the
11375 by-element operand as operand 0, FNMA4 will have it as operand 1. */
11376 if (GET_CODE (op0) == VEC_DUPLICATE)
11377 op0 = XEXP (op0, 0);
11378 else if (GET_CODE (op1) == VEC_DUPLICATE)
11379 op1 = XEXP (op1, 0);
11380
11381 if (GET_CODE (op0) == VEC_SELECT)
11382 op0 = XEXP (op0, 0);
11383 else if (GET_CODE (op1) == VEC_SELECT)
11384 op1 = XEXP (op1, 0);
11385
11386 /* If the remaining parameters are not registers,
11387 get the cost to put them into registers. */
11388 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11389 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11390 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11391 return true;
11392
11393 case FLOAT:
11394 case UNSIGNED_FLOAT:
11395 if (speed)
11396 *cost += extra_cost->fp[mode == DFmode].fromint;
11397 return false;
11398
11399 case FLOAT_EXTEND:
11400 if (speed)
11401 {
11402 if (VECTOR_MODE_P (mode))
11403 {
11404 /* Vector widen. */
11405 *cost += extra_cost->vect.alu;
11406 }
11407 else
11408 *cost += extra_cost->fp[mode == DFmode].widen;
11409 }
11410 return false;
11411
11412 case FLOAT_TRUNCATE:
11413 if (speed)
11414 {
11415 if (VECTOR_MODE_P (mode))
11416 {
11417 /* Vector conversion. */
11418 *cost += extra_cost->vect.alu;
11419 }
11420 else
11421 *cost += extra_cost->fp[mode == DFmode].narrow;
11422 }
11423 return false;
11424
11425 case FIX:
11426 case UNSIGNED_FIX:
11427 x = XEXP (x, 0);
11428 /* Strip the rounding part. They will all be implemented
11429 by the fcvt* family of instructions anyway. */
11430 if (GET_CODE (x) == UNSPEC)
11431 {
11432 unsigned int uns_code = XINT (x, 1);
11433
11434 if (uns_code == UNSPEC_FRINTA
11435 || uns_code == UNSPEC_FRINTM
11436 || uns_code == UNSPEC_FRINTN
11437 || uns_code == UNSPEC_FRINTP
11438 || uns_code == UNSPEC_FRINTZ)
11439 x = XVECEXP (x, 0, 0);
11440 }
11441
11442 if (speed)
11443 {
11444 if (VECTOR_MODE_P (mode))
11445 *cost += extra_cost->vect.alu;
11446 else
11447 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11448 }
11449
11450 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11451 fixed-point fcvt. */
11452 if (GET_CODE (x) == MULT
11453 && ((VECTOR_MODE_P (mode)
11454 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11455 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11456 {
11457 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11458 0, speed);
11459 return true;
11460 }
11461
11462 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11463 return true;
11464
11465 case ABS:
11466 if (VECTOR_MODE_P (mode))
11467 {
11468 /* ABS (vector). */
11469 if (speed)
11470 *cost += extra_cost->vect.alu;
11471 }
11472 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11473 {
11474 op0 = XEXP (x, 0);
11475
11476 /* FABD, which is analogous to FADD. */
11477 if (GET_CODE (op0) == MINUS)
11478 {
11479 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11480 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11481 if (speed)
11482 *cost += extra_cost->fp[mode == DFmode].addsub;
11483
11484 return true;
11485 }
11486 /* Simple FABS is analogous to FNEG. */
11487 if (speed)
11488 *cost += extra_cost->fp[mode == DFmode].neg;
11489 }
11490 else
11491 {
11492 /* Integer ABS will either be split to
11493 two arithmetic instructions, or will be an ABS
11494 (scalar), which we don't model. */
11495 *cost = COSTS_N_INSNS (2);
11496 if (speed)
11497 *cost += 2 * extra_cost->alu.arith;
11498 }
11499 return false;
11500
11501 case SMAX:
11502 case SMIN:
11503 if (speed)
11504 {
11505 if (VECTOR_MODE_P (mode))
11506 *cost += extra_cost->vect.alu;
11507 else
11508 {
11509 /* FMAXNM/FMINNM/FMAX/FMIN.
11510 TODO: This may not be accurate for all implementations, but
11511 we do not model this in the cost tables. */
11512 *cost += extra_cost->fp[mode == DFmode].addsub;
11513 }
11514 }
11515 return false;
11516
11517 case UNSPEC:
11518 /* The floating point round to integer frint* instructions. */
11519 if (aarch64_frint_unspec_p (XINT (x, 1)))
11520 {
11521 if (speed)
11522 *cost += extra_cost->fp[mode == DFmode].roundint;
11523
11524 return false;
11525 }
11526
11527 if (XINT (x, 1) == UNSPEC_RBIT)
11528 {
11529 if (speed)
11530 *cost += extra_cost->alu.rev;
11531
11532 return false;
11533 }
11534 break;
11535
11536 case TRUNCATE:
11537
11538 /* Decompose <su>muldi3_highpart. */
11539 if (/* (truncate:DI */
11540 mode == DImode
11541 /* (lshiftrt:TI */
11542 && GET_MODE (XEXP (x, 0)) == TImode
11543 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11544 /* (mult:TI */
11545 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11546 /* (ANY_EXTEND:TI (reg:DI))
11547 (ANY_EXTEND:TI (reg:DI))) */
11548 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11549 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11550 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11551 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11552 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11553 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11554 /* (const_int 64) */
11555 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11556 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11557 {
11558 /* UMULH/SMULH. */
11559 if (speed)
11560 *cost += extra_cost->mult[mode == DImode].extend;
11561 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11562 mode, MULT, 0, speed);
11563 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11564 mode, MULT, 1, speed);
11565 return true;
11566 }
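/* Illustrative note (not in the original source): the shape matched
above, written out in one piece, is

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (sign_extend:TI (reg:DI)) (sign_extend:TI (reg:DI)))
         (const_int 64)))

(or the ZERO_EXTEND equivalent), which corresponds to a single SMULH
(or UMULH) instruction.  */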
11567
11568 /* Fall through. */
11569 default:
11570 break;
11571 }
11572
11573 if (dump_file
11574 && flag_aarch64_verbose_cost)
11575 fprintf (dump_file,
11576 "\nFailed to cost RTX. Assuming default cost.\n");
11577
11578 return true;
11579 }
11580
11581 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11582 calculated for X. This cost is stored in *COST. Returns true
11583 if the total cost of X was calculated. */
11584 static bool
11585 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11586 int param, int *cost, bool speed)
11587 {
11588 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11589
11590 if (dump_file
11591 && flag_aarch64_verbose_cost)
11592 {
11593 print_rtl_single (dump_file, x);
11594 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11595 speed ? "Hot" : "Cold",
11596 *cost, result ? "final" : "partial");
11597 }
11598
11599 return result;
11600 }
11601
11602 static int
11603 aarch64_register_move_cost (machine_mode mode,
11604 reg_class_t from_i, reg_class_t to_i)
11605 {
11606 enum reg_class from = (enum reg_class) from_i;
11607 enum reg_class to = (enum reg_class) to_i;
11608 const struct cpu_regmove_cost *regmove_cost
11609 = aarch64_tune_params.regmove_cost;
11610
11611 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11612 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11613 to = GENERAL_REGS;
11614
11615 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11616 from = GENERAL_REGS;
11617
11618 /* Moving between GPR and stack cost is the same as GP2GP. */
11619 if ((from == GENERAL_REGS && to == STACK_REG)
11620 || (to == GENERAL_REGS && from == STACK_REG))
11621 return regmove_cost->GP2GP;
11622
11623 /* To/From the stack register, we move via the gprs. */
11624 if (to == STACK_REG || from == STACK_REG)
11625 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11626 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11627
11628 if (known_eq (GET_MODE_SIZE (mode), 16))
11629 {
11630 /* 128-bit operations on general registers require 2 instructions. */
11631 if (from == GENERAL_REGS && to == GENERAL_REGS)
11632 return regmove_cost->GP2GP * 2;
11633 else if (from == GENERAL_REGS)
11634 return regmove_cost->GP2FP * 2;
11635 else if (to == GENERAL_REGS)
11636 return regmove_cost->FP2GP * 2;
11637
11638 /* When AdvSIMD instructions are disabled it is not possible to move
11639 a 128-bit value directly between Q registers. This is handled in
11640 secondary reload. A general register is used as a scratch to move
11641 the upper DI value and the lower DI value is moved directly,
11642 hence the cost is the sum of three moves. */
11643 if (! TARGET_SIMD)
11644 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11645
11646 return regmove_cost->FP2FP;
11647 }
11648
11649 if (from == GENERAL_REGS && to == GENERAL_REGS)
11650 return regmove_cost->GP2GP;
11651 else if (from == GENERAL_REGS)
11652 return regmove_cost->GP2FP;
11653 else if (to == GENERAL_REGS)
11654 return regmove_cost->FP2GP;
11655
11656 return regmove_cost->FP2FP;
11657 }
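/* Illustrative note (not part of the original source): with the logic
above, a 128-bit (e.g. TImode) copy between a general register pair and
an FP/SIMD register is costed as 2 * GP2FP (or 2 * FP2GP), and when
!TARGET_SIMD a Q-register copy is costed as GP2FP + FP2GP + FP2FP
because the upper half has to bounce through a general register.  */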
11658
11659 static int
11660 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11661 reg_class_t rclass ATTRIBUTE_UNUSED,
11662 bool in ATTRIBUTE_UNUSED)
11663 {
11664 return aarch64_tune_params.memmov_cost;
11665 }
11666
11667 /* Implement TARGET_INIT_BUILTINS. */
11668 static void
11669 aarch64_init_builtins ()
11670 {
11671 aarch64_general_init_builtins ();
11672 }
11673
11674 /* Implement TARGET_FOLD_BUILTIN. */
11675 static tree
11676 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
11677 {
11678 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11679 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11680 tree type = TREE_TYPE (TREE_TYPE (fndecl));
11681 switch (code & AARCH64_BUILTIN_CLASS)
11682 {
11683 case AARCH64_BUILTIN_GENERAL:
11684 return aarch64_general_fold_builtin (subcode, type, nargs, args);
11685 }
11686 gcc_unreachable ();
11687 }
11688
11689 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
11690 static bool
11691 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
11692 {
11693 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
11694 tree fndecl = gimple_call_fndecl (stmt);
11695 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11696 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11697 gimple *new_stmt = NULL;
11698 switch (code & AARCH64_BUILTIN_CLASS)
11699 {
11700 case AARCH64_BUILTIN_GENERAL:
11701 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
11702 break;
11703 }
11704
11705 if (!new_stmt)
11706 return false;
11707
11708 gsi_replace (gsi, new_stmt, true);
11709 return true;
11710 }
11711
11712 /* Implement TARGET_EXPAND_BUILTIN. */
11713 static rtx
11714 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int)
11715 {
11716 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
11717 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11718 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11719 switch (code & AARCH64_BUILTIN_CLASS)
11720 {
11721 case AARCH64_BUILTIN_GENERAL:
11722 return aarch64_general_expand_builtin (subcode, exp, target);
11723 }
11724 gcc_unreachable ();
11725 }
11726
11727 /* Implement TARGET_BUILTIN_DECL. */
11728 static tree
11729 aarch64_builtin_decl (unsigned int code, bool initialize_p)
11730 {
11731 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11732 switch (code & AARCH64_BUILTIN_CLASS)
11733 {
11734 case AARCH64_BUILTIN_GENERAL:
11735 return aarch64_general_builtin_decl (subcode, initialize_p);
11736 }
11737 gcc_unreachable ();
11738 }
11739
11740 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11741 to optimize 1.0/sqrt. */
11742
11743 static bool
11744 use_rsqrt_p (machine_mode mode)
11745 {
11746 return (!flag_trapping_math
11747 && flag_unsafe_math_optimizations
11748 && ((aarch64_tune_params.approx_modes->recip_sqrt
11749 & AARCH64_APPROX_MODE (mode))
11750 || flag_mrecip_low_precision_sqrt));
11751 }
11752
11753 /* Function to decide when to use the approximate reciprocal square root
11754 builtin. */
11755
11756 static tree
11757 aarch64_builtin_reciprocal (tree fndecl)
11758 {
11759 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11760
11761 if (!use_rsqrt_p (mode))
11762 return NULL_TREE;
11763 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
11764 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
11765 switch (code & AARCH64_BUILTIN_CLASS)
11766 {
11767 case AARCH64_BUILTIN_GENERAL:
11768 return aarch64_general_builtin_rsqrt (subcode);
11769 }
11770 gcc_unreachable ();
11771 }
11772
11773 /* Emit instruction sequence to compute either the approximate square root
11774 or its approximate reciprocal, depending on the flag RECP, and return
11775 whether the sequence was emitted or not. */
11776
11777 bool
11778 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11779 {
11780 machine_mode mode = GET_MODE (dst);
11781
11782 if (GET_MODE_INNER (mode) == HFmode)
11783 {
11784 gcc_assert (!recp);
11785 return false;
11786 }
11787
11788 if (!recp)
11789 {
11790 if (!(flag_mlow_precision_sqrt
11791 || (aarch64_tune_params.approx_modes->sqrt
11792 & AARCH64_APPROX_MODE (mode))))
11793 return false;
11794
11795 if (flag_finite_math_only
11796 || flag_trapping_math
11797 || !flag_unsafe_math_optimizations
11798 || optimize_function_for_size_p (cfun))
11799 return false;
11800 }
11801 else
11802 /* Caller assumes we cannot fail. */
11803 gcc_assert (use_rsqrt_p (mode));
11804
11805 machine_mode mmsk = mode_for_int_vector (mode).require ();
11806 rtx xmsk = gen_reg_rtx (mmsk);
11807 if (!recp)
11808 /* When calculating the approximate square root, compare the
11809 argument with 0.0 and create a mask. */
11810 emit_insn (gen_rtx_SET (xmsk,
11811 gen_rtx_NEG (mmsk,
11812 gen_rtx_EQ (mmsk, src,
11813 CONST0_RTX (mode)))));
11814
11815 /* Estimate the approximate reciprocal square root. */
11816 rtx xdst = gen_reg_rtx (mode);
11817 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11818
11819 /* Iterate over the series twice for SF and thrice for DF. */
11820 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11821
11822 /* Optionally iterate over the series once less for faster performance,
11823 while sacrificing some accuracy. */
11824 if ((recp && flag_mrecip_low_precision_sqrt)
11825 || (!recp && flag_mlow_precision_sqrt))
11826 iterations--;
11827
11828 /* Iterate over the series to calculate the approximate reciprocal square
11829 root. */
11830 rtx x1 = gen_reg_rtx (mode);
11831 while (iterations--)
11832 {
11833 rtx x2 = gen_reg_rtx (mode);
11834 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11835
11836 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11837
11838 if (iterations > 0)
11839 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11840 }
11841
11842 if (!recp)
11843 {
11844 /* Qualify the approximate reciprocal square root when the argument is
11845 0.0 by squashing the intermediate result to 0.0. */
11846 rtx xtmp = gen_reg_rtx (mmsk);
11847 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11848 gen_rtx_SUBREG (mmsk, xdst, 0)));
11849 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11850
11851 /* Calculate the approximate square root. */
11852 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11853 }
11854
11855 /* Finalize the approximation. */
11856 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11857
11858 return true;
11859 }
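/* Illustrative note (not in the original source): the refinement loop
above is Newton-Raphson on 1/sqrt(src).  Each FRSQRTS step computes
(3 - src * x^2) / 2, giving the update

     x_{n+1} = x_n * (3 - src * x_n^2) / 2,

which roughly doubles the number of accurate bits per iteration; hence
two iterations for SFmode and three for DFmode.  */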
11860
11861 /* Emit the instruction sequence to compute the approximation for the division
11862 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11863
11864 bool
11865 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11866 {
11867 machine_mode mode = GET_MODE (quo);
11868
11869 if (GET_MODE_INNER (mode) == HFmode)
11870 return false;
11871
11872 bool use_approx_division_p = (flag_mlow_precision_div
11873 || (aarch64_tune_params.approx_modes->division
11874 & AARCH64_APPROX_MODE (mode)));
11875
11876 if (!flag_finite_math_only
11877 || flag_trapping_math
11878 || !flag_unsafe_math_optimizations
11879 || optimize_function_for_size_p (cfun)
11880 || !use_approx_division_p)
11881 return false;
11882
11883 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11884 return false;
11885
11886 /* Estimate the approximate reciprocal. */
11887 rtx xrcp = gen_reg_rtx (mode);
11888 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11889
11890 /* Iterate over the series twice for SF and thrice for DF. */
11891 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11892
11893 /* Optionally iterate over the series once less for faster performance,
11894 while sacrificing some accuracy. */
11895 if (flag_mlow_precision_div)
11896 iterations--;
11897
11898 /* Iterate over the series to calculate the approximate reciprocal. */
11899 rtx xtmp = gen_reg_rtx (mode);
11900 while (iterations--)
11901 {
11902 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11903
11904 if (iterations > 0)
11905 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11906 }
11907
11908 if (num != CONST1_RTX (mode))
11909 {
11910 /* As the approximate reciprocal of DEN is already calculated, only
11911 calculate the approximate division when NUM is not 1.0. */
11912 rtx xnum = force_reg (mode, num);
11913 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11914 }
11915
11916 /* Finalize the approximation. */
11917 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11918 return true;
11919 }
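/* Illustrative note (not in the original source): the loop above is
the analogous Newton-Raphson refinement of 1/den.  Each FRECPS step
computes (2 - den * x), so the update is

     x_{n+1} = x_n * (2 - den * x_n),

and the quotient is then formed as num * (1/den).  */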
11920
11921 /* Return the number of instructions that can be issued per cycle. */
11922 static int
11923 aarch64_sched_issue_rate (void)
11924 {
11925 return aarch64_tune_params.issue_rate;
11926 }
11927
11928 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
11929 static int
11930 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
11931 {
11932 if (DEBUG_INSN_P (insn))
11933 return more;
11934
11935 rtx_code code = GET_CODE (PATTERN (insn));
11936 if (code == USE || code == CLOBBER)
11937 return more;
11938
11939 if (get_attr_type (insn) == TYPE_NO_INSN)
11940 return more;
11941
11942 return more - 1;
11943 }
11944
11945 static int
11946 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11947 {
11948 int issue_rate = aarch64_sched_issue_rate ();
11949
11950 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11951 }
11952
11953
11954 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11955 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11956 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11957
11958 static int
11959 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11960 int ready_index)
11961 {
11962 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11963 }
11964
11965
11966 /* Vectorizer cost model target hooks. */
11967
11968 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11969 static int
11970 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11971 tree vectype,
11972 int misalign ATTRIBUTE_UNUSED)
11973 {
11974 unsigned elements;
11975 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11976 bool fp = false;
11977
11978 if (vectype != NULL)
11979 fp = FLOAT_TYPE_P (vectype);
11980
11981 switch (type_of_cost)
11982 {
11983 case scalar_stmt:
11984 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11985
11986 case scalar_load:
11987 return costs->scalar_load_cost;
11988
11989 case scalar_store:
11990 return costs->scalar_store_cost;
11991
11992 case vector_stmt:
11993 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11994
11995 case vector_load:
11996 return costs->vec_align_load_cost;
11997
11998 case vector_store:
11999 return costs->vec_store_cost;
12000
12001 case vec_to_scalar:
12002 return costs->vec_to_scalar_cost;
12003
12004 case scalar_to_vec:
12005 return costs->scalar_to_vec_cost;
12006
12007 case unaligned_load:
12008 case vector_gather_load:
12009 return costs->vec_unalign_load_cost;
12010
12011 case unaligned_store:
12012 case vector_scatter_store:
12013 return costs->vec_unalign_store_cost;
12014
12015 case cond_branch_taken:
12016 return costs->cond_taken_branch_cost;
12017
12018 case cond_branch_not_taken:
12019 return costs->cond_not_taken_branch_cost;
12020
12021 case vec_perm:
12022 return costs->vec_permute_cost;
12023
12024 case vec_promote_demote:
12025 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12026
12027 case vec_construct:
12028 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12029 return elements / 2 + 1;
12030
12031 default:
12032 gcc_unreachable ();
12033 }
12034 }
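/* Illustrative note (not part of the original source): for the
vec_construct case above, a V4SI vector has 4 subparts and is therefore
costed as 4 / 2 + 1 = 3.  */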
12035
12036 /* Implement targetm.vectorize.add_stmt_cost. */
12037 static unsigned
12038 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
12039 struct _stmt_vec_info *stmt_info, int misalign,
12040 enum vect_cost_model_location where)
12041 {
12042 unsigned *cost = (unsigned *) data;
12043 unsigned retval = 0;
12044
12045 if (flag_vect_cost_model)
12046 {
12047 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
12048 int stmt_cost =
12049 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
12050
12051 /* Statements in an inner loop relative to the loop being
12052 vectorized are weighted more heavily. The value here is
12053 arbitrary and could potentially be improved with analysis. */
12054 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
12055 count *= 50; /* FIXME */
12056
12057 retval = (unsigned) (count * stmt_cost);
12058 cost[where] += retval;
12059 }
12060
12061 return retval;
12062 }
12063
12064 static void initialize_aarch64_code_model (struct gcc_options *);
12065
12066 /* Parse the TO_PARSE string and put the architecture struct that it
12067 selects into RES and the architectural features into ISA_FLAGS.
12068 Return an aarch64_parse_opt_result describing the parse result.
12069 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
12070 When the TO_PARSE string contains an invalid extension,
12071 a copy of the string is created and stored to INVALID_EXTENSION. */
12072
12073 static enum aarch64_parse_opt_result
12074 aarch64_parse_arch (const char *to_parse, const struct processor **res,
12075 uint64_t *isa_flags, std::string *invalid_extension)
12076 {
12077 const char *ext;
12078 const struct processor *arch;
12079 size_t len;
12080
12081 ext = strchr (to_parse, '+');
12082
12083 if (ext != NULL)
12084 len = ext - to_parse;
12085 else
12086 len = strlen (to_parse);
12087
12088 if (len == 0)
12089 return AARCH64_PARSE_MISSING_ARG;
12090
12091
12092 /* Loop through the list of supported ARCHes to find a match. */
12093 for (arch = all_architectures; arch->name != NULL; arch++)
12094 {
12095 if (strlen (arch->name) == len
12096 && strncmp (arch->name, to_parse, len) == 0)
12097 {
12098 uint64_t isa_temp = arch->flags;
12099
12100 if (ext != NULL)
12101 {
12102 /* TO_PARSE string contains at least one extension. */
12103 enum aarch64_parse_opt_result ext_res
12104 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12105
12106 if (ext_res != AARCH64_PARSE_OK)
12107 return ext_res;
12108 }
12109 /* Extension parsing was successful. Confirm the result
12110 arch and ISA flags. */
12111 *res = arch;
12112 *isa_flags = isa_temp;
12113 return AARCH64_PARSE_OK;
12114 }
12115 }
12116
12117 /* ARCH name not found in list. */
12118 return AARCH64_PARSE_INVALID_ARG;
12119 }
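/* Illustrative note (not in the original source): for an option such
as -march=armv8.2-a+sve, the string is split at the first '+', the
"armv8.2-a" prefix is matched against all_architectures, and the
remaining "+sve" extension string is handed to
aarch64_parse_extension.  */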
12120
12121 /* Parse the TO_PARSE string and put the result tuning in RES and the
12122 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
12123 describing the parse result. If there is an error parsing, RES and
12124 ISA_FLAGS are left unchanged.
12125 When the TO_PARSE string contains an invalid extension,
12126 a copy of the string is created and stored to INVALID_EXTENSION. */
12127
12128 static enum aarch64_parse_opt_result
12129 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
12130 uint64_t *isa_flags, std::string *invalid_extension)
12131 {
12132 const char *ext;
12133 const struct processor *cpu;
12134 size_t len;
12135
12136 ext = strchr (to_parse, '+');
12137
12138 if (ext != NULL)
12139 len = ext - to_parse;
12140 else
12141 len = strlen (to_parse);
12142
12143 if (len == 0)
12144 return AARCH64_PARSE_MISSING_ARG;
12145
12146
12147 /* Loop through the list of supported CPUs to find a match. */
12148 for (cpu = all_cores; cpu->name != NULL; cpu++)
12149 {
12150 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
12151 {
12152 uint64_t isa_temp = cpu->flags;
12153
12154
12155 if (ext != NULL)
12156 {
12157 /* TO_PARSE string contains at least one extension. */
12158 enum aarch64_parse_opt_result ext_res
12159 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
12160
12161 if (ext_res != AARCH64_PARSE_OK)
12162 return ext_res;
12163 }
12164 /* Extension parsing was successful. Confirm the result
12165 cpu and ISA flags. */
12166 *res = cpu;
12167 *isa_flags = isa_temp;
12168 return AARCH64_PARSE_OK;
12169 }
12170 }
12171
12172 /* CPU name not found in list. */
12173 return AARCH64_PARSE_INVALID_ARG;
12174 }
12175
12176 /* Parse the TO_PARSE string and put the cpu it selects into RES.
12177 Return an aarch64_parse_opt_result describing the parse result.
12178 If the parsing fails, RES does not change. */
12179
12180 static enum aarch64_parse_opt_result
12181 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12182 {
12183 const struct processor *cpu;
12184
12185 /* Loop through the list of supported CPUs to find a match. */
12186 for (cpu = all_cores; cpu->name != NULL; cpu++)
12187 {
12188 if (strcmp (cpu->name, to_parse) == 0)
12189 {
12190 *res = cpu;
12191 return AARCH64_PARSE_OK;
12192 }
12193 }
12194
12195 /* CPU name not found in list. */
12196 return AARCH64_PARSE_INVALID_ARG;
12197 }
12198
12199 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12200 described in FLAG. If it is, return the index bit for that fusion type.
12201 If not, error (printing OPTION_NAME) and return zero. */
12202
12203 static unsigned int
12204 aarch64_parse_one_option_token (const char *token,
12205 size_t length,
12206 const struct aarch64_flag_desc *flag,
12207 const char *option_name)
12208 {
12209 for (; flag->name != NULL; flag++)
12210 {
12211 if (length == strlen (flag->name)
12212 && !strncmp (flag->name, token, length))
12213 return flag->flag;
12214 }
12215
12216 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12217 return 0;
12218 }
12219
12220 /* Parse OPTION which is a comma-separated list of flags to enable.
12221 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12222 default state we inherit from the CPU tuning structures. OPTION_NAME
12223 gives the top-level option we are parsing in the -moverride string,
12224 for use in error messages. */
12225
12226 static unsigned int
12227 aarch64_parse_boolean_options (const char *option,
12228 const struct aarch64_flag_desc *flags,
12229 unsigned int initial_state,
12230 const char *option_name)
12231 {
12232 const char separator = '.';
12233 const char* specs = option;
12234 const char* ntoken = option;
12235 unsigned int found_flags = initial_state;
12236
12237 while ((ntoken = strchr (specs, separator)))
12238 {
12239 size_t token_length = ntoken - specs;
12240 unsigned token_ops = aarch64_parse_one_option_token (specs,
12241 token_length,
12242 flags,
12243 option_name);
12244 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12245 in the token stream, reset the supported operations. So:
12246
12247 adrp+add.cmp+branch.none.adrp+add
12248
12249 would have the result of turning on only adrp+add fusion. */
12250 if (!token_ops)
12251 found_flags = 0;
12252
12253 found_flags |= token_ops;
12254 specs = ++ntoken;
12255 }
12256
12257 /* We ended with a trailing separator; diagnose it. */
12258 if (!(*specs))
12259 {
12260 error ("%s string ill-formed\n", option_name);
12261 return 0;
12262 }
12263
12264 /* We still have one more token to parse. */
12265 size_t token_length = strlen (specs);
12266 unsigned token_ops = aarch64_parse_one_option_token (specs,
12267 token_length,
12268 flags,
12269 option_name);
12270 if (!token_ops)
12271 found_flags = 0;
12272
12273 found_flags |= token_ops;
12274 return found_flags;
12275 }
12276
12277 /* Support for overriding instruction fusion. */
12278
12279 static void
12280 aarch64_parse_fuse_string (const char *fuse_string,
12281 struct tune_params *tune)
12282 {
12283 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12284 aarch64_fusible_pairs,
12285 tune->fusible_ops,
12286 "fuse=");
12287 }
12288
12289 /* Support for overriding other tuning flags. */
12290
12291 static void
12292 aarch64_parse_tune_string (const char *tune_string,
12293 struct tune_params *tune)
12294 {
12295 tune->extra_tuning_flags
12296 = aarch64_parse_boolean_options (tune_string,
12297 aarch64_tuning_flags,
12298 tune->extra_tuning_flags,
12299 "tune=");
12300 }
12301
12302 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
12303 Accept the valid SVE vector widths allowed by
12304 aarch64_sve_vector_bits_enum and use it to override sve_width
12305 in TUNE. */
12306
12307 static void
12308 aarch64_parse_sve_width_string (const char *tune_string,
12309 struct tune_params *tune)
12310 {
12311 int width = -1;
12312
12313 int n = sscanf (tune_string, "%d", &width);
12314 if (n == EOF)
12315 {
12316 error ("invalid format for sve_width");
12317 return;
12318 }
12319 switch (width)
12320 {
12321 case SVE_128:
12322 case SVE_256:
12323 case SVE_512:
12324 case SVE_1024:
12325 case SVE_2048:
12326 break;
12327 default:
12328 error ("invalid sve_width value: %d", width);
12329 }
12330 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12331 }
12332
12333 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12334 we understand. If it is, extract the option string and hand it off to
12335 the appropriate function. */
12336
12337 void
12338 aarch64_parse_one_override_token (const char* token,
12339 size_t length,
12340 struct tune_params *tune)
12341 {
12342 const struct aarch64_tuning_override_function *fn
12343 = aarch64_tuning_override_functions;
12344
12345 const char *option_part = strchr (token, '=');
12346 if (!option_part)
12347 {
12348 error ("tuning string missing in option (%s)", token);
12349 return;
12350 }
12351
12352 /* Get the length of the option name. */
12353 length = option_part - token;
12354 /* Skip the '=' to get to the option string. */
12355 option_part++;
12356
12357 for (; fn->name != NULL; fn++)
12358 {
12359 if (!strncmp (fn->name, token, length))
12360 {
12361 fn->parse_override (option_part, tune);
12362 return;
12363 }
12364 }
12365
12366 error ("unknown tuning option (%s)",token);
12367 return;
12368 }
12369
12370 /* Validate and clamp the TLS size according to the code model in OPTS. */
12371
12372 static void
12373 initialize_aarch64_tls_size (struct gcc_options *opts)
12374 {
12375 if (aarch64_tls_size == 0)
12376 aarch64_tls_size = 24;
12377
12378 switch (opts->x_aarch64_cmodel_var)
12379 {
12380 case AARCH64_CMODEL_TINY:
12381 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
12382 needs two instructions to address, so we clamp the size to 24 bits. */
12383 if (aarch64_tls_size > 24)
12384 aarch64_tls_size = 24;
12385 break;
12386 case AARCH64_CMODEL_SMALL:
12387 /* The maximum TLS size allowed under small is 4G. */
12388 if (aarch64_tls_size > 32)
12389 aarch64_tls_size = 32;
12390 break;
12391 case AARCH64_CMODEL_LARGE:
12392 /* The maximum TLS size allowed under large is 16E.
12393 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12394 if (aarch64_tls_size > 48)
12395 aarch64_tls_size = 48;
12396 break;
12397 default:
12398 gcc_unreachable ();
12399 }
12400
12401 return;
12402 }
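/* Illustrative note (not in the original source): for example, a
-mtls-size=32 request combined with -mcmodel=tiny is clamped to 24 bits
by the code above, while the same request under -mcmodel=small is
accepted unchanged.  */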
12403
12404 /* Parse STRING looking for options in the format:
12405 string :: option:string
12406 option :: name=substring
12407 name :: {a-z}
12408 substring :: defined by option. */
12409
12410 static void
12411 aarch64_parse_override_string (const char* input_string,
12412 struct tune_params* tune)
12413 {
12414 const char separator = ':';
12415 size_t string_length = strlen (input_string) + 1;
12416 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12417 char *string = string_root;
12418 strncpy (string, input_string, string_length);
12419 string[string_length - 1] = '\0';
12420
12421 char* ntoken = string;
12422
12423 while ((ntoken = strchr (string, separator)))
12424 {
12425 size_t token_length = ntoken - string;
12426 /* Make this substring look like a string. */
12427 *ntoken = '\0';
12428 aarch64_parse_one_override_token (string, token_length, tune);
12429 string = ++ntoken;
12430 }
12431
12432 /* One last option to parse. */
12433 aarch64_parse_one_override_token (string, strlen (string), tune);
12434 free (string_root);
12435 }
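/* Illustrative note (not part of the original source): an option such
as

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

is split at ':' into the tokens "fuse=adrp+add.cmp+branch" and
"sve_width=256", each of which is then dispatched through
aarch64_parse_one_override_token.  */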
12436
12437
12438 static void
12439 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12440 {
12441 if (accepted_branch_protection_string)
12442 {
12443 opts->x_aarch64_branch_protection_string
12444 = xstrdup (accepted_branch_protection_string);
12445 }
12446
12447 /* PR 70044: We have to be careful about being called multiple times for the
12448 same function. This means all changes should be repeatable. */
12449
12450 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12451 Disable the frame pointer flag so the mid-end will not use a frame
12452 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12453 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12454 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12455 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12456 if (opts->x_flag_omit_frame_pointer == 0)
12457 opts->x_flag_omit_frame_pointer = 2;
12458
12459 /* If not optimizing for size, set the default
12460 alignment to what the target wants. */
12461 if (!opts->x_optimize_size)
12462 {
12463 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12464 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12465 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12466 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12467 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12468 opts->x_str_align_functions = aarch64_tune_params.function_align;
12469 }
12470
12471 /* We default to no pc-relative literal loads. */
12472
12473 aarch64_pcrelative_literal_loads = false;
12474
12475 /* If -mpc-relative-literal-loads is set on the command line, this
12476 implies that the user asked for PC relative literal loads. */
12477 if (opts->x_pcrelative_literal_loads == 1)
12478 aarch64_pcrelative_literal_loads = true;
12479
12480 /* In the tiny memory model it makes no sense to disallow PC relative
12481 literal pool loads. */
12482 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12483 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12484 aarch64_pcrelative_literal_loads = true;
12485
12486 /* When enabling the lower precision Newton series for the square root, also
12487 enable it for the reciprocal square root, since the latter is an
12488 intermediary step for the former. */
12489 if (flag_mlow_precision_sqrt)
12490 flag_mrecip_low_precision_sqrt = true;
12491 }
12492
12493 /* 'Unpack' the internal tuning structs and update the options
12494 in OPTS. The caller must have set up selected_tune and selected_arch
12495 as all the other target-specific codegen decisions are
12496 derived from them. */
12497
12498 void
12499 aarch64_override_options_internal (struct gcc_options *opts)
12500 {
12501 aarch64_tune_flags = selected_tune->flags;
12502 aarch64_tune = selected_tune->sched_core;
12503 /* Make a copy of the tuning parameters attached to the core, which
12504 we may later overwrite. */
12505 aarch64_tune_params = *(selected_tune->tune);
12506 aarch64_architecture_version = selected_arch->architecture_version;
12507
12508 if (opts->x_aarch64_override_tune_string)
12509 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12510 &aarch64_tune_params);
12511
12512 /* This target defaults to strict volatile bitfields. */
12513 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12514 opts->x_flag_strict_volatile_bitfields = 1;
12515
12516 if (aarch64_stack_protector_guard == SSP_GLOBAL
12517 && opts->x_aarch64_stack_protector_guard_offset_str)
12518 {
12519 error ("incompatible options %<-mstack-protector-guard=global%> and "
12520 "%<-mstack-protector-guard-offset=%s%>",
12521 aarch64_stack_protector_guard_offset_str);
12522 }
12523
12524 if (aarch64_stack_protector_guard == SSP_SYSREG
12525 && !(opts->x_aarch64_stack_protector_guard_offset_str
12526 && opts->x_aarch64_stack_protector_guard_reg_str))
12527 {
12528 error ("both %<-mstack-protector-guard-offset%> and "
12529 "%<-mstack-protector-guard-reg%> must be used "
12530 "with %<-mstack-protector-guard=sysreg%>");
12531 }
12532
12533 if (opts->x_aarch64_stack_protector_guard_reg_str)
12534 {
12535 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12536 error ("specify a system register with a small string length.");
12537 }
12538
12539 if (opts->x_aarch64_stack_protector_guard_offset_str)
12540 {
12541 char *end;
12542 const char *str = aarch64_stack_protector_guard_offset_str;
12543 errno = 0;
12544 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12545 if (!*str || *end || errno)
12546 error ("%qs is not a valid offset in %qs", str,
12547 "-mstack-protector-guard-offset=");
12548 aarch64_stack_protector_guard_offset = offs;
12549 }
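  /* For illustration (the register name and offset are hypothetical
     examples): with -mstack-protector-guard=sysreg the checks above
     require both of the other options as well, e.g.
     -mstack-protector-guard-reg=sp_el0 -mstack-protector-guard-offset=8.  */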
12550
12551 initialize_aarch64_code_model (opts);
12552 initialize_aarch64_tls_size (opts);
12553
12554 int queue_depth = 0;
12555 switch (aarch64_tune_params.autoprefetcher_model)
12556 {
12557 case tune_params::AUTOPREFETCHER_OFF:
12558 queue_depth = -1;
12559 break;
12560 case tune_params::AUTOPREFETCHER_WEAK:
12561 queue_depth = 0;
12562 break;
12563 case tune_params::AUTOPREFETCHER_STRONG:
12564 queue_depth = max_insn_queue_index + 1;
12565 break;
12566 default:
12567 gcc_unreachable ();
12568 }
12569
12570 /* We don't mind passing in global_options_set here as we don't use
12571 the *options_set structs anyway. */
12572 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12573 queue_depth,
12574 opts->x_param_values,
12575 global_options_set.x_param_values);
12576
12577   /* Set up parameters to be used in the prefetching algorithm.  Do not
12578 override the defaults unless we are tuning for a core we have
12579 researched values for. */
12580 if (aarch64_tune_params.prefetch->num_slots > 0)
12581 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12582 aarch64_tune_params.prefetch->num_slots,
12583 opts->x_param_values,
12584 global_options_set.x_param_values);
12585 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12586 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12587 aarch64_tune_params.prefetch->l1_cache_size,
12588 opts->x_param_values,
12589 global_options_set.x_param_values);
12590 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12591 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12592 aarch64_tune_params.prefetch->l1_cache_line_size,
12593 opts->x_param_values,
12594 global_options_set.x_param_values);
12595 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12596 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12597 aarch64_tune_params.prefetch->l2_cache_size,
12598 opts->x_param_values,
12599 global_options_set.x_param_values);
12600 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12601 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12602 0,
12603 opts->x_param_values,
12604 global_options_set.x_param_values);
12605 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12606 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12607 aarch64_tune_params.prefetch->minimum_stride,
12608 opts->x_param_values,
12609 global_options_set.x_param_values);
12610
12611 /* Use the alternative scheduling-pressure algorithm by default. */
12612 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12613 opts->x_param_values,
12614 global_options_set.x_param_values);
12615
12616 /* If the user hasn't changed it via configure then set the default to 64 KB
12617 for the backend. */
12618 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12619 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12620 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12621 opts->x_param_values,
12622 global_options_set.x_param_values);
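  /* The guard size parameter is expressed as a power of two in bytes,
     so the value 16 used above corresponds to the 64 KB default
     mentioned in the comment.  */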
12623
12624 /* Validate the guard size. */
12625 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12626
12627   /* Enforce that the probing interval is the same as the guard size so the
12628      mid-end does the right thing.  */
12629 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12630 guard_size,
12631 opts->x_param_values,
12632 global_options_set.x_param_values);
12633
12634 /* The maybe_set calls won't update the value if the user has explicitly set
12635      one, which means we need to validate that the probing interval and guard size
12636 are equal. */
12637 int probe_interval
12638 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12639 if (guard_size != probe_interval)
12640 error ("stack clash guard size %<%d%> must be equal to probing interval "
12641 "%<%d%>", guard_size, probe_interval);
12642
12643   /* Enable software prefetching at the specified optimization level for
12644      CPUs that have prefetch tuning data, i.e. a non-negative
12645      default_opt_level in their prefetch tuning structure.  */
12646 if (opts->x_flag_prefetch_loop_arrays < 0
12647 && !opts->x_optimize_size
12648 && aarch64_tune_params.prefetch->default_opt_level >= 0
12649 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12650 opts->x_flag_prefetch_loop_arrays = 1;
12651
12652 if (opts->x_aarch64_arch_string == NULL)
12653 opts->x_aarch64_arch_string = selected_arch->name;
12654 if (opts->x_aarch64_cpu_string == NULL)
12655 opts->x_aarch64_cpu_string = selected_cpu->name;
12656 if (opts->x_aarch64_tune_string == NULL)
12657 opts->x_aarch64_tune_string = selected_tune->name;
12658
12659 aarch64_override_options_after_change_1 (opts);
12660 }
12661
12662 /* Print a hint with a suggestion for a core or architecture name that
12663 most closely resembles what the user passed in STR. ARCH is true if
12664 the user is asking for an architecture name. ARCH is false if the user
12665 is asking for a core name. */
12666
12667 static void
12668 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12669 {
12670 auto_vec<const char *> candidates;
12671 const struct processor *entry = arch ? all_architectures : all_cores;
12672 for (; entry->name != NULL; entry++)
12673 candidates.safe_push (entry->name);
12674
12675 #ifdef HAVE_LOCAL_CPU_DETECT
12676 /* Add also "native" as possible value. */
12677 if (arch)
12678 candidates.safe_push ("native");
12679 #endif
12680
12681 char *s;
12682 const char *hint = candidates_list_and_hint (str, s, candidates);
12683 if (hint)
12684 inform (input_location, "valid arguments are: %s;"
12685 " did you mean %qs?", s, hint);
12686 else
12687 inform (input_location, "valid arguments are: %s", s);
12688
12689 XDELETEVEC (s);
12690 }
12691
12692 /* Print a hint with a suggestion for a core name that most closely resembles
12693 what the user passed in STR. */
12694
12695 inline static void
12696 aarch64_print_hint_for_core (const char *str)
12697 {
12698 aarch64_print_hint_for_core_or_arch (str, false);
12699 }
12700
12701 /* Print a hint with a suggestion for an architecture name that most closely
12702 resembles what the user passed in STR. */
12703
12704 inline static void
12705 aarch64_print_hint_for_arch (const char *str)
12706 {
12707 aarch64_print_hint_for_core_or_arch (str, true);
12708 }
12709
12710
12711 /* Print a hint with a suggestion for an extension name
12712 that most closely resembles what the user passed in STR. */
12713
12714 void
12715 aarch64_print_hint_for_extensions (const std::string &str)
12716 {
12717 auto_vec<const char *> candidates;
12718 aarch64_get_all_extension_candidates (&candidates);
12719 char *s;
12720 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12721 if (hint)
12722 inform (input_location, "valid arguments are: %s;"
12723 " did you mean %qs?", s, hint);
12724 else
12725     inform (input_location, "valid arguments are: %s", s);
12726
12727 XDELETEVEC (s);
12728 }
12729
12730 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12731    specified in STR and throw errors if appropriate.  Put the results,
12732    if they are valid, in RES and ISA_FLAGS.  Return whether the option is
12733 valid. */
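/* For example (illustrative), "cortex-a57+crypto" names a core plus an
   optional list of feature modifiers; unknown cores or modifiers are
   diagnosed below together with a hint.  */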
12734
12735 static bool
12736 aarch64_validate_mcpu (const char *str, const struct processor **res,
12737 uint64_t *isa_flags)
12738 {
12739 std::string invalid_extension;
12740 enum aarch64_parse_opt_result parse_res
12741 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12742
12743 if (parse_res == AARCH64_PARSE_OK)
12744 return true;
12745
12746 switch (parse_res)
12747 {
12748 case AARCH64_PARSE_MISSING_ARG:
12749 error ("missing cpu name in %<-mcpu=%s%>", str);
12750 break;
12751 case AARCH64_PARSE_INVALID_ARG:
12752 error ("unknown value %qs for %<-mcpu%>", str);
12753 aarch64_print_hint_for_core (str);
12754 break;
12755 case AARCH64_PARSE_INVALID_FEATURE:
12756 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12757 invalid_extension.c_str (), str);
12758 aarch64_print_hint_for_extensions (invalid_extension);
12759 break;
12760 default:
12761 gcc_unreachable ();
12762 }
12763
12764 return false;
12765 }
12766
12767 /* Parses CONST_STR for branch protection features specified in
12768    aarch64_branch_protect_types, and sets any global variables required.  Returns
12769 the parsing result and assigns LAST_STR to the last processed token from
12770 CONST_STR so that it can be used for error reporting. */
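/* Illustrative inputs, assuming the usual contents of
   aarch64_branch_protect_types: "none", "standard", "bti" and
   "pac-ret+leaf".  Each '+'-separated token is matched first against
   the types and then against the subtypes of the matched type.  */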
12771
12772 static enum aarch64_parse_opt_result
12773 aarch64_parse_branch_protection (const char *const_str,
12774 				 char **last_str)
12775 {
12776 char *str_root = xstrdup (const_str);
12777 char* token_save = NULL;
12778 char *str = strtok_r (str_root, "+", &token_save);
12779 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12780 if (!str)
12781 res = AARCH64_PARSE_MISSING_ARG;
12782 else
12783 {
12784 char *next_str = strtok_r (NULL, "+", &token_save);
12785 /* Reset the branch protection features to their defaults. */
12786 aarch64_handle_no_branch_protection (NULL, NULL);
12787
12788 while (str && res == AARCH64_PARSE_OK)
12789 {
12790 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12791 bool found = false;
12792 /* Search for this type. */
12793 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12794 {
12795 if (strcmp (str, type->name) == 0)
12796 {
12797 found = true;
12798 res = type->handler (str, next_str);
12799 str = next_str;
12800 next_str = strtok_r (NULL, "+", &token_save);
12801 }
12802 else
12803 type++;
12804 }
12805 if (found && res == AARCH64_PARSE_OK)
12806 {
12807 bool found_subtype = true;
12808 /* Loop through each token until we find one that isn't a
12809 subtype. */
12810 while (found_subtype)
12811 {
12812 found_subtype = false;
12813 const aarch64_branch_protect_type *subtype = type->subtypes;
12814 /* Search for the subtype. */
12815 while (str && subtype && subtype->name && !found_subtype
12816 && res == AARCH64_PARSE_OK)
12817 {
12818 if (strcmp (str, subtype->name) == 0)
12819 {
12820 found_subtype = true;
12821 res = subtype->handler (str, next_str);
12822 str = next_str;
12823 next_str = strtok_r (NULL, "+", &token_save);
12824 }
12825 else
12826 subtype++;
12827 }
12828 }
12829 }
12830 else if (!found)
12831 res = AARCH64_PARSE_INVALID_ARG;
12832 }
12833 }
12834 /* Copy the last processed token into the argument to pass it back.
12835 Used by option and attribute validation to print the offending token. */
12836 if (last_str)
12837 {
12838 if (str) strcpy (*last_str, str);
12839 else *last_str = NULL;
12840 }
12841 if (res == AARCH64_PARSE_OK)
12842 {
12843 /* If needed, alloc the accepted string then copy in const_str.
12844 	 Used by aarch64_override_options_after_change_1.  */
12845 if (!accepted_branch_protection_string)
12846 accepted_branch_protection_string = (char *) xmalloc (
12847 BRANCH_PROTECT_STR_MAX
12848 + 1);
12849 strncpy (accepted_branch_protection_string, const_str,
12850 BRANCH_PROTECT_STR_MAX + 1);
12851 /* Forcibly null-terminate. */
12852 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12853 }
12854 return res;
12855 }
12856
12857 static bool
12858 aarch64_validate_mbranch_protection (const char *const_str)
12859 {
12860   char *str = (char *) xmalloc (strlen (const_str) + 1);
12861 enum aarch64_parse_opt_result res =
12862 aarch64_parse_branch_protection (const_str, &str);
12863 if (res == AARCH64_PARSE_INVALID_ARG)
12864 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12865 else if (res == AARCH64_PARSE_MISSING_ARG)
12866 error ("missing argument for %<-mbranch-protection=%>");
12867 free (str);
12868 return res == AARCH64_PARSE_OK;
12869 }
12870
12871 /* Validate a command-line -march option. Parse the arch and extensions
12872 (if any) specified in STR and throw errors if appropriate. Put the
12873 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12874 option is valid. */
12875
12876 static bool
12877 aarch64_validate_march (const char *str, const struct processor **res,
12878 uint64_t *isa_flags)
12879 {
12880 std::string invalid_extension;
12881 enum aarch64_parse_opt_result parse_res
12882 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12883
12884 if (parse_res == AARCH64_PARSE_OK)
12885 return true;
12886
12887 switch (parse_res)
12888 {
12889 case AARCH64_PARSE_MISSING_ARG:
12890 error ("missing arch name in %<-march=%s%>", str);
12891 break;
12892 case AARCH64_PARSE_INVALID_ARG:
12893 error ("unknown value %qs for %<-march%>", str);
12894 aarch64_print_hint_for_arch (str);
12895 break;
12896 case AARCH64_PARSE_INVALID_FEATURE:
12897 error ("invalid feature modifier %qs in %<-march=%s%>",
12898 invalid_extension.c_str (), str);
12899 aarch64_print_hint_for_extensions (invalid_extension);
12900 break;
12901 default:
12902 gcc_unreachable ();
12903 }
12904
12905 return false;
12906 }
12907
12908 /* Validate a command-line -mtune option. Parse the cpu
12909 specified in STR and throw errors if appropriate. Put the
12910 result, if it is valid, in RES. Return whether the option is
12911 valid. */
12912
12913 static bool
12914 aarch64_validate_mtune (const char *str, const struct processor **res)
12915 {
12916 enum aarch64_parse_opt_result parse_res
12917 = aarch64_parse_tune (str, res);
12918
12919 if (parse_res == AARCH64_PARSE_OK)
12920 return true;
12921
12922 switch (parse_res)
12923 {
12924 case AARCH64_PARSE_MISSING_ARG:
12925 error ("missing cpu name in %<-mtune=%s%>", str);
12926 break;
12927 case AARCH64_PARSE_INVALID_ARG:
12928 error ("unknown value %qs for %<-mtune%>", str);
12929 aarch64_print_hint_for_core (str);
12930 break;
12931 default:
12932 gcc_unreachable ();
12933 }
12934 return false;
12935 }
12936
12937 /* Return the CPU corresponding to the enum CPU.
12938 If it doesn't specify a cpu, return the default. */
12939
12940 static const struct processor *
12941 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12942 {
12943 if (cpu != aarch64_none)
12944 return &all_cores[cpu];
12945
12946 /* The & 0x3f is to extract the bottom 6 bits that encode the
12947 default cpu as selected by the --with-cpu GCC configure option
12948 in config.gcc.
12949 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12950 flags mechanism should be reworked to make it more sane. */
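  /* The remaining upper bits of TARGET_CPU_DEFAULT encode the default
     ISA flags; they are consumed via TARGET_CPU_DEFAULT >> 6 in
     aarch64_override_options below.  */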
12951 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12952 }
12953
12954 /* Return the architecture corresponding to the enum ARCH.
12955 If it doesn't specify a valid architecture, return the default. */
12956
12957 static const struct processor *
12958 aarch64_get_arch (enum aarch64_arch arch)
12959 {
12960 if (arch != aarch64_no_arch)
12961 return &all_architectures[arch];
12962
12963 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12964
12965 return &all_architectures[cpu->arch];
12966 }
12967
12968 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
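/* For example, -msve-vector-bits=256 yields a VG (number of 64-bit
   granules) of 256 / 64 = 4, whereas "scalable" and 128 are handled
   specially below and produce vector-length agnostic code.  */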
12969
12970 static poly_uint16
12971 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12972 {
12973 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12974 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12975 deciding which .md file patterns to use and when deciding whether
12976 something is a legitimate address or constant. */
12977 if (value == SVE_SCALABLE || value == SVE_128)
12978 return poly_uint16 (2, 2);
12979 else
12980 return (int) value / 64;
12981 }
12982
12983 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12984    and is used to parse the -m{cpu,tune,arch} strings and set up the initial
12985 tuning structs. In particular it must set selected_tune and
12986 aarch64_isa_flags that define the available ISA features and tuning
12987 decisions. It must also set selected_arch as this will be used to
12988 output the .arch asm tags for each function. */
12989
12990 static void
12991 aarch64_override_options (void)
12992 {
12993 uint64_t cpu_isa = 0;
12994 uint64_t arch_isa = 0;
12995 aarch64_isa_flags = 0;
12996
12997 bool valid_cpu = true;
12998 bool valid_tune = true;
12999 bool valid_arch = true;
13000
13001 selected_cpu = NULL;
13002 selected_arch = NULL;
13003 selected_tune = NULL;
13004
13005 if (aarch64_branch_protection_string)
13006 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13007
13008 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13009 If either of -march or -mtune is given, they override their
13010 respective component of -mcpu. */
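  /* For example (option values are illustrative): -mcpu=foo -mtune=bar
     keeps foo's architecture but tunes for bar, while -mcpu=foo
     -march=baz targets architecture baz (with a compatibility warning
     below if foo and baz disagree) and keeps foo for tuning.  */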
13011 if (aarch64_cpu_string)
13012 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13013 &cpu_isa);
13014
13015 if (aarch64_arch_string)
13016 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13017 &arch_isa);
13018
13019 if (aarch64_tune_string)
13020 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
13021
13022 #ifdef SUBTARGET_OVERRIDE_OPTIONS
13023 SUBTARGET_OVERRIDE_OPTIONS;
13024 #endif
13025
13026 /* If the user did not specify a processor, choose the default
13027 one for them. This will be the CPU set during configuration using
13028 --with-cpu, otherwise it is "generic". */
13029 if (!selected_cpu)
13030 {
13031 if (selected_arch)
13032 {
13033 selected_cpu = &all_cores[selected_arch->ident];
13034 aarch64_isa_flags = arch_isa;
13035 explicit_arch = selected_arch->arch;
13036 }
13037 else
13038 {
13039 /* Get default configure-time CPU. */
13040 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
13041 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
13042 }
13043
13044 if (selected_tune)
13045 explicit_tune_core = selected_tune->ident;
13046 }
13047 /* If both -mcpu and -march are specified check that they are architecturally
13048 compatible, warn if they're not and prefer the -march ISA flags. */
13049 else if (selected_arch)
13050 {
13051 if (selected_arch->arch != selected_cpu->arch)
13052 {
13053 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
13054 all_architectures[selected_cpu->arch].name,
13055 selected_arch->name);
13056 }
13057 aarch64_isa_flags = arch_isa;
13058 explicit_arch = selected_arch->arch;
13059 explicit_tune_core = selected_tune ? selected_tune->ident
13060 : selected_cpu->ident;
13061 }
13062 else
13063 {
13064 /* -mcpu but no -march. */
13065 aarch64_isa_flags = cpu_isa;
13066 explicit_tune_core = selected_tune ? selected_tune->ident
13067 : selected_cpu->ident;
13068 gcc_assert (selected_cpu);
13069 selected_arch = &all_architectures[selected_cpu->arch];
13070 explicit_arch = selected_arch->arch;
13071 }
13072
13073   /* Set the arch as well, as we will need it when outputting
13074 the .arch directive in assembly. */
13075 if (!selected_arch)
13076 {
13077 gcc_assert (selected_cpu);
13078 selected_arch = &all_architectures[selected_cpu->arch];
13079 }
13080
13081 if (!selected_tune)
13082 selected_tune = selected_cpu;
13083
13084 if (aarch64_enable_bti == 2)
13085 {
13086 #ifdef TARGET_ENABLE_BTI
13087 aarch64_enable_bti = 1;
13088 #else
13089 aarch64_enable_bti = 0;
13090 #endif
13091 }
13092
13093 /* Return address signing is currently not supported for ILP32 targets. For
13094 LP64 targets use the configured option in the absence of a command-line
13095 option for -mbranch-protection. */
13096 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
13097 {
13098 #ifdef TARGET_ENABLE_PAC_RET
13099 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
13100 #else
13101 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
13102 #endif
13103 }
13104
13105 #ifndef HAVE_AS_MABI_OPTION
13106 /* The compiler may have been configured with 2.23.* binutils, which does
13107 not have support for ILP32. */
13108 if (TARGET_ILP32)
13109 error ("assembler does not support %<-mabi=ilp32%>");
13110 #endif
13111
13112 /* Convert -msve-vector-bits to a VG count. */
13113 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
13114
13115 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
13116 sorry ("return address signing is only supported for %<-mabi=lp64%>");
13117
13118 /* Make sure we properly set up the explicit options. */
13119 if ((aarch64_cpu_string && valid_cpu)
13120 || (aarch64_tune_string && valid_tune))
13121 gcc_assert (explicit_tune_core != aarch64_none);
13122
13123 if ((aarch64_cpu_string && valid_cpu)
13124 || (aarch64_arch_string && valid_arch))
13125 gcc_assert (explicit_arch != aarch64_no_arch);
13126
13127 /* The pass to insert speculation tracking runs before
13128 shrink-wrapping and the latter does not know how to update the
13129      tracking status.  So disable shrink-wrapping in this case.  */
13130 if (aarch64_track_speculation)
13131 flag_shrink_wrap = 0;
13132
13133 aarch64_override_options_internal (&global_options);
13134
13135 /* Save these options as the default ones in case we push and pop them later
13136 while processing functions with potential target attributes. */
13137 target_option_default_node = target_option_current_node
13138 = build_target_option_node (&global_options);
13139 }
13140
13141 /* Implement targetm.override_options_after_change. */
13142
13143 static void
13144 aarch64_override_options_after_change (void)
13145 {
13146 aarch64_override_options_after_change_1 (&global_options);
13147 }
13148
13149 static struct machine_function *
13150 aarch64_init_machine_status (void)
13151 {
13152 struct machine_function *machine;
13153 machine = ggc_cleared_alloc<machine_function> ();
13154 return machine;
13155 }
13156
13157 void
13158 aarch64_init_expanders (void)
13159 {
13160 init_machine_status = aarch64_init_machine_status;
13161 }
13162
13163 /* Initialize aarch64_cmodel from the code model and PIC options in OPTS.  */
13164 static void
13165 initialize_aarch64_code_model (struct gcc_options *opts)
13166 {
13167 if (opts->x_flag_pic)
13168 {
13169 switch (opts->x_aarch64_cmodel_var)
13170 {
13171 case AARCH64_CMODEL_TINY:
13172 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
13173 break;
13174 case AARCH64_CMODEL_SMALL:
13175 #ifdef HAVE_AS_SMALL_PIC_RELOCS
13176 aarch64_cmodel = (flag_pic == 2
13177 ? AARCH64_CMODEL_SMALL_PIC
13178 : AARCH64_CMODEL_SMALL_SPIC);
13179 #else
13180 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13181 #endif
13182 break;
13183 case AARCH64_CMODEL_LARGE:
13184 sorry ("code model %qs with %<-f%s%>", "large",
13185 opts->x_flag_pic > 1 ? "PIC" : "pic");
13186 break;
13187 default:
13188 gcc_unreachable ();
13189 }
13190 }
13191 else
13192 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13193 }
13194
13195 /* Implement TARGET_OPTION_SAVE. */
13196
13197 static void
13198 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13199 {
13200 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13201 ptr->x_aarch64_branch_protection_string
13202 = opts->x_aarch64_branch_protection_string;
13203 }
13204
13205 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13206 using the information saved in PTR. */
13207
13208 static void
13209 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13210 {
13211 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13212 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13213 opts->x_explicit_arch = ptr->x_explicit_arch;
13214 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13215 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13216 opts->x_aarch64_branch_protection_string
13217 = ptr->x_aarch64_branch_protection_string;
13218 if (opts->x_aarch64_branch_protection_string)
13219 {
13220 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13221 NULL);
13222 }
13223
13224 aarch64_override_options_internal (opts);
13225 }
13226
13227 /* Implement TARGET_OPTION_PRINT. */
13228
13229 static void
13230 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13231 {
13232 const struct processor *cpu
13233 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13234 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13235 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13236 std::string extension
13237 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13238
13239 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13240 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13241 arch->name, extension.c_str ());
13242 }
13243
13244 static GTY(()) tree aarch64_previous_fndecl;
13245
13246 void
13247 aarch64_reset_previous_fndecl (void)
13248 {
13249 aarch64_previous_fndecl = NULL;
13250 }
13251
13252 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13253 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13254 make sure optab availability predicates are recomputed when necessary. */
13255
13256 void
13257 aarch64_save_restore_target_globals (tree new_tree)
13258 {
13259 if (TREE_TARGET_GLOBALS (new_tree))
13260 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13261 else if (new_tree == target_option_default_node)
13262 restore_target_globals (&default_target_globals);
13263 else
13264 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13265 }
13266
13267 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13268 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13269 of the function, if such exists. This function may be called multiple
13270 times on a single function so use aarch64_previous_fndecl to avoid
13271 setting up identical state. */
13272
13273 static void
13274 aarch64_set_current_function (tree fndecl)
13275 {
13276 if (!fndecl || fndecl == aarch64_previous_fndecl)
13277 return;
13278
13279 tree old_tree = (aarch64_previous_fndecl
13280 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13281 : NULL_TREE);
13282
13283 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13284
13285   /* If the current function has no attributes but the previous one did,
13286 use the default node. */
13287 if (!new_tree && old_tree)
13288 new_tree = target_option_default_node;
13289
13290 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13291 the default have been handled by aarch64_save_restore_target_globals from
13292 aarch64_pragma_target_parse. */
13293 if (old_tree == new_tree)
13294 return;
13295
13296 aarch64_previous_fndecl = fndecl;
13297
13298 /* First set the target options. */
13299 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13300
13301 aarch64_save_restore_target_globals (new_tree);
13302 }
13303
13304 /* Enum describing the various ways we can handle attributes.
13305 In many cases we can reuse the generic option handling machinery. */
13306
13307 enum aarch64_attr_opt_type
13308 {
13309 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13310 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13311 aarch64_attr_enum, /* Attribute sets an enum variable. */
13312 aarch64_attr_custom /* Attribute requires a custom handling function. */
13313 };
13314
13315 /* All the information needed to handle a target attribute.
13316 NAME is the name of the attribute.
13317 ATTR_TYPE specifies the type of behavior of the attribute as described
13318 in the definition of enum aarch64_attr_opt_type.
13319 ALLOW_NEG is true if the attribute supports a "no-" form.
13320    HANDLER is the function that takes the attribute string as an argument.
13321 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13322 OPT_NUM is the enum specifying the option that the attribute modifies.
13323 This is needed for attributes that mirror the behavior of a command-line
13324 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13325 aarch64_attr_enum. */
13326
13327 struct aarch64_attribute_info
13328 {
13329 const char *name;
13330 enum aarch64_attr_opt_type attr_type;
13331 bool allow_neg;
13332 bool (*handler) (const char *);
13333 enum opt_code opt_num;
13334 };
13335
13336 /* Handle the ARCH_STR argument to the arch= target attribute. */
13337
13338 static bool
13339 aarch64_handle_attr_arch (const char *str)
13340 {
13341 const struct processor *tmp_arch = NULL;
13342 std::string invalid_extension;
13343 enum aarch64_parse_opt_result parse_res
13344 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13345
13346 if (parse_res == AARCH64_PARSE_OK)
13347 {
13348 gcc_assert (tmp_arch);
13349 selected_arch = tmp_arch;
13350 explicit_arch = selected_arch->arch;
13351 return true;
13352 }
13353
13354 switch (parse_res)
13355 {
13356 case AARCH64_PARSE_MISSING_ARG:
13357 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13358 break;
13359 case AARCH64_PARSE_INVALID_ARG:
13360 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13361 aarch64_print_hint_for_arch (str);
13362 break;
13363 case AARCH64_PARSE_INVALID_FEATURE:
13364 error ("invalid feature modifier %s of value (\"%s\") in "
13365 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13366 aarch64_print_hint_for_extensions (invalid_extension);
13367 break;
13368 default:
13369 gcc_unreachable ();
13370 }
13371
13372 return false;
13373 }
13374
13375 /* Handle the argument CPU_STR to the cpu= target attribute. */
13376
13377 static bool
13378 aarch64_handle_attr_cpu (const char *str)
13379 {
13380 const struct processor *tmp_cpu = NULL;
13381 std::string invalid_extension;
13382 enum aarch64_parse_opt_result parse_res
13383 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13384
13385 if (parse_res == AARCH64_PARSE_OK)
13386 {
13387 gcc_assert (tmp_cpu);
13388 selected_tune = tmp_cpu;
13389 explicit_tune_core = selected_tune->ident;
13390
13391 selected_arch = &all_architectures[tmp_cpu->arch];
13392 explicit_arch = selected_arch->arch;
13393 return true;
13394 }
13395
13396 switch (parse_res)
13397 {
13398 case AARCH64_PARSE_MISSING_ARG:
13399 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13400 break;
13401 case AARCH64_PARSE_INVALID_ARG:
13402 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13403 aarch64_print_hint_for_core (str);
13404 break;
13405 case AARCH64_PARSE_INVALID_FEATURE:
13406 error ("invalid feature modifier %s of value (\"%s\") in "
13407 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13408 aarch64_print_hint_for_extensions (invalid_extension);
13409 break;
13410 default:
13411 gcc_unreachable ();
13412 }
13413
13414 return false;
13415 }
13416
13417 /* Handle the argument STR to the branch-protection= attribute. */
13418
13419 static bool
13420 aarch64_handle_attr_branch_protection (const char* str)
13421 {
13422   char *err_str = (char *) xmalloc (strlen (str) + 1);
13423 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13424 &err_str);
13425 bool success = false;
13426 switch (res)
13427 {
13428 case AARCH64_PARSE_MISSING_ARG:
13429 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13430 " attribute");
13431 break;
13432 case AARCH64_PARSE_INVALID_ARG:
13433 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13434 "=\")%> pragma or attribute", err_str);
13435 break;
13436 case AARCH64_PARSE_OK:
13437 success = true;
13438 /* Fall through. */
13439 case AARCH64_PARSE_INVALID_FEATURE:
13440 break;
13441 default:
13442 gcc_unreachable ();
13443 }
13444 free (err_str);
13445 return success;
13446 }
13447
13448 /* Handle the argument STR to the tune= target attribute. */
13449
13450 static bool
13451 aarch64_handle_attr_tune (const char *str)
13452 {
13453 const struct processor *tmp_tune = NULL;
13454 enum aarch64_parse_opt_result parse_res
13455 = aarch64_parse_tune (str, &tmp_tune);
13456
13457 if (parse_res == AARCH64_PARSE_OK)
13458 {
13459 gcc_assert (tmp_tune);
13460 selected_tune = tmp_tune;
13461 explicit_tune_core = selected_tune->ident;
13462 return true;
13463 }
13464
13465 switch (parse_res)
13466 {
13467 case AARCH64_PARSE_INVALID_ARG:
13468 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13469 aarch64_print_hint_for_core (str);
13470 break;
13471 default:
13472 gcc_unreachable ();
13473 }
13474
13475 return false;
13476 }
13477
13478 /* Parse an architecture extensions target attribute string specified in STR.
13479 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13480 if successful. Update aarch64_isa_flags to reflect the ISA features
13481 modified. */
13482
13483 static bool
13484 aarch64_handle_attr_isa_flags (char *str)
13485 {
13486 enum aarch64_parse_opt_result parse_res;
13487 uint64_t isa_flags = aarch64_isa_flags;
13488
13489 /* We allow "+nothing" in the beginning to clear out all architectural
13490 features if the user wants to handpick specific features. */
13491 if (strncmp ("+nothing", str, 8) == 0)
13492 {
13493 isa_flags = 0;
13494 str += 8;
13495 }
13496
13497 std::string invalid_extension;
13498 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13499
13500 if (parse_res == AARCH64_PARSE_OK)
13501 {
13502 aarch64_isa_flags = isa_flags;
13503 return true;
13504 }
13505
13506 switch (parse_res)
13507 {
13508 case AARCH64_PARSE_MISSING_ARG:
13509 error ("missing value in %<target()%> pragma or attribute");
13510 break;
13511
13512 case AARCH64_PARSE_INVALID_FEATURE:
13513 error ("invalid feature modifier %s of value (\"%s\") in "
13514 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13515 break;
13516
13517 default:
13518 gcc_unreachable ();
13519 }
13520
13521 return false;
13522 }
13523
13524 /* The target attributes that we support. On top of these we also support just
13525 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13526 handled explicitly in aarch64_process_one_target_attr. */
13527
13528 static const struct aarch64_attribute_info aarch64_attributes[] =
13529 {
13530 { "general-regs-only", aarch64_attr_mask, false, NULL,
13531 OPT_mgeneral_regs_only },
13532 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13533 OPT_mfix_cortex_a53_835769 },
13534 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13535 OPT_mfix_cortex_a53_843419 },
13536 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13537 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13538 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13539 OPT_momit_leaf_frame_pointer },
13540 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13541 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13542 OPT_march_ },
13543 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13544 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13545 OPT_mtune_ },
13546 { "branch-protection", aarch64_attr_custom, false,
13547 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13548 { "sign-return-address", aarch64_attr_enum, false, NULL,
13549 OPT_msign_return_address_ },
13550 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13551 };
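/* Illustrative only: with the table above, an attribute such as
   __attribute__ ((target ("arch=armv8-a+crc,no-strict-align"))) is
   split on ',' by aarch64_process_target_attr and each token is then
   handled by aarch64_process_one_target_attr below.  */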
13552
13553 /* Parse ARG_STR which contains the definition of one target attribute.
13554 Show appropriate errors if any or return true if the attribute is valid. */
13555
13556 static bool
13557 aarch64_process_one_target_attr (char *arg_str)
13558 {
13559 bool invert = false;
13560
13561 size_t len = strlen (arg_str);
13562
13563 if (len == 0)
13564 {
13565 error ("malformed %<target()%> pragma or attribute");
13566 return false;
13567 }
13568
13569 char *str_to_check = (char *) alloca (len + 1);
13570 strcpy (str_to_check, arg_str);
13571
13572 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13573 It is easier to detect and handle it explicitly here rather than going
13574 through the machinery for the rest of the target attributes in this
13575 function. */
13576 if (*str_to_check == '+')
13577 return aarch64_handle_attr_isa_flags (str_to_check);
13578
13579 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13580 {
13581 invert = true;
13582 str_to_check += 3;
13583 }
13584 char *arg = strchr (str_to_check, '=');
13585
13586 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13587 and point ARG to "foo". */
13588 if (arg)
13589 {
13590 *arg = '\0';
13591 arg++;
13592 }
13593 const struct aarch64_attribute_info *p_attr;
13594 bool found = false;
13595 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13596 {
13597 /* If the names don't match up, or the user has given an argument
13598 to an attribute that doesn't accept one, or didn't give an argument
13599 to an attribute that expects one, fail to match. */
13600 if (strcmp (str_to_check, p_attr->name) != 0)
13601 continue;
13602
13603 found = true;
13604 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13605 || p_attr->attr_type == aarch64_attr_enum;
13606
13607 if (attr_need_arg_p ^ (arg != NULL))
13608 {
13609 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13610 return false;
13611 }
13612
13613 /* If the name matches but the attribute does not allow "no-" versions
13614 then we can't match. */
13615 if (invert && !p_attr->allow_neg)
13616 {
13617 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13618 return false;
13619 }
13620
13621 switch (p_attr->attr_type)
13622 {
13623 /* Has a custom handler registered.
13624 For example, cpu=, arch=, tune=. */
13625 case aarch64_attr_custom:
13626 gcc_assert (p_attr->handler);
13627 if (!p_attr->handler (arg))
13628 return false;
13629 break;
13630
13631 /* Either set or unset a boolean option. */
13632 case aarch64_attr_bool:
13633 {
13634 struct cl_decoded_option decoded;
13635
13636 generate_option (p_attr->opt_num, NULL, !invert,
13637 CL_TARGET, &decoded);
13638 aarch64_handle_option (&global_options, &global_options_set,
13639 &decoded, input_location);
13640 break;
13641 }
13642 /* Set or unset a bit in the target_flags. aarch64_handle_option
13643 should know what mask to apply given the option number. */
13644 case aarch64_attr_mask:
13645 {
13646 struct cl_decoded_option decoded;
13647 /* We only need to specify the option number.
13648 aarch64_handle_option will know which mask to apply. */
13649 decoded.opt_index = p_attr->opt_num;
13650 decoded.value = !invert;
13651 aarch64_handle_option (&global_options, &global_options_set,
13652 &decoded, input_location);
13653 break;
13654 }
13655 /* Use the option setting machinery to set an option to an enum. */
13656 case aarch64_attr_enum:
13657 {
13658 gcc_assert (arg);
13659 bool valid;
13660 int value;
13661 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13662 &value, CL_TARGET);
13663 if (valid)
13664 {
13665 set_option (&global_options, NULL, p_attr->opt_num, value,
13666 NULL, DK_UNSPECIFIED, input_location,
13667 global_dc);
13668 }
13669 else
13670 {
13671 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13672 }
13673 break;
13674 }
13675 default:
13676 gcc_unreachable ();
13677 }
13678 }
13679
13680 /* If we reached here we either have found an attribute and validated
13681 it or didn't match any. If we matched an attribute but its arguments
13682 were malformed we will have returned false already. */
13683 return found;
13684 }
13685
13686 /* Count how many times the character C appears in
13687 NULL-terminated string STR. */
13688
13689 static unsigned int
13690 num_occurences_in_str (char c, char *str)
13691 {
13692 unsigned int res = 0;
13693 while (*str != '\0')
13694 {
13695 if (*str == c)
13696 res++;
13697
13698 str++;
13699 }
13700
13701 return res;
13702 }
13703
13704 /* Parse the tree in ARGS that contains the target attribute information
13705 and update the global target options space. */
13706
13707 bool
13708 aarch64_process_target_attr (tree args)
13709 {
13710 if (TREE_CODE (args) == TREE_LIST)
13711 {
13712 do
13713 {
13714 tree head = TREE_VALUE (args);
13715 if (head)
13716 {
13717 if (!aarch64_process_target_attr (head))
13718 return false;
13719 }
13720 args = TREE_CHAIN (args);
13721 } while (args);
13722
13723 return true;
13724 }
13725
13726 if (TREE_CODE (args) != STRING_CST)
13727 {
13728 error ("attribute %<target%> argument not a string");
13729 return false;
13730 }
13731
13732 size_t len = strlen (TREE_STRING_POINTER (args));
13733 char *str_to_check = (char *) alloca (len + 1);
13734 strcpy (str_to_check, TREE_STRING_POINTER (args));
13735
13736 if (len == 0)
13737 {
13738 error ("malformed %<target()%> pragma or attribute");
13739 return false;
13740 }
13741
13742   /* Used to catch empty strings between commas, e.g.
13743 attribute ((target ("attr1,,attr2"))). */
13744 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13745
13746 /* Handle multiple target attributes separated by ','. */
13747 char *token = strtok_r (str_to_check, ",", &str_to_check);
13748
13749 unsigned int num_attrs = 0;
13750 while (token)
13751 {
13752 num_attrs++;
13753 if (!aarch64_process_one_target_attr (token))
13754 {
13755 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13756 return false;
13757 }
13758
13759 token = strtok_r (NULL, ",", &str_to_check);
13760 }
13761
13762 if (num_attrs != num_commas + 1)
13763 {
13764 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13765 return false;
13766 }
13767
13768 return true;
13769 }
13770
13771 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13772 process attribute ((target ("..."))). */
13773
13774 static bool
13775 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13776 {
13777 struct cl_target_option cur_target;
13778 bool ret;
13779 tree old_optimize;
13780 tree new_target, new_optimize;
13781 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13782
13783 /* If what we're processing is the current pragma string then the
13784 target option node is already stored in target_option_current_node
13785 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13786 having to re-parse the string. This is especially useful to keep
13787 arm_neon.h compile times down since that header contains a lot
13788 of intrinsics enclosed in pragmas. */
13789 if (!existing_target && args == current_target_pragma)
13790 {
13791 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13792 return true;
13793 }
13794 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13795
13796 old_optimize = build_optimization_node (&global_options);
13797 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13798
13799 /* If the function changed the optimization levels as well as setting
13800 target options, start with the optimizations specified. */
13801 if (func_optimize && func_optimize != old_optimize)
13802 cl_optimization_restore (&global_options,
13803 TREE_OPTIMIZATION (func_optimize));
13804
13805 /* Save the current target options to restore at the end. */
13806 cl_target_option_save (&cur_target, &global_options);
13807
13808 /* If fndecl already has some target attributes applied to it, unpack
13809 them so that we add this attribute on top of them, rather than
13810 overwriting them. */
13811 if (existing_target)
13812 {
13813 struct cl_target_option *existing_options
13814 = TREE_TARGET_OPTION (existing_target);
13815
13816 if (existing_options)
13817 cl_target_option_restore (&global_options, existing_options);
13818 }
13819 else
13820 cl_target_option_restore (&global_options,
13821 TREE_TARGET_OPTION (target_option_current_node));
13822
13823 ret = aarch64_process_target_attr (args);
13824
13825 /* Set up any additional state. */
13826 if (ret)
13827 {
13828 aarch64_override_options_internal (&global_options);
13829 /* Initialize SIMD builtins if we haven't already.
13830 Set current_target_pragma to NULL for the duration so that
13831 the builtin initialization code doesn't try to tag the functions
13832 being built with the attributes specified by any current pragma, thus
13833 going into an infinite recursion. */
13834 if (TARGET_SIMD)
13835 {
13836 tree saved_current_target_pragma = current_target_pragma;
13837 current_target_pragma = NULL;
13838 aarch64_init_simd_builtins ();
13839 current_target_pragma = saved_current_target_pragma;
13840 }
13841 new_target = build_target_option_node (&global_options);
13842 }
13843 else
13844 new_target = NULL;
13845
13846 new_optimize = build_optimization_node (&global_options);
13847
13848 if (fndecl && ret)
13849 {
13850 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13851
13852 if (old_optimize != new_optimize)
13853 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13854 }
13855
13856 cl_target_option_restore (&global_options, &cur_target);
13857
13858 if (old_optimize != new_optimize)
13859 cl_optimization_restore (&global_options,
13860 TREE_OPTIMIZATION (old_optimize));
13861 return ret;
13862 }
13863
13864 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
13865    tri-bool options (yes, no, don't care), with DONT_CARE marking the
13866    don't-care value and DEF the default, return true if inlining is allowed.  */
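/* For illustration, with DONT_CARE == 2 and DEF == 1 (the values used
   for the -momit-leaf-frame-pointer check below): caller 0 / callee 2
   allows inlining (the callee doesn't care), caller 0 / callee 1
   allows it (the callee uses the default), and caller 1 / callee 0
   rejects it.  */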
13867
13868 static bool
13869 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13870 int dont_care, int def)
13871 {
13872 /* If the callee doesn't care, always allow inlining. */
13873 if (callee == dont_care)
13874 return true;
13875
13876 /* If the caller doesn't care, always allow inlining. */
13877 if (caller == dont_care)
13878 return true;
13879
13880 /* Otherwise, allow inlining if either the callee and caller values
13881 agree, or if the callee is using the default value. */
13882 return (callee == caller || callee == def);
13883 }
13884
13885 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13886 to inline CALLEE into CALLER based on target-specific info.
13887 Make sure that the caller and callee have compatible architectural
13888 features. Then go through the other possible target attributes
13889 and see if they can block inlining. Try not to reject always_inline
13890 callees unless they are incompatible architecturally. */
13891
13892 static bool
13893 aarch64_can_inline_p (tree caller, tree callee)
13894 {
13895 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13896 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13897
13898 struct cl_target_option *caller_opts
13899 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13900 : target_option_default_node);
13901
13902 struct cl_target_option *callee_opts
13903 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13904 : target_option_default_node);
13905
13906 /* Callee's ISA flags should be a subset of the caller's. */
13907 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13908 != callee_opts->x_aarch64_isa_flags)
13909 return false;
13910
13911   /* Allow inlining of non-strict aligned functions into strict
13912      aligned ones.  */
13913 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13914 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13915 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13916 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13917 return false;
13918
13919 bool always_inline = lookup_attribute ("always_inline",
13920 DECL_ATTRIBUTES (callee));
13921
13922 /* If the architectural features match up and the callee is always_inline
13923 then the other attributes don't matter. */
13924 if (always_inline)
13925 return true;
13926
13927 if (caller_opts->x_aarch64_cmodel_var
13928 != callee_opts->x_aarch64_cmodel_var)
13929 return false;
13930
13931 if (caller_opts->x_aarch64_tls_dialect
13932 != callee_opts->x_aarch64_tls_dialect)
13933 return false;
13934
13935   /* Honour explicit requests to work around errata.  */
13936 if (!aarch64_tribools_ok_for_inlining_p (
13937 caller_opts->x_aarch64_fix_a53_err835769,
13938 callee_opts->x_aarch64_fix_a53_err835769,
13939 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13940 return false;
13941
13942 if (!aarch64_tribools_ok_for_inlining_p (
13943 caller_opts->x_aarch64_fix_a53_err843419,
13944 callee_opts->x_aarch64_fix_a53_err843419,
13945 2, TARGET_FIX_ERR_A53_843419))
13946 return false;
13947
13948 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13949      caller and callee and they don't match up, reject inlining.  */
13950 if (!aarch64_tribools_ok_for_inlining_p (
13951 caller_opts->x_flag_omit_leaf_frame_pointer,
13952 callee_opts->x_flag_omit_leaf_frame_pointer,
13953 2, 1))
13954 return false;
13955
13956 /* If the callee has specific tuning overrides, respect them. */
13957 if (callee_opts->x_aarch64_override_tune_string != NULL
13958 && caller_opts->x_aarch64_override_tune_string == NULL)
13959 return false;
13960
13961 /* If the user specified tuning override strings for the
13962 caller and callee and they don't match up, reject inlining.
13963 We just do a string compare here, we don't analyze the meaning
13964 of the string, as it would be too costly for little gain. */
13965 if (callee_opts->x_aarch64_override_tune_string
13966 && caller_opts->x_aarch64_override_tune_string
13967 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13968 caller_opts->x_aarch64_override_tune_string) != 0))
13969 return false;
13970
13971 return true;
13972 }
13973
13974 /* Return true if SYMBOL_REF X binds locally. */
13975
13976 static bool
13977 aarch64_symbol_binds_local_p (const_rtx x)
13978 {
13979 return (SYMBOL_REF_DECL (x)
13980 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13981 : SYMBOL_REF_LOCAL_P (x));
13982 }
13983
13984 /* Return true if SYMBOL_REF X is thread local.  */
13985 static bool
13986 aarch64_tls_symbol_p (rtx x)
13987 {
13988 if (! TARGET_HAVE_TLS)
13989 return false;
13990
13991 if (GET_CODE (x) != SYMBOL_REF)
13992 return false;
13993
13994 return SYMBOL_REF_TLS_MODEL (x) != 0;
13995 }
13996
13997 /* Classify a TLS symbol into one of the TLS kinds. */
13998 enum aarch64_symbol_type
13999 aarch64_classify_tls_symbol (rtx x)
14000 {
14001 enum tls_model tls_kind = tls_symbolic_operand_type (x);
14002
14003 switch (tls_kind)
14004 {
14005 case TLS_MODEL_GLOBAL_DYNAMIC:
14006 case TLS_MODEL_LOCAL_DYNAMIC:
14007 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
14008
14009 case TLS_MODEL_INITIAL_EXEC:
14010 switch (aarch64_cmodel)
14011 {
14012 case AARCH64_CMODEL_TINY:
14013 case AARCH64_CMODEL_TINY_PIC:
14014 return SYMBOL_TINY_TLSIE;
14015 default:
14016 return SYMBOL_SMALL_TLSIE;
14017 }
14018
14019 case TLS_MODEL_LOCAL_EXEC:
14020 if (aarch64_tls_size == 12)
14021 return SYMBOL_TLSLE12;
14022 else if (aarch64_tls_size == 24)
14023 return SYMBOL_TLSLE24;
14024 else if (aarch64_tls_size == 32)
14025 return SYMBOL_TLSLE32;
14026 else if (aarch64_tls_size == 48)
14027 return SYMBOL_TLSLE48;
14028 else
14029 gcc_unreachable ();
14030
14031 case TLS_MODEL_EMULATED:
14032 case TLS_MODEL_NONE:
14033 return SYMBOL_FORCE_TO_MEM;
14034
14035 default:
14036 gcc_unreachable ();
14037 }
14038 }
14039
14040 /* Return the correct method for accessing X + OFFSET, where X is either
14041 a SYMBOL_REF or LABEL_REF. */
14042
14043 enum aarch64_symbol_type
14044 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
14045 {
14046 if (GET_CODE (x) == LABEL_REF)
14047 {
14048 switch (aarch64_cmodel)
14049 {
14050 case AARCH64_CMODEL_LARGE:
14051 return SYMBOL_FORCE_TO_MEM;
14052
14053 case AARCH64_CMODEL_TINY_PIC:
14054 case AARCH64_CMODEL_TINY:
14055 return SYMBOL_TINY_ABSOLUTE;
14056
14057 case AARCH64_CMODEL_SMALL_SPIC:
14058 case AARCH64_CMODEL_SMALL_PIC:
14059 case AARCH64_CMODEL_SMALL:
14060 return SYMBOL_SMALL_ABSOLUTE;
14061
14062 default:
14063 gcc_unreachable ();
14064 }
14065 }
14066
14067 if (GET_CODE (x) == SYMBOL_REF)
14068 {
14069 if (aarch64_tls_symbol_p (x))
14070 return aarch64_classify_tls_symbol (x);
14071
14072 switch (aarch64_cmodel)
14073 {
14074 case AARCH64_CMODEL_TINY:
14075 /* When we retrieve symbol + offset address, we have to make sure
14076 the offset does not cause overflow of the final address. But
14077 we have no way of knowing the address of symbol at compile time
14078 so we can't accurately say if the distance between the PC and
14079 	     symbol + offset is outside the addressable range of +/-1M in the
14080 	     TINY code model.  So we rely on images not being greater than
14081 	     1M and cap the offset at 1M; anything beyond that will have to
14082 	     be loaded using an alternative mechanism.  Furthermore, if the
14083 symbol is a weak reference to something that isn't known to
14084 resolve to a symbol in this module, then force to memory. */
14085 if ((SYMBOL_REF_WEAK (x)
14086 && !aarch64_symbol_binds_local_p (x))
14087 || !IN_RANGE (offset, -1048575, 1048575))
14088 return SYMBOL_FORCE_TO_MEM;
14089 return SYMBOL_TINY_ABSOLUTE;
14090
14091 case AARCH64_CMODEL_SMALL:
14092 /* Same reasoning as the tiny code model, but the offset cap here is
14093 4G. */
14094 if ((SYMBOL_REF_WEAK (x)
14095 && !aarch64_symbol_binds_local_p (x))
14096 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
14097 HOST_WIDE_INT_C (4294967264)))
14098 return SYMBOL_FORCE_TO_MEM;
14099 return SYMBOL_SMALL_ABSOLUTE;
14100
14101 case AARCH64_CMODEL_TINY_PIC:
14102 if (!aarch64_symbol_binds_local_p (x))
14103 return SYMBOL_TINY_GOT;
14104 return SYMBOL_TINY_ABSOLUTE;
14105
14106 case AARCH64_CMODEL_SMALL_SPIC:
14107 case AARCH64_CMODEL_SMALL_PIC:
14108 if (!aarch64_symbol_binds_local_p (x))
14109 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
14110 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
14111 return SYMBOL_SMALL_ABSOLUTE;
14112
14113 case AARCH64_CMODEL_LARGE:
14114 /* This is alright even in PIC code as the constant
14115 pool reference is always PC relative and within
14116 the same translation unit. */
14117 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
14118 return SYMBOL_SMALL_ABSOLUTE;
14119 else
14120 return SYMBOL_FORCE_TO_MEM;
14121
14122 default:
14123 gcc_unreachable ();
14124 }
14125 }
14126
14127 /* By default push everything into the constant pool. */
14128 return SYMBOL_FORCE_TO_MEM;
14129 }
14130
14131 bool
14132 aarch64_constant_address_p (rtx x)
14133 {
14134 return (CONSTANT_P (x) && memory_address_p (DImode, x));
14135 }
14136
14137 bool
14138 aarch64_legitimate_pic_operand_p (rtx x)
14139 {
14140 if (GET_CODE (x) == SYMBOL_REF
14141 || (GET_CODE (x) == CONST
14142 && GET_CODE (XEXP (x, 0)) == PLUS
14143 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
14144 return false;
14145
14146 return true;
14147 }
14148
14149 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
14150 that should be rematerialized rather than spilled. */
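/* For instance (illustrative only, following the checks below): a
   CONST_INT such as 0x123456 or a non-TLS SYMBOL_REF is cheap to
   recompute and is accepted, whereas a TLS SYMBOL_REF needs a
   multi-instruction sequence and is rejected so that it is spilled
   instead:

     (const_int 0x123456)        -> true  (rematerialize)
     (symbol_ref "external_var") -> true  (non-TLS symbols are constants)
     (symbol_ref "tls_var")      -> false (spill rather than recompute)  */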
14151
14152 static bool
14153 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
14154 {
14155 /* Support CSE and rematerialization of common constants. */
14156 if (CONST_INT_P (x)
14157 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
14158 || GET_CODE (x) == CONST_VECTOR)
14159 return true;
14160
14161 /* Do not allow vector struct mode constants for Advanced SIMD.
14162 We could support 0 and -1 easily, but they need support in
14163 aarch64-simd.md. */
14164 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14165 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14166 return false;
14167
14168 /* Only accept variable-length vector constants if they can be
14169 handled directly.
14170
14171 ??? It would be possible to handle rematerialization of other
14172 constants via secondary reloads. */
14173 if (vec_flags & VEC_ANY_SVE)
14174 return aarch64_simd_valid_immediate (x, NULL);
14175
14176 if (GET_CODE (x) == HIGH)
14177 x = XEXP (x, 0);
14178
14179 /* Accept polynomial constants that can be calculated by using the
14180 destination of a move as the sole temporary. Constants that
14181 require a second temporary cannot be rematerialized (they can't be
14182 forced to memory and also aren't legitimate constants). */
14183 poly_int64 offset;
14184 if (poly_int_rtx_p (x, &offset))
14185 return aarch64_offset_temporaries (false, offset) <= 1;
14186
14187 /* If an offset is being added to something else, we need to allow the
14188 base to be moved into the destination register, meaning that there
14189 are no free temporaries for the offset. */
14190 x = strip_offset (x, &offset);
14191 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14192 return false;
14193
14194 /* Do not allow const (plus (anchor_symbol, const_int)). */
14195 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14196 return false;
14197
14198 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14199 so spilling them is better than rematerialization. */
14200 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14201 return true;
14202
14203 /* Label references are always constant. */
14204 if (GET_CODE (x) == LABEL_REF)
14205 return true;
14206
14207 return false;
14208 }
14209
14210 rtx
14211 aarch64_load_tp (rtx target)
14212 {
14213 if (!target
14214 || GET_MODE (target) != Pmode
14215 || !register_operand (target, Pmode))
14216 target = gen_reg_rtx (Pmode);
14217
14218 /* Can return in any reg. */
14219 emit_insn (gen_aarch64_load_tp_hard (target));
14220 return target;
14221 }
14222
14223 /* On AAPCS systems, this is the "struct __va_list". */
14224 static GTY(()) tree va_list_type;
14225
14226 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14227 Return the type to use as __builtin_va_list.
14228
14229 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14230
14231 struct __va_list
14232 {
14233 void *__stack;
14234 void *__gr_top;
14235 void *__vr_top;
14236 int __gr_offs;
14237 int __vr_offs;
14238 }; */
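/* Illustrative sketch of how the fields are used by the expanders below:
   __stack points at the next anonymous argument passed on the stack,
   __gr_top and __vr_top point just past the general-register and
   FP/SIMD-register save areas, and __gr_offs/__vr_offs hold the
   (negative) byte offset from the corresponding *_top of the next unread
   register argument.  A non-negative offset means the remaining
   arguments of that class live on the stack.  */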
14239
14240 static tree
14241 aarch64_build_builtin_va_list (void)
14242 {
14243 tree va_list_name;
14244 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14245
14246 /* Create the type. */
14247 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14248 /* Give it the required name. */
14249 va_list_name = build_decl (BUILTINS_LOCATION,
14250 TYPE_DECL,
14251 get_identifier ("__va_list"),
14252 va_list_type);
14253 DECL_ARTIFICIAL (va_list_name) = 1;
14254 TYPE_NAME (va_list_type) = va_list_name;
14255 TYPE_STUB_DECL (va_list_type) = va_list_name;
14256
14257 /* Create the fields. */
14258 f_stack = build_decl (BUILTINS_LOCATION,
14259 FIELD_DECL, get_identifier ("__stack"),
14260 ptr_type_node);
14261 f_grtop = build_decl (BUILTINS_LOCATION,
14262 FIELD_DECL, get_identifier ("__gr_top"),
14263 ptr_type_node);
14264 f_vrtop = build_decl (BUILTINS_LOCATION,
14265 FIELD_DECL, get_identifier ("__vr_top"),
14266 ptr_type_node);
14267 f_groff = build_decl (BUILTINS_LOCATION,
14268 FIELD_DECL, get_identifier ("__gr_offs"),
14269 integer_type_node);
14270 f_vroff = build_decl (BUILTINS_LOCATION,
14271 FIELD_DECL, get_identifier ("__vr_offs"),
14272 integer_type_node);
14273
14274 /* Tell the tree-stdarg pass about our internal offset fields.
14275 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14276 purposes, to identify whether the code is updating va_list internal
14277 offset fields in an irregular way. */
14278 va_list_gpr_counter_field = f_groff;
14279 va_list_fpr_counter_field = f_vroff;
14280
14281 DECL_ARTIFICIAL (f_stack) = 1;
14282 DECL_ARTIFICIAL (f_grtop) = 1;
14283 DECL_ARTIFICIAL (f_vrtop) = 1;
14284 DECL_ARTIFICIAL (f_groff) = 1;
14285 DECL_ARTIFICIAL (f_vroff) = 1;
14286
14287 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14288 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14289 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14290 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14291 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14292
14293 TYPE_FIELDS (va_list_type) = f_stack;
14294 DECL_CHAIN (f_stack) = f_grtop;
14295 DECL_CHAIN (f_grtop) = f_vrtop;
14296 DECL_CHAIN (f_vrtop) = f_groff;
14297 DECL_CHAIN (f_groff) = f_vroff;
14298
14299 /* Compute its layout. */
14300 layout_type (va_list_type);
14301
14302 return va_list_type;
14303 }
14304
14305 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
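/* A sketch of the initialization emitted below for a callee such as
   "void f (int n, ...)", where only w0 is used by named arguments
   (illustrative; INCOMING stands for virtual_incoming_args_rtx and the
   exact sizes depend on the analyses below):

     ap.__stack   = INCOMING;       // no named arguments on the stack
     ap.__gr_top  = INCOMING;       // GR save area ends here
     ap.__vr_top  = INCOMING - 64;  // ROUND_UP (56, 16) bytes below it
     ap.__gr_offs = -56;            // x1-x7 still to be read
     ap.__vr_offs = -128;           // q0-q7 still to be read  */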
14306 static void
14307 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14308 {
14309 const CUMULATIVE_ARGS *cum;
14310 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14311 tree stack, grtop, vrtop, groff, vroff;
14312 tree t;
14313 int gr_save_area_size = cfun->va_list_gpr_size;
14314 int vr_save_area_size = cfun->va_list_fpr_size;
14315 int vr_offset;
14316
14317 cum = &crtl->args.info;
14318 if (cfun->va_list_gpr_size)
14319 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14320 cfun->va_list_gpr_size);
14321 if (cfun->va_list_fpr_size)
14322 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14323 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14324
14325 if (!TARGET_FLOAT)
14326 {
14327 gcc_assert (cum->aapcs_nvrn == 0);
14328 vr_save_area_size = 0;
14329 }
14330
14331 f_stack = TYPE_FIELDS (va_list_type_node);
14332 f_grtop = DECL_CHAIN (f_stack);
14333 f_vrtop = DECL_CHAIN (f_grtop);
14334 f_groff = DECL_CHAIN (f_vrtop);
14335 f_vroff = DECL_CHAIN (f_groff);
14336
14337 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14338 NULL_TREE);
14339 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14340 NULL_TREE);
14341 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14342 NULL_TREE);
14343 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14344 NULL_TREE);
14345 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14346 NULL_TREE);
14347
14348 /* Emit code to initialize STACK, which points to the next varargs stack
14349 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14350 by named arguments. STACK is 8-byte aligned. */
14351 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14352 if (cum->aapcs_stack_size > 0)
14353 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14354 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14355 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14356
14357 /* Emit code to initialize GRTOP, the top of the GR save area.
14358 virtual_incoming_args_rtx should have been 16-byte aligned. */
14359 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14360 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14361 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14362
14363 /* Emit code to initialize VRTOP, the top of the VR save area.
14364 This address is gr_save_area_bytes below GRTOP, rounded
14365 down to the next 16-byte boundary. */
14366 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14367 vr_offset = ROUND_UP (gr_save_area_size,
14368 STACK_BOUNDARY / BITS_PER_UNIT);
14369
14370 if (vr_offset)
14371 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14372 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14373 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14374
14375 /* Emit code to initialize GROFF, the offset from GRTOP of the
14376 next GPR argument. */
14377 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14378 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14379 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14380
14381 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14382 of the next VR argument. */
14383 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14384 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14385 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14386 }
14387
14388 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
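/* The expansion built below has roughly this shape (illustrative
   pseudo-code for an argument passed in general registers; the FP/SIMD
   path is analogous but uses __vr_top/__vr_offs):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                     // registers already exhausted
     ap.__gr_offs = off + rsize;          // consume the register slot(s)
     if (ap.__gr_offs > 0)
       goto on_stack;                     // would straddle; use the stack
     addr = ap.__gr_top + off;            // read from the GR save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8; // advance past the argument
   done:
     ...                                  // plus alignment and big-endian
                                          // fix-ups  */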
14389
14390 static tree
14391 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14392 gimple_seq *post_p ATTRIBUTE_UNUSED)
14393 {
14394 tree addr;
14395 bool indirect_p;
14396 bool is_ha; /* is HFA or HVA. */
14397 bool dw_align; /* double-word align. */
14398 machine_mode ag_mode = VOIDmode;
14399 int nregs;
14400 machine_mode mode;
14401
14402 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14403 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14404 HOST_WIDE_INT size, rsize, adjust, align;
14405 tree t, u, cond1, cond2;
14406
14407 indirect_p = pass_va_arg_by_reference (type);
14408 if (indirect_p)
14409 type = build_pointer_type (type);
14410
14411 mode = TYPE_MODE (type);
14412
14413 f_stack = TYPE_FIELDS (va_list_type_node);
14414 f_grtop = DECL_CHAIN (f_stack);
14415 f_vrtop = DECL_CHAIN (f_grtop);
14416 f_groff = DECL_CHAIN (f_vrtop);
14417 f_vroff = DECL_CHAIN (f_groff);
14418
14419 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14420 f_stack, NULL_TREE);
14421 size = int_size_in_bytes (type);
14422
14423 bool abi_break;
14424 align
14425 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14426
14427 dw_align = false;
14428 adjust = 0;
14429 if (aarch64_vfp_is_call_or_return_candidate (mode,
14430 type,
14431 &ag_mode,
14432 &nregs,
14433 &is_ha))
14434 {
14435 /* No frontends can create types with variable-sized modes, so we
14436 shouldn't be asked to pass or return them. */
14437 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14438
14439 /* TYPE passed in fp/simd registers. */
14440 if (!TARGET_FLOAT)
14441 aarch64_err_no_fpadvsimd (mode);
14442
14443 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14444 unshare_expr (valist), f_vrtop, NULL_TREE);
14445 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14446 unshare_expr (valist), f_vroff, NULL_TREE);
14447
14448 rsize = nregs * UNITS_PER_VREG;
14449
14450 if (is_ha)
14451 {
14452 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14453 adjust = UNITS_PER_VREG - ag_size;
14454 }
14455 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14456 && size < UNITS_PER_VREG)
14457 {
14458 adjust = UNITS_PER_VREG - size;
14459 }
14460 }
14461 else
14462 {
14463 /* TYPE passed in general registers. */
14464 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14465 unshare_expr (valist), f_grtop, NULL_TREE);
14466 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14467 unshare_expr (valist), f_groff, NULL_TREE);
14468 rsize = ROUND_UP (size, UNITS_PER_WORD);
14469 nregs = rsize / UNITS_PER_WORD;
14470
14471 if (align > 8)
14472 {
14473 if (abi_break && warn_psabi)
14474 inform (input_location, "parameter passing for argument of type "
14475 "%qT changed in GCC 9.1", type);
14476 dw_align = true;
14477 }
14478
14479 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14480 && size < UNITS_PER_WORD)
14481 {
14482 adjust = UNITS_PER_WORD - size;
14483 }
14484 }
14485
14486 /* Get a local temporary for the field value. */
14487 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14488
14489 /* Emit code to branch if off >= 0. */
14490 t = build2 (GE_EXPR, boolean_type_node, off,
14491 build_int_cst (TREE_TYPE (off), 0));
14492 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14493
14494 if (dw_align)
14495 {
14496 /* Emit: offs = (offs + 15) & -16. */
14497 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14498 build_int_cst (TREE_TYPE (off), 15));
14499 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14500 build_int_cst (TREE_TYPE (off), -16));
14501 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14502 }
14503 else
14504 roundup = NULL;
14505
14506 /* Update ap.__[g|v]r_offs */
14507 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14508 build_int_cst (TREE_TYPE (off), rsize));
14509 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14510
14511 /* Chain up: apply the roundup (if any) before the update. */
14512 if (roundup)
14513 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14514
14515 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14516 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14517 build_int_cst (TREE_TYPE (f_off), 0));
14518 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14519
14520 /* String up: make sure the assignment happens before the use. */
14521 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14522 COND_EXPR_ELSE (cond1) = t;
14523
14524 /* Prepare the trees handling the argument that is passed on the stack;
14525 the top-level node is stored in ON_STACK. */
14526 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14527 if (align > 8)
14528 {
14529 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14530 t = fold_build_pointer_plus_hwi (arg, 15);
14531 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14532 build_int_cst (TREE_TYPE (t), -16));
14533 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14534 }
14535 else
14536 roundup = NULL;
14537 /* Advance ap.__stack */
14538 t = fold_build_pointer_plus_hwi (arg, size + 7);
14539 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14540 build_int_cst (TREE_TYPE (t), -8));
14541 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14542 /* String up roundup and advance. */
14543 if (roundup)
14544 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14545 /* Chain up with ARG so the result is the argument address. */
14546 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14547 /* Big-endianness related address adjustment. */
14548 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14549 && size < UNITS_PER_WORD)
14550 {
14551 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14552 size_int (UNITS_PER_WORD - size));
14553 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14554 }
14555
14556 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14557 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14558
14559 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14560 t = off;
14561 if (adjust)
14562 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14563 build_int_cst (TREE_TYPE (off), adjust));
14564
14565 t = fold_convert (sizetype, t);
14566 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14567
14568 if (is_ha)
14569 {
14570 /* type ha; // treat as "struct {ftype field[n];}"
14571 ... [computing offs]
14572 for (i = 0; i < nregs; ++i, offs += 16)
14573 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14574 return ha; */
14575 int i;
14576 tree tmp_ha, field_t, field_ptr_t;
14577
14578 /* Declare a local variable. */
14579 tmp_ha = create_tmp_var_raw (type, "ha");
14580 gimple_add_tmp_var (tmp_ha);
14581
14582 /* Establish the base type. */
14583 switch (ag_mode)
14584 {
14585 case E_SFmode:
14586 field_t = float_type_node;
14587 field_ptr_t = float_ptr_type_node;
14588 break;
14589 case E_DFmode:
14590 field_t = double_type_node;
14591 field_ptr_t = double_ptr_type_node;
14592 break;
14593 case E_TFmode:
14594 field_t = long_double_type_node;
14595 field_ptr_t = long_double_ptr_type_node;
14596 break;
14597 case E_HFmode:
14598 field_t = aarch64_fp16_type_node;
14599 field_ptr_t = aarch64_fp16_ptr_type_node;
14600 break;
14601 case E_V2SImode:
14602 case E_V4SImode:
14603 {
14604 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14605 field_t = build_vector_type_for_mode (innertype, ag_mode);
14606 field_ptr_t = build_pointer_type (field_t);
14607 }
14608 break;
14609 default:
14610 gcc_assert (0);
14611 }
14612
14613 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area). */
14614 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14615 addr = t;
14616 t = fold_convert (field_ptr_t, addr);
14617 t = build2 (MODIFY_EXPR, field_t,
14618 build1 (INDIRECT_REF, field_t, tmp_ha),
14619 build1 (INDIRECT_REF, field_t, t));
14620
14621 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14622 for (i = 1; i < nregs; ++i)
14623 {
14624 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14625 u = fold_convert (field_ptr_t, addr);
14626 u = build2 (MODIFY_EXPR, field_t,
14627 build2 (MEM_REF, field_t, tmp_ha,
14628 build_int_cst (field_ptr_t,
14629 (i *
14630 int_size_in_bytes (field_t)))),
14631 build1 (INDIRECT_REF, field_t, u));
14632 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14633 }
14634
14635 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14636 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14637 }
14638
14639 COND_EXPR_ELSE (cond2) = t;
14640 addr = fold_convert (build_pointer_type (type), cond1);
14641 addr = build_va_arg_indirect_ref (addr);
14642
14643 if (indirect_p)
14644 addr = build_va_arg_indirect_ref (addr);
14645
14646 return addr;
14647 }
14648
14649 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14650
14651 static void
14652 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
14653 const function_arg_info &arg,
14654 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
14655 {
14656 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14657 CUMULATIVE_ARGS local_cum;
14658 int gr_saved = cfun->va_list_gpr_size;
14659 int vr_saved = cfun->va_list_fpr_size;
14660
14661 /* The caller has advanced CUM up to, but not beyond, the last named
14662 argument. Advance a local copy of CUM past the last "real" named
14663 argument, to find out how many registers are left over. */
14664 local_cum = *cum;
14665 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
14666
14667 /* Find out how many registers we need to save.
14668 Honor the tree-stdarg analysis results. */
14669 if (cfun->va_list_gpr_size)
14670 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14671 cfun->va_list_gpr_size / UNITS_PER_WORD);
14672 if (cfun->va_list_fpr_size)
14673 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14674 cfun->va_list_fpr_size / UNITS_PER_VREG);
14675
14676 if (!TARGET_FLOAT)
14677 {
14678 gcc_assert (local_cum.aapcs_nvrn == 0);
14679 vr_saved = 0;
14680 }
14681
14682 if (!no_rtl)
14683 {
14684 if (gr_saved > 0)
14685 {
14686 rtx ptr, mem;
14687
14688 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14689 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14690 - gr_saved * UNITS_PER_WORD);
14691 mem = gen_frame_mem (BLKmode, ptr);
14692 set_mem_alias_set (mem, get_varargs_alias_set ());
14693
14694 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14695 mem, gr_saved);
14696 }
14697 if (vr_saved > 0)
14698 {
14699 /* We can't use move_block_from_reg, because it will use
14700 the wrong mode, storing D regs only. */
14701 machine_mode mode = TImode;
14702 int off, i, vr_start;
14703
14704 /* Set OFF to the offset from virtual_incoming_args_rtx of
14705 the first vector register. The VR save area lies below
14706 the GR one, and is aligned to 16 bytes. */
14707 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14708 STACK_BOUNDARY / BITS_PER_UNIT);
14709 off -= vr_saved * UNITS_PER_VREG;
14710
14711 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14712 for (i = 0; i < vr_saved; ++i)
14713 {
14714 rtx ptr, mem;
14715
14716 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14717 mem = gen_frame_mem (mode, ptr);
14718 set_mem_alias_set (mem, get_varargs_alias_set ());
14719 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14720 off += UNITS_PER_VREG;
14721 }
14722 }
14723 }
14724
14725 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14726 any complication of having crtl->args.pretend_args_size changed. */
14727 cfun->machine->frame.saved_varargs_size
14728 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14729 STACK_BOUNDARY / BITS_PER_UNIT)
14730 + vr_saved * UNITS_PER_VREG);
14731 }
14732
14733 static void
14734 aarch64_conditional_register_usage (void)
14735 {
14736 int i;
14737 if (!TARGET_FLOAT)
14738 {
14739 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14740 {
14741 fixed_regs[i] = 1;
14742 call_used_regs[i] = 1;
14743 }
14744 }
14745 if (!TARGET_SVE)
14746 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14747 {
14748 fixed_regs[i] = 1;
14749 call_used_regs[i] = 1;
14750 }
14751
14752 /* When tracking speculation, we need a couple of call-clobbered registers
14753 to track the speculation state. It would be nice to just use
14754 IP0 and IP1, but currently there are numerous places that just
14755 assume these registers are free for other uses (eg pointer
14756 authentication). */
14757 if (aarch64_track_speculation)
14758 {
14759 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14760 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14761 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14762 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14763 }
14764 }
14765
14766 /* Walk down the type tree of TYPE counting consecutive base elements.
14767 If *MODEP is VOIDmode, then set it to the first valid floating point
14768 type. If a non-floating point type is found, or if a floating point
14769 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14770 otherwise return the count in the sub-tree. */
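/* For example (illustrative):

     struct { float x, y, z; }      -> 3, *MODEP == SFmode (an HFA)
     struct { double d[2]; }        -> 2, *MODEP == DFmode
     struct { float f; double d; }  -> -1 (mixed base types)
     struct { float f; int i; }     -> -1 (non-floating-point member)  */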
14771 static int
14772 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14773 {
14774 machine_mode mode;
14775 HOST_WIDE_INT size;
14776
14777 switch (TREE_CODE (type))
14778 {
14779 case REAL_TYPE:
14780 mode = TYPE_MODE (type);
14781 if (mode != DFmode && mode != SFmode
14782 && mode != TFmode && mode != HFmode)
14783 return -1;
14784
14785 if (*modep == VOIDmode)
14786 *modep = mode;
14787
14788 if (*modep == mode)
14789 return 1;
14790
14791 break;
14792
14793 case COMPLEX_TYPE:
14794 mode = TYPE_MODE (TREE_TYPE (type));
14795 if (mode != DFmode && mode != SFmode
14796 && mode != TFmode && mode != HFmode)
14797 return -1;
14798
14799 if (*modep == VOIDmode)
14800 *modep = mode;
14801
14802 if (*modep == mode)
14803 return 2;
14804
14805 break;
14806
14807 case VECTOR_TYPE:
14808 /* Use V2SImode and V4SImode as representatives of all 64-bit
14809 and 128-bit vector types. */
14810 size = int_size_in_bytes (type);
14811 switch (size)
14812 {
14813 case 8:
14814 mode = V2SImode;
14815 break;
14816 case 16:
14817 mode = V4SImode;
14818 break;
14819 default:
14820 return -1;
14821 }
14822
14823 if (*modep == VOIDmode)
14824 *modep = mode;
14825
14826 /* Vector modes are considered to be opaque: two vectors are
14827 equivalent for the purposes of being homogeneous aggregates
14828 if they are the same size. */
14829 if (*modep == mode)
14830 return 1;
14831
14832 break;
14833
14834 case ARRAY_TYPE:
14835 {
14836 int count;
14837 tree index = TYPE_DOMAIN (type);
14838
14839 /* Can't handle incomplete types nor sizes that are not
14840 fixed. */
14841 if (!COMPLETE_TYPE_P (type)
14842 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14843 return -1;
14844
14845 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14846 if (count == -1
14847 || !index
14848 || !TYPE_MAX_VALUE (index)
14849 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14850 || !TYPE_MIN_VALUE (index)
14851 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14852 || count < 0)
14853 return -1;
14854
14855 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14856 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14857
14858 /* There must be no padding. */
14859 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14860 count * GET_MODE_BITSIZE (*modep)))
14861 return -1;
14862
14863 return count;
14864 }
14865
14866 case RECORD_TYPE:
14867 {
14868 int count = 0;
14869 int sub_count;
14870 tree field;
14871
14872 /* Can't handle incomplete types nor sizes that are not
14873 fixed. */
14874 if (!COMPLETE_TYPE_P (type)
14875 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14876 return -1;
14877
14878 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14879 {
14880 if (TREE_CODE (field) != FIELD_DECL)
14881 continue;
14882
14883 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14884 if (sub_count < 0)
14885 return -1;
14886 count += sub_count;
14887 }
14888
14889 /* There must be no padding. */
14890 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14891 count * GET_MODE_BITSIZE (*modep)))
14892 return -1;
14893
14894 return count;
14895 }
14896
14897 case UNION_TYPE:
14898 case QUAL_UNION_TYPE:
14899 {
14900 /* These aren't very interesting except in a degenerate case. */
14901 int count = 0;
14902 int sub_count;
14903 tree field;
14904
14905 /* Can't handle incomplete types nor sizes that are not
14906 fixed. */
14907 if (!COMPLETE_TYPE_P (type)
14908 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14909 return -1;
14910
14911 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14912 {
14913 if (TREE_CODE (field) != FIELD_DECL)
14914 continue;
14915
14916 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14917 if (sub_count < 0)
14918 return -1;
14919 count = count > sub_count ? count : sub_count;
14920 }
14921
14922 /* There must be no padding. */
14923 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14924 count * GET_MODE_BITSIZE (*modep)))
14925 return -1;
14926
14927 return count;
14928 }
14929
14930 default:
14931 break;
14932 }
14933
14934 return -1;
14935 }
14936
14937 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14938 type as described in AAPCS64 \S 4.1.2.
14939
14940 See the comment above aarch64_composite_type_p for the notes on MODE. */
14941
14942 static bool
14943 aarch64_short_vector_p (const_tree type,
14944 machine_mode mode)
14945 {
14946 poly_int64 size = -1;
14947
14948 if (type && TREE_CODE (type) == VECTOR_TYPE)
14949 size = int_size_in_bytes (type);
14950 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14951 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14952 size = GET_MODE_SIZE (mode);
14953
14954 return known_eq (size, 8) || known_eq (size, 16);
14955 }
14956
14957 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14958 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14959 array types. The C99 floating-point complex types are also considered
14960 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14961 types, which are GCC extensions and out of the scope of AAPCS64, are
14962 treated as composite types here as well.
14963
14964 Note that MODE itself is not sufficient in determining whether a type
14965 is such a composite type or not. This is because
14966 stor-layout.c:compute_record_mode may have already changed the MODE
14967 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14968 structure with only one field may have its MODE set to the mode of the
14969 field. Also, an integer mode whose size matches the size of the
14970 RECORD_TYPE type may have been substituted for the original mode
14971 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14972 solely relied on. */
14973
14974 static bool
14975 aarch64_composite_type_p (const_tree type,
14976 machine_mode mode)
14977 {
14978 if (aarch64_short_vector_p (type, mode))
14979 return false;
14980
14981 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14982 return true;
14983
14984 if (mode == BLKmode
14985 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14986 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14987 return true;
14988
14989 return false;
14990 }
14991
14992 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14993 shall be passed or returned in simd/fp register(s) (providing these
14994 parameter passing registers are available).
14995
14996 Upon successful return, *COUNT returns the number of needed registers,
14997 *BASE_MODE returns the mode of the individual register and, when IS_HA
14998 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14999 floating-point aggregate or a homogeneous short-vector aggregate. */
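/* For example (illustrative): a _Complex double argument yields
   *COUNT == 2, *BASE_MODE == DFmode and *IS_HA == true; a
   struct { float f[4]; } yields *COUNT == 4, *BASE_MODE == SFmode and
   *IS_HA == true; a plain double yields *COUNT == 1, *BASE_MODE == DFmode
   and *IS_HA == false.  */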
15000
15001 static bool
15002 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
15003 const_tree type,
15004 machine_mode *base_mode,
15005 int *count,
15006 bool *is_ha)
15007 {
15008 machine_mode new_mode = VOIDmode;
15009 bool composite_p = aarch64_composite_type_p (type, mode);
15010
15011 if (is_ha != NULL) *is_ha = false;
15012
15013 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
15014 || aarch64_short_vector_p (type, mode))
15015 {
15016 *count = 1;
15017 new_mode = mode;
15018 }
15019 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
15020 {
15021 if (is_ha != NULL) *is_ha = true;
15022 *count = 2;
15023 new_mode = GET_MODE_INNER (mode);
15024 }
15025 else if (type && composite_p)
15026 {
15027 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
15028
15029 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
15030 {
15031 if (is_ha != NULL) *is_ha = true;
15032 *count = ag_count;
15033 }
15034 else
15035 return false;
15036 }
15037 else
15038 return false;
15039
15040 *base_mode = new_mode;
15041 return true;
15042 }
15043
15044 /* Implement TARGET_STRUCT_VALUE_RTX. */
15045
15046 static rtx
15047 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
15048 int incoming ATTRIBUTE_UNUSED)
15049 {
15050 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
15051 }
15052
15053 /* Implements target hook vector_mode_supported_p. */
15054 static bool
15055 aarch64_vector_mode_supported_p (machine_mode mode)
15056 {
15057 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15058 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
15059 }
15060
15061 /* Return the full-width SVE vector mode for element mode MODE, if one
15062 exists. */
15063 opt_machine_mode
15064 aarch64_full_sve_mode (scalar_mode mode)
15065 {
15066 switch (mode)
15067 {
15068 case E_DFmode:
15069 return VNx2DFmode;
15070 case E_SFmode:
15071 return VNx4SFmode;
15072 case E_HFmode:
15073 return VNx8HFmode;
15074 case E_DImode:
15075 return VNx2DImode;
15076 case E_SImode:
15077 return VNx4SImode;
15078 case E_HImode:
15079 return VNx8HImode;
15080 case E_QImode:
15081 return VNx16QImode;
15082 default:
15083 return opt_machine_mode ();
15084 }
15085 }
15086
15087 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
15088 if it exists. */
15089 opt_machine_mode
15090 aarch64_vq_mode (scalar_mode mode)
15091 {
15092 switch (mode)
15093 {
15094 case E_DFmode:
15095 return V2DFmode;
15096 case E_SFmode:
15097 return V4SFmode;
15098 case E_HFmode:
15099 return V8HFmode;
15100 case E_SImode:
15101 return V4SImode;
15102 case E_HImode:
15103 return V8HImode;
15104 case E_QImode:
15105 return V16QImode;
15106 case E_DImode:
15107 return V2DImode;
15108 default:
15109 return opt_machine_mode ();
15110 }
15111 }
15112
15113 /* Return appropriate SIMD container
15114 for MODE within a vector of WIDTH bits. */
15115 static machine_mode
15116 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
15117 {
15118 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
15119 return aarch64_full_sve_mode (mode).else_mode (word_mode);
15120
15121 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
15122 if (TARGET_SIMD)
15123 {
15124 if (known_eq (width, 128))
15125 return aarch64_vq_mode (mode).else_mode (word_mode);
15126 else
15127 switch (mode)
15128 {
15129 case E_SFmode:
15130 return V2SFmode;
15131 case E_HFmode:
15132 return V4HFmode;
15133 case E_SImode:
15134 return V2SImode;
15135 case E_HImode:
15136 return V4HImode;
15137 case E_QImode:
15138 return V8QImode;
15139 default:
15140 break;
15141 }
15142 }
15143 return word_mode;
15144 }
15145
15146 /* Return 128-bit container as the preferred SIMD mode for MODE. */
15147 static machine_mode
15148 aarch64_preferred_simd_mode (scalar_mode mode)
15149 {
15150 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
15151 return aarch64_simd_container_mode (mode, bits);
15152 }
15153
15154 /* Return a list of possible vector sizes for the vectorizer
15155 to iterate over. */
15156 static void
15157 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
15158 {
15159 if (TARGET_SVE)
15160 sizes->safe_push (BYTES_PER_SVE_VECTOR);
15161 sizes->safe_push (16);
15162 sizes->safe_push (8);
15163 }
15164
15165 /* Implement TARGET_MANGLE_TYPE. */
15166
15167 static const char *
15168 aarch64_mangle_type (const_tree type)
15169 {
15170 /* The AArch64 ABI documents say that "__va_list" has to be
15171 mangled as if it is in the "std" namespace. */
15172 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
15173 return "St9__va_list";
15174
15175 /* Half-precision float. */
15176 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
15177 return "Dh";
15178
15179 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15180 builtin types. */
15181 if (TYPE_NAME (type) != NULL)
15182 return aarch64_general_mangle_builtin_type (type);
15183
15184 /* Use the default mangling. */
15185 return NULL;
15186 }
15187
15188 /* Find the first rtx_insn before insn that will generate an assembly
15189 instruction. */
15190
15191 static rtx_insn *
15192 aarch64_prev_real_insn (rtx_insn *insn)
15193 {
15194 if (!insn)
15195 return NULL;
15196
15197 do
15198 {
15199 insn = prev_real_insn (insn);
15200 }
15201 while (insn && recog_memoized (insn) < 0);
15202
15203 return insn;
15204 }
15205
15206 static bool
15207 is_madd_op (enum attr_type t1)
15208 {
15209 unsigned int i;
15210 /* A number of these may be AArch32 only. */
15211 enum attr_type mlatypes[] = {
15212 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15213 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15214 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15215 };
15216
15217 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15218 {
15219 if (t1 == mlatypes[i])
15220 return true;
15221 }
15222
15223 return false;
15224 }
15225
15226 /* Check if there is a register dependency between a load and the insn
15227 for which we hold recog_data. */
15228
15229 static bool
15230 dep_between_memop_and_curr (rtx memop)
15231 {
15232 rtx load_reg;
15233 int opno;
15234
15235 gcc_assert (GET_CODE (memop) == SET);
15236
15237 if (!REG_P (SET_DEST (memop)))
15238 return false;
15239
15240 load_reg = SET_DEST (memop);
15241 for (opno = 1; opno < recog_data.n_operands; opno++)
15242 {
15243 rtx operand = recog_data.operand[opno];
15244 if (REG_P (operand)
15245 && reg_overlap_mentioned_p (load_reg, operand))
15246 return true;
15247
15248 }
15249 return false;
15250 }
15251
15252
15253 /* When working around the Cortex-A53 erratum 835769,
15254 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15255 instruction and has a preceding memory instruction such that a NOP
15256 should be inserted between them. */
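/* With -mfix-cortex-a53-835769 the effect, via FINAL_PRESCAN_INSN below,
   is to emit e.g. (illustrative assembly):

     ldr  x1, [x2]
     nop    // between mem op and mult-accumulate
     madd x0, x3, x4, x5  */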
15257
15258 bool
15259 aarch64_madd_needs_nop (rtx_insn* insn)
15260 {
15261 enum attr_type attr_type;
15262 rtx_insn *prev;
15263 rtx body;
15264
15265 if (!TARGET_FIX_ERR_A53_835769)
15266 return false;
15267
15268 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15269 return false;
15270
15271 attr_type = get_attr_type (insn);
15272 if (!is_madd_op (attr_type))
15273 return false;
15274
15275 prev = aarch64_prev_real_insn (insn);
15276 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15277 Restore recog state to INSN to avoid state corruption. */
15278 extract_constrain_insn_cached (insn);
15279
15280 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15281 return false;
15282
15283 body = single_set (prev);
15284
15285 /* If the previous insn is a memory op and there is no dependency between
15286 it and the DImode madd, emit a NOP between them. If body is NULL then we
15287 have a complex memory operation, probably a load/store pair.
15288 Be conservative for now and emit a NOP. */
15289 if (GET_MODE (recog_data.operand[0]) == DImode
15290 && (!body || !dep_between_memop_and_curr (body)))
15291 return true;
15292
15293 return false;
15294
15295 }
15296
15297
15298 /* Implement FINAL_PRESCAN_INSN. */
15299
15300 void
15301 aarch64_final_prescan_insn (rtx_insn *insn)
15302 {
15303 if (aarch64_madd_needs_nop (insn))
15304 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15305 }
15306
15307
15308 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15309 instruction. */
15310
15311 bool
15312 aarch64_sve_index_immediate_p (rtx base_or_step)
15313 {
15314 return (CONST_INT_P (base_or_step)
15315 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15316 }
15317
15318 /* Return true if X is a valid immediate for the SVE ADD and SUB
15319 instructions. Negate X first if NEGATE_P is true. */
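/* For example (illustrative): a duplicated element value of 200 or of
   0x1100 is accepted (either 0..255, or a multiple of 256 up to 0xff00),
   whereas 257 (0x101) is rejected because it needs both bytes.  */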
15320
15321 bool
15322 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15323 {
15324 rtx elt;
15325
15326 if (!const_vec_duplicate_p (x, &elt)
15327 || !CONST_INT_P (elt))
15328 return false;
15329
15330 HOST_WIDE_INT val = INTVAL (elt);
15331 if (negate_p)
15332 val = -val;
15333 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15334
15335 if (val & 0xff)
15336 return IN_RANGE (val, 0, 0xff);
15337 return IN_RANGE (val, 0, 0xff00);
15338 }
15339
15340 /* Return true if X is a valid immediate operand for an SVE logical
15341 instruction such as AND. */
15342
15343 bool
15344 aarch64_sve_bitmask_immediate_p (rtx x)
15345 {
15346 rtx elt;
15347
15348 return (const_vec_duplicate_p (x, &elt)
15349 && CONST_INT_P (elt)
15350 && aarch64_bitmask_imm (INTVAL (elt),
15351 GET_MODE_INNER (GET_MODE (x))));
15352 }
15353
15354 /* Return true if X is a valid immediate for the SVE DUP and CPY
15355 instructions. */
15356
15357 bool
15358 aarch64_sve_dup_immediate_p (rtx x)
15359 {
15360 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15361 if (!CONST_INT_P (x))
15362 return false;
15363
15364 HOST_WIDE_INT val = INTVAL (x);
15365 if (val & 0xff)
15366 return IN_RANGE (val, -0x80, 0x7f);
15367 return IN_RANGE (val, -0x8000, 0x7f00);
15368 }
15369
15370 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15371 SIGNED_P says whether the operand is signed rather than unsigned. */
15372
15373 bool
15374 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15375 {
15376 rtx elt;
15377
15378 return (const_vec_duplicate_p (x, &elt)
15379 && CONST_INT_P (elt)
15380 && (signed_p
15381 ? IN_RANGE (INTVAL (elt), -16, 15)
15382 : IN_RANGE (INTVAL (elt), 0, 127)));
15383 }
15384
15385 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15386 instruction. Negate X first if NEGATE_P is true. */
15387
15388 bool
15389 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15390 {
15391 rtx elt;
15392 REAL_VALUE_TYPE r;
15393
15394 if (!const_vec_duplicate_p (x, &elt)
15395 || GET_CODE (elt) != CONST_DOUBLE)
15396 return false;
15397
15398 r = *CONST_DOUBLE_REAL_VALUE (elt);
15399
15400 if (negate_p)
15401 r = real_value_negate (&r);
15402
15403 if (real_equal (&r, &dconst1))
15404 return true;
15405 if (real_equal (&r, &dconsthalf))
15406 return true;
15407 return false;
15408 }
15409
15410 /* Return true if X is a valid immediate operand for an SVE FMUL
15411 instruction. */
15412
15413 bool
15414 aarch64_sve_float_mul_immediate_p (rtx x)
15415 {
15416 rtx elt;
15417
15418 return (const_vec_duplicate_p (x, &elt)
15419 && GET_CODE (elt) == CONST_DOUBLE
15420 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15421 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15422 }
15423
15424 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15425 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15426 is nonnull, use it to describe valid immediates. */
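/* For example (illustrative): VAL32 == 0x00ab0000 is matched by the first
   loop as (0xab, LSL #16) in SImode; VAL32 == 0x004f004f is matched by
   the second loop as (0x4f, LSL #0) in HImode; and, for
   AARCH64_CHECK_MOV, VAL32 == 0x0000abff is matched by the MSL loop as
   (0xab, MSL #8).  */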
15427 static bool
15428 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15429 simd_immediate_info *info,
15430 enum simd_immediate_check which,
15431 simd_immediate_info::insn_type insn)
15432 {
15433 /* Try a 4-byte immediate with LSL. */
15434 for (unsigned int shift = 0; shift < 32; shift += 8)
15435 if ((val32 & (0xff << shift)) == val32)
15436 {
15437 if (info)
15438 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15439 simd_immediate_info::LSL, shift);
15440 return true;
15441 }
15442
15443 /* Try a 2-byte immediate with LSL. */
15444 unsigned int imm16 = val32 & 0xffff;
15445 if (imm16 == (val32 >> 16))
15446 for (unsigned int shift = 0; shift < 16; shift += 8)
15447 if ((imm16 & (0xff << shift)) == imm16)
15448 {
15449 if (info)
15450 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15451 simd_immediate_info::LSL, shift);
15452 return true;
15453 }
15454
15455 /* Try a 4-byte immediate with MSL, except for cases that MVN
15456 can handle. */
15457 if (which == AARCH64_CHECK_MOV)
15458 for (unsigned int shift = 8; shift < 24; shift += 8)
15459 {
15460 unsigned int low = (1 << shift) - 1;
15461 if (((val32 & (0xff << shift)) | low) == val32)
15462 {
15463 if (info)
15464 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15465 simd_immediate_info::MSL, shift);
15466 return true;
15467 }
15468 }
15469
15470 return false;
15471 }
15472
15473 /* Return true if replicating VAL64 is a valid immediate for the
15474 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15475 use it to describe valid immediates. */
15476 static bool
15477 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15478 simd_immediate_info *info,
15479 enum simd_immediate_check which)
15480 {
15481 unsigned int val32 = val64 & 0xffffffff;
15482 unsigned int val16 = val64 & 0xffff;
15483 unsigned int val8 = val64 & 0xff;
15484
15485 if (val32 == (val64 >> 32))
15486 {
15487 if ((which & AARCH64_CHECK_ORR) != 0
15488 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15489 simd_immediate_info::MOV))
15490 return true;
15491
15492 if ((which & AARCH64_CHECK_BIC) != 0
15493 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15494 simd_immediate_info::MVN))
15495 return true;
15496
15497 /* Try using a replicated byte. */
15498 if (which == AARCH64_CHECK_MOV
15499 && val16 == (val32 >> 16)
15500 && val8 == (val16 >> 8))
15501 {
15502 if (info)
15503 *info = simd_immediate_info (QImode, val8);
15504 return true;
15505 }
15506 }
15507
15508 /* Try using a bit-to-bytemask. */
15509 if (which == AARCH64_CHECK_MOV)
15510 {
15511 unsigned int i;
15512 for (i = 0; i < 64; i += 8)
15513 {
15514 unsigned char byte = (val64 >> i) & 0xff;
15515 if (byte != 0 && byte != 0xff)
15516 break;
15517 }
15518 if (i == 64)
15519 {
15520 if (info)
15521 *info = simd_immediate_info (DImode, val64);
15522 return true;
15523 }
15524 }
15525 return false;
15526 }
15527
15528 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15529 instruction. If INFO is nonnull, use it to describe valid immediates. */
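/* For example (illustrative): VAL64 == 0x0101010101010101 narrows to
   QImode and is a DUP #1; VAL64 == 0xff00ff00ff00ff00 narrows to HImode
   and is a DUP with LSL #8 (#0xff, LSL #8); VAL64 == 0x00ff00ff00ff00ff
   is not in DUP range but is accepted as a DUPM bitmask immediate.  */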
15530
15531 static bool
15532 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15533 simd_immediate_info *info)
15534 {
15535 scalar_int_mode mode = DImode;
15536 unsigned int val32 = val64 & 0xffffffff;
15537 if (val32 == (val64 >> 32))
15538 {
15539 mode = SImode;
15540 unsigned int val16 = val32 & 0xffff;
15541 if (val16 == (val32 >> 16))
15542 {
15543 mode = HImode;
15544 unsigned int val8 = val16 & 0xff;
15545 if (val8 == (val16 >> 8))
15546 mode = QImode;
15547 }
15548 }
15549 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15550 if (IN_RANGE (val, -0x80, 0x7f))
15551 {
15552 /* DUP with no shift. */
15553 if (info)
15554 *info = simd_immediate_info (mode, val);
15555 return true;
15556 }
15557 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15558 {
15559 /* DUP with LSL #8. */
15560 if (info)
15561 *info = simd_immediate_info (mode, val);
15562 return true;
15563 }
15564 if (aarch64_bitmask_imm (val64, mode))
15565 {
15566 /* DUPM. */
15567 if (info)
15568 *info = simd_immediate_info (mode, val);
15569 return true;
15570 }
15571 return false;
15572 }
15573
15574 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15575 it to describe valid immediates. */
15576
15577 static bool
15578 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15579 {
15580 if (x == CONST0_RTX (GET_MODE (x)))
15581 {
15582 if (info)
15583 *info = simd_immediate_info (DImode, 0);
15584 return true;
15585 }
15586
15587 /* Analyze the value as a VNx16BImode. This should be relatively
15588 efficient, since rtx_vector_builder has enough built-in capacity
15589 to store all VLA predicate constants without needing the heap. */
15590 rtx_vector_builder builder;
15591 if (!aarch64_get_sve_pred_bits (builder, x))
15592 return false;
15593
15594 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15595 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15596 {
15597 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15598 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15599 if (pattern != AARCH64_NUM_SVPATTERNS)
15600 {
15601 if (info)
15602 {
15603 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15604 *info = simd_immediate_info (int_mode, pattern);
15605 }
15606 return true;
15607 }
15608 }
15609 return false;
15610 }
15611
15612 /* Return true if OP is a valid SIMD immediate for the operation
15613 described by WHICH. If INFO is nonnull, use it to describe valid
15614 immediates. */
15615 bool
15616 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15617 enum simd_immediate_check which)
15618 {
15619 machine_mode mode = GET_MODE (op);
15620 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15621 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15622 return false;
15623
15624 if (vec_flags & VEC_SVE_PRED)
15625 return aarch64_sve_pred_valid_immediate (op, info);
15626
15627 scalar_mode elt_mode = GET_MODE_INNER (mode);
15628 rtx base, step;
15629 unsigned int n_elts;
15630 if (GET_CODE (op) == CONST_VECTOR
15631 && CONST_VECTOR_DUPLICATE_P (op))
15632 n_elts = CONST_VECTOR_NPATTERNS (op);
15633 else if ((vec_flags & VEC_SVE_DATA)
15634 && const_vec_series_p (op, &base, &step))
15635 {
15636 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15637 if (!aarch64_sve_index_immediate_p (base)
15638 || !aarch64_sve_index_immediate_p (step))
15639 return false;
15640
15641 if (info)
15642 *info = simd_immediate_info (elt_mode, base, step);
15643 return true;
15644 }
15645 else if (GET_CODE (op) == CONST_VECTOR
15646 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15647 /* N_ELTS set above. */;
15648 else
15649 return false;
15650
15651 scalar_float_mode elt_float_mode;
15652 if (n_elts == 1
15653 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15654 {
15655 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15656 if (aarch64_float_const_zero_rtx_p (elt)
15657 || aarch64_float_const_representable_p (elt))
15658 {
15659 if (info)
15660 *info = simd_immediate_info (elt_float_mode, elt);
15661 return true;
15662 }
15663 }
15664
15665 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15666 if (elt_size > 8)
15667 return false;
15668
15669 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15670
15671 /* Expand the vector constant out into a byte vector, with the least
15672 significant byte of the register first. */
15673 auto_vec<unsigned char, 16> bytes;
15674 bytes.reserve (n_elts * elt_size);
15675 for (unsigned int i = 0; i < n_elts; i++)
15676 {
15677 /* The vector is provided in gcc endian-neutral fashion.
15678 For aarch64_be Advanced SIMD, it must be laid out in the vector
15679 register in reverse order. */
15680 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15681 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15682
15683 if (elt_mode != elt_int_mode)
15684 elt = gen_lowpart (elt_int_mode, elt);
15685
15686 if (!CONST_INT_P (elt))
15687 return false;
15688
15689 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15690 for (unsigned int byte = 0; byte < elt_size; byte++)
15691 {
15692 bytes.quick_push (elt_val & 0xff);
15693 elt_val >>= BITS_PER_UNIT;
15694 }
15695 }
15696
15697 /* The immediate must repeat every eight bytes. */
15698 unsigned int nbytes = bytes.length ();
15699 for (unsigned i = 8; i < nbytes; ++i)
15700 if (bytes[i] != bytes[i - 8])
15701 return false;
15702
15703 /* Get the repeating 8-byte value as an integer. No endian correction
15704 is needed here because bytes is already in lsb-first order. */
15705 unsigned HOST_WIDE_INT val64 = 0;
15706 for (unsigned int i = 0; i < 8; i++)
15707 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15708 << (i * BITS_PER_UNIT));
15709
15710 if (vec_flags & VEC_SVE_DATA)
15711 return aarch64_sve_valid_immediate (val64, info);
15712 else
15713 return aarch64_advsimd_valid_immediate (val64, info, which);
15714 }
15715
15716 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15717 has a step in the range of an SVE INDEX immediate. Return the step
15718 if so, otherwise return null. */
15719 rtx
15720 aarch64_check_zero_based_sve_index_immediate (rtx x)
15721 {
15722 rtx base, step;
15723 if (const_vec_series_p (x, &base, &step)
15724 && base == const0_rtx
15725 && aarch64_sve_index_immediate_p (step))
15726 return step;
15727 return NULL_RTX;
15728 }
15729
15730 /* Check whether immediate shift constants are within range. */
15731 bool
15732 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15733 {
15734 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15735 if (left)
15736 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15737 else
15738 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15739 }
15740
15741 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15742 operation of width WIDTH at bit position POS. */
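/* For example (illustrative): WIDTH == 8 and POS == 16 give the mask
   0x00ff0000.  */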
15743
15744 rtx
15745 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15746 {
15747 gcc_assert (CONST_INT_P (width));
15748 gcc_assert (CONST_INT_P (pos));
15749
15750 unsigned HOST_WIDE_INT mask
15751 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15752 return GEN_INT (mask << UINTVAL (pos));
15753 }
15754
15755 bool
15756 aarch64_mov_operand_p (rtx x, machine_mode mode)
15757 {
15758 if (GET_CODE (x) == HIGH
15759 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15760 return true;
15761
15762 if (CONST_INT_P (x))
15763 return true;
15764
15765 if (VECTOR_MODE_P (GET_MODE (x)))
15766 {
15767 /* Require predicate constants to be VNx16BI before RA, so that we
15768 force everything to have a canonical form. */
15769 if (!lra_in_progress
15770 && !reload_completed
15771 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15772 && GET_MODE (x) != VNx16BImode)
15773 return false;
15774
15775 return aarch64_simd_valid_immediate (x, NULL);
15776 }
15777
15778 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15779 return true;
15780
15781 if (aarch64_sve_cnt_immediate_p (x))
15782 return true;
15783
15784 return aarch64_classify_symbolic_expression (x)
15785 == SYMBOL_TINY_ABSOLUTE;
15786 }
15787
15788 /* Return a const_int vector of VAL. */
15789 rtx
15790 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15791 {
15792 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15793 return gen_const_vec_duplicate (mode, c);
15794 }
15795
15796 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15797
15798 bool
15799 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15800 {
15801 machine_mode vmode;
15802
15803 vmode = aarch64_simd_container_mode (mode, 64);
15804 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15805 return aarch64_simd_valid_immediate (op_v, NULL);
15806 }
15807
15808 /* Construct and return a PARALLEL RTX vector with elements numbering the
15809 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15810 the vector - from the perspective of the architecture. This does not
15811 line up with GCC's perspective on lane numbers, so we end up with
15812 different masks depending on our target endian-ness. The diagram
15813 below may help. We must draw the distinction when building masks
15814 which select one half of the vector. An instruction selecting
15815 architectural low-lanes for a big-endian target must be described using
15816 a mask selecting GCC high-lanes.
15817
15818 Big-Endian Little-Endian
15819
15820 GCC 0 1 2 3 3 2 1 0
15821 | x | x | x | x | | x | x | x | x |
15822 Architecture 3 2 1 0 3 2 1 0
15823
15824 Low Mask: { 2, 3 } { 0, 1 }
15825 High Mask: { 0, 1 } { 2, 3 }
15826
15827 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15828
15829 rtx
15830 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15831 {
15832 rtvec v = rtvec_alloc (nunits / 2);
15833 int high_base = nunits / 2;
15834 int low_base = 0;
15835 int base;
15836 rtx t1;
15837 int i;
15838
15839 if (BYTES_BIG_ENDIAN)
15840 base = high ? low_base : high_base;
15841 else
15842 base = high ? high_base : low_base;
15843
15844 for (i = 0; i < nunits / 2; i++)
15845 RTVEC_ELT (v, i) = GEN_INT (base + i);
15846
15847 t1 = gen_rtx_PARALLEL (mode, v);
15848 return t1;
15849 }
15850
15851 /* Check OP for validity as a PARALLEL RTX vector with elements
15852 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15853 from the perspective of the architecture. See the diagram above
15854 aarch64_simd_vect_par_cnst_half for more details. */
15855
15856 bool
15857 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15858 bool high)
15859 {
15860 int nelts;
15861 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15862 return false;
15863
15864 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15865 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15866 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15867 int i = 0;
15868
15869 if (count_op != count_ideal)
15870 return false;
15871
15872 for (i = 0; i < count_ideal; i++)
15873 {
15874 rtx elt_op = XVECEXP (op, 0, i);
15875 rtx elt_ideal = XVECEXP (ideal, 0, i);
15876
15877 if (!CONST_INT_P (elt_op)
15878 || INTVAL (elt_ideal) != INTVAL (elt_op))
15879 return false;
15880 }
15881 return true;
15882 }
15883
15884 /* Return a PARALLEL containing NELTS elements, with element I equal
15885 to BASE + I * STEP. */
15886
15887 rtx
15888 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15889 {
15890 rtvec vec = rtvec_alloc (nelts);
15891 for (unsigned int i = 0; i < nelts; ++i)
15892 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15893 return gen_rtx_PARALLEL (VOIDmode, vec);
15894 }
15895
15896 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15897 series with step STEP. */
15898
15899 bool
15900 aarch64_stepped_int_parallel_p (rtx op, int step)
15901 {
15902 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15903 return false;
15904
15905 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15906 for (int i = 1; i < XVECLEN (op, 0); ++i)
15907 if (!CONST_INT_P (XVECEXP (op, 0, i))
15908 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15909 return false;
15910
15911 return true;
15912 }
15913
15914 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15915 HIGH (exclusive). */
15916 void
15917 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15918 const_tree exp)
15919 {
15920 HOST_WIDE_INT lane;
15921 gcc_assert (CONST_INT_P (operand));
15922 lane = INTVAL (operand);
15923
15924 if (lane < low || lane >= high)
15925 {
15926 if (exp)
15927 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15928 else
15929 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15930 }
15931 }
15932
15933 /* Perform endian correction on lane number N, which indexes a vector
15934 of mode MODE, and return the result as an SImode rtx. */
15935
15936 rtx
15937 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15938 {
15939 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15940 }
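
/* Illustration, assuming ENDIAN_LANE_N maps GCC lane N to NUNITS - 1 - N on
   big-endian targets and is the identity otherwise: lane 0 of a V4SImode
   vector stays 0 on little-endian but becomes 3 on big-endian.  */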
15941
15942 /* Return TRUE if OP is a valid vector addressing mode. */
15943
15944 bool
15945 aarch64_simd_mem_operand_p (rtx op)
15946 {
15947 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15948 || REG_P (XEXP (op, 0)));
15949 }
15950
15951 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15952
15953 bool
15954 aarch64_sve_ld1r_operand_p (rtx op)
15955 {
15956 struct aarch64_address_info addr;
15957 scalar_mode mode;
15958
15959 return (MEM_P (op)
15960 && is_a <scalar_mode> (GET_MODE (op), &mode)
15961 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15962 && addr.type == ADDRESS_REG_IMM
15963 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15964 }
15965
15966 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15967 bool
15968 aarch64_sve_ld1rq_operand_p (rtx op)
15969 {
15970 struct aarch64_address_info addr;
15971 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15972 if (!MEM_P (op)
15973 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15974 return false;
15975
15976 if (addr.type == ADDRESS_REG_IMM)
15977 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15978
15979 if (addr.type == ADDRESS_REG_REG)
15980 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15981
15982 return false;
15983 }
15984
15985 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15986 The conditions for STR are the same. */
15987 bool
15988 aarch64_sve_ldr_operand_p (rtx op)
15989 {
15990 struct aarch64_address_info addr;
15991
15992 return (MEM_P (op)
15993 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15994 false, ADDR_QUERY_ANY)
15995 && addr.type == ADDRESS_REG_IMM);
15996 }
15997
15998 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15999 We need to be able to access the individual pieces, so the range
16000 is different from LD[234] and ST[234]. */
16001 bool
16002 aarch64_sve_struct_memory_operand_p (rtx op)
16003 {
16004 if (!MEM_P (op))
16005 return false;
16006
16007 machine_mode mode = GET_MODE (op);
16008 struct aarch64_address_info addr;
16009 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
16010 ADDR_QUERY_ANY)
16011 || addr.type != ADDRESS_REG_IMM)
16012 return false;
16013
16014 poly_int64 first = addr.const_offset;
16015 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
16016 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
16017 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
16018 }
16019
16020 /* Emit a register copy from operand to operand, taking care not to
16021 early-clobber source registers in the process.
16022
16023 COUNT is the number of components into which the copy needs to be
16024 decomposed. */
16025 void
16026 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
16027 unsigned int count)
16028 {
16029 unsigned int i;
16030 int rdest = REGNO (operands[0]);
16031 int rsrc = REGNO (operands[1]);
16032
16033 if (!reg_overlap_mentioned_p (operands[0], operands[1])
16034 || rdest < rsrc)
16035 for (i = 0; i < count; i++)
16036 emit_move_insn (gen_rtx_REG (mode, rdest + i),
16037 gen_rtx_REG (mode, rsrc + i));
16038 else
16039 for (i = 0; i < count; i++)
16040 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
16041 gen_rtx_REG (mode, rsrc + count - i - 1));
16042 }
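
/* For example, moving a two-register value from { R, R+1 } to { R+1, R+2 }
   overlaps and the destination number is higher than the source, so the
   second loop above copies the highest-numbered register first
   (R+1 -> R+2, then R -> R+1) and never clobbers a source register before
   it has been read.  */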
16043
16044 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
16045 one of VSTRUCT modes: OI, CI, or XI. */
16046 int
16047 aarch64_simd_attr_length_rglist (machine_mode mode)
16048 {
16049 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
16050 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
16051 }
16052
16053 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
16054 alignment of a vector to 128 bits. SVE predicates have an alignment of
16055 16 bits. */
16056 static HOST_WIDE_INT
16057 aarch64_simd_vector_alignment (const_tree type)
16058 {
16059 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
16060 be set for non-predicate vectors of booleans. Modes are the most
16061 direct way we have of identifying real SVE predicate types. */
16062 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
16063 return 16;
16064 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16065 return 128;
16066 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
16067 }
16068
16069 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
16070 static poly_uint64
16071 aarch64_vectorize_preferred_vector_alignment (const_tree type)
16072 {
16073 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
16074 {
16075 /* If the length of the vector is fixed, try to align to that length,
16076 otherwise don't try to align at all. */
16077 HOST_WIDE_INT result;
16078 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
16079 result = TYPE_ALIGN (TREE_TYPE (type));
16080 return result;
16081 }
16082 return TYPE_ALIGN (type);
16083 }
16084
16085 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
16086 static bool
16087 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
16088 {
16089 if (is_packed)
16090 return false;
16091
16092 /* For fixed-length vectors, check that the vectorizer will aim for
16093 full-vector alignment. This isn't true for generic GCC vectors
16094 that are wider than the ABI maximum of 128 bits. */
16095 poly_uint64 preferred_alignment =
16096 aarch64_vectorize_preferred_vector_alignment (type);
16097 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
16098 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
16099 preferred_alignment))
16100 return false;
16101
16102 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
16103 return true;
16104 }
16105
16106 /* Return true if the vector misalignment factor is supported by the
16107 target. */
16108 static bool
16109 aarch64_builtin_support_vector_misalignment (machine_mode mode,
16110 const_tree type, int misalignment,
16111 bool is_packed)
16112 {
16113 if (TARGET_SIMD && STRICT_ALIGNMENT)
16114 {
16115 /* Return false if the movmisalign pattern is not supported for this mode. */
16116 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
16117 return false;
16118
16119 /* Misalignment factor is unknown at compile time. */
16120 if (misalignment == -1)
16121 return false;
16122 }
16123 return default_builtin_support_vector_misalignment (mode, type, misalignment,
16124 is_packed);
16125 }
16126
16127 /* If VALS is a vector constant that can be loaded into a register
16128 using DUP, generate instructions to do so and return an RTX to
16129 assign to the register. Otherwise return NULL_RTX. */
16130 static rtx
16131 aarch64_simd_dup_constant (rtx vals)
16132 {
16133 machine_mode mode = GET_MODE (vals);
16134 machine_mode inner_mode = GET_MODE_INNER (mode);
16135 rtx x;
16136
16137 if (!const_vec_duplicate_p (vals, &x))
16138 return NULL_RTX;
16139
16140 /* We can load this constant by using DUP and a constant in a
16141 single ARM register. This will be cheaper than a vector
16142 load. */
16143 x = copy_to_mode_reg (inner_mode, x);
16144 return gen_vec_duplicate (mode, x);
16145 }
16146
16147
16148 /* Generate code to load VALS, which is a PARALLEL containing only
16149 constants (for vec_init) or CONST_VECTOR, efficiently into a
16150 register. Returns an RTX to copy into the register, or NULL_RTX
16151 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
16152 static rtx
16153 aarch64_simd_make_constant (rtx vals)
16154 {
16155 machine_mode mode = GET_MODE (vals);
16156 rtx const_dup;
16157 rtx const_vec = NULL_RTX;
16158 int n_const = 0;
16159 int i;
16160
16161 if (GET_CODE (vals) == CONST_VECTOR)
16162 const_vec = vals;
16163 else if (GET_CODE (vals) == PARALLEL)
16164 {
16165 /* A CONST_VECTOR must contain only CONST_INTs and
16166 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
16167 Only store valid constants in a CONST_VECTOR. */
16168 int n_elts = XVECLEN (vals, 0);
16169 for (i = 0; i < n_elts; ++i)
16170 {
16171 rtx x = XVECEXP (vals, 0, i);
16172 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16173 n_const++;
16174 }
16175 if (n_const == n_elts)
16176 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
16177 }
16178 else
16179 gcc_unreachable ();
16180
16181 if (const_vec != NULL_RTX
16182 && aarch64_simd_valid_immediate (const_vec, NULL))
16183 /* Load using MOVI/MVNI. */
16184 return const_vec;
16185 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16186 /* Loaded using DUP. */
16187 return const_dup;
16188 else if (const_vec != NULL_RTX)
16189 /* Load from constant pool. We cannot take advantage of single-cycle
16190 LD1 because we need a PC-relative addressing mode. */
16191 return const_vec;
16192 else
16193 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16194 We cannot construct an initializer. */
16195 return NULL_RTX;
16196 }
16197
16198 /* Expand a vector initialisation sequence, such that TARGET is
16199 initialised to contain VALS. */
16200
16201 void
16202 aarch64_expand_vector_init (rtx target, rtx vals)
16203 {
16204 machine_mode mode = GET_MODE (target);
16205 scalar_mode inner_mode = GET_MODE_INNER (mode);
16206 /* The number of vector elements. */
16207 int n_elts = XVECLEN (vals, 0);
16208 /* The number of vector elements which are not constant. */
16209 int n_var = 0;
16210 rtx any_const = NULL_RTX;
16211 /* The first element of vals. */
16212 rtx v0 = XVECEXP (vals, 0, 0);
16213 bool all_same = true;
16214
16215 /* This is a special vec_init<M><N> where N is not an element mode but a
16216 vector mode with half the elements of M. We expect to find two entries
16217 of mode N in VALS and we must put their concatenation into TARGET. */
16218 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16219 {
16220 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16221 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16222 rtx lo = XVECEXP (vals, 0, 0);
16223 rtx hi = XVECEXP (vals, 0, 1);
16224 machine_mode narrow_mode = GET_MODE (lo);
16225 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16226 gcc_assert (narrow_mode == GET_MODE (hi));
16227
16228 /* When we want to concatenate a half-width vector with zeroes we can
16229 use the aarch64_combinez[_be] patterns. Just make sure that the
16230 zeroes are in the right half. */
16231 if (BYTES_BIG_ENDIAN
16232 && aarch64_simd_imm_zero (lo, narrow_mode)
16233 && general_operand (hi, narrow_mode))
16234 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16235 else if (!BYTES_BIG_ENDIAN
16236 && aarch64_simd_imm_zero (hi, narrow_mode)
16237 && general_operand (lo, narrow_mode))
16238 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16239 else
16240 {
16241 /* Else create the two half-width registers and combine them. */
16242 if (!REG_P (lo))
16243 lo = force_reg (GET_MODE (lo), lo);
16244 if (!REG_P (hi))
16245 hi = force_reg (GET_MODE (hi), hi);
16246
16247 if (BYTES_BIG_ENDIAN)
16248 std::swap (lo, hi);
16249 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16250 }
16251 return;
16252 }
16253
16254 /* Count the number of variable elements to initialise. */
16255 for (int i = 0; i < n_elts; ++i)
16256 {
16257 rtx x = XVECEXP (vals, 0, i);
16258 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16259 ++n_var;
16260 else
16261 any_const = x;
16262
16263 all_same &= rtx_equal_p (x, v0);
16264 }
16265
16266 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16267 how best to handle this. */
16268 if (n_var == 0)
16269 {
16270 rtx constant = aarch64_simd_make_constant (vals);
16271 if (constant != NULL_RTX)
16272 {
16273 emit_move_insn (target, constant);
16274 return;
16275 }
16276 }
16277
16278 /* Splat a single non-constant element if we can. */
16279 if (all_same)
16280 {
16281 rtx x = copy_to_mode_reg (inner_mode, v0);
16282 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16283 return;
16284 }
16285
16286 enum insn_code icode = optab_handler (vec_set_optab, mode);
16287 gcc_assert (icode != CODE_FOR_nothing);
16288
16289 /* If there are only variable elements, try to optimize
16290 the insertion using dup for the most common element
16291 followed by insertions. */
16292
16293 /* The algorithm will fill matches[*][0] with the earliest matching element,
16294 and matches[X][1] with the count of duplicate elements (if X is the
16295 earliest element which has duplicates). */
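/* For example, for VALS = { a, b, a, a } the loop below produces
   matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 } and
   matches[3] = { 0, 0 }, so element 0 (a) is chosen as the dup value and
   only element 1 (b) needs a separate insert.  */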
16296
16297 if (n_var == n_elts && n_elts <= 16)
16298 {
16299 int matches[16][2] = {0};
16300 for (int i = 0; i < n_elts; i++)
16301 {
16302 for (int j = 0; j <= i; j++)
16303 {
16304 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16305 {
16306 matches[i][0] = j;
16307 matches[j][1]++;
16308 break;
16309 }
16310 }
16311 }
16312 int maxelement = 0;
16313 int maxv = 0;
16314 for (int i = 0; i < n_elts; i++)
16315 if (matches[i][1] > maxv)
16316 {
16317 maxelement = i;
16318 maxv = matches[i][1];
16319 }
16320
16321 /* Create a duplicate of the most common element, unless all elements
16322 are equally useless to us, in which case just immediately set the
16323 vector register using the first element. */
16324
16325 if (maxv == 1)
16326 {
16327 /* For vectors of two 64-bit elements, we can do even better. */
16328 if (n_elts == 2
16329 && (inner_mode == E_DImode
16330 || inner_mode == E_DFmode))
16331
16332 {
16333 rtx x0 = XVECEXP (vals, 0, 0);
16334 rtx x1 = XVECEXP (vals, 0, 1);
16335 /* Combine can pick up this case, but handling it directly
16336 here leaves clearer RTL.
16337
16338 This is load_pair_lanes<mode>, and also gives us a clean-up
16339 for store_pair_lanes<mode>. */
16340 if (memory_operand (x0, inner_mode)
16341 && memory_operand (x1, inner_mode)
16342 && !STRICT_ALIGNMENT
16343 && rtx_equal_p (XEXP (x1, 0),
16344 plus_constant (Pmode,
16345 XEXP (x0, 0),
16346 GET_MODE_SIZE (inner_mode))))
16347 {
16348 rtx t;
16349 if (inner_mode == DFmode)
16350 t = gen_load_pair_lanesdf (target, x0, x1);
16351 else
16352 t = gen_load_pair_lanesdi (target, x0, x1);
16353 emit_insn (t);
16354 return;
16355 }
16356 }
16357 /* The subreg-move sequence below will move into lane zero of the
16358 vector register. For big-endian we want that position to hold
16359 the last element of VALS. */
16360 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16361 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16362 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16363 }
16364 else
16365 {
16366 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16367 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16368 }
16369
16370 /* Insert the rest. */
16371 for (int i = 0; i < n_elts; i++)
16372 {
16373 rtx x = XVECEXP (vals, 0, i);
16374 if (matches[i][0] == maxelement)
16375 continue;
16376 x = copy_to_mode_reg (inner_mode, x);
16377 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16378 }
16379 return;
16380 }
16381
16382 /* Initialise a vector which is part-variable. We want to first try
16383 to build those lanes which are constant in the most efficient way we
16384 can. */
16385 if (n_var != n_elts)
16386 {
16387 rtx copy = copy_rtx (vals);
16388
16389 /* Load constant part of vector. We really don't care what goes into the
16390 parts we will overwrite, but we're more likely to be able to load the
16391 constant efficiently if it has fewer, larger, repeating parts
16392 (see aarch64_simd_valid_immediate). */
16393 for (int i = 0; i < n_elts; i++)
16394 {
16395 rtx x = XVECEXP (vals, 0, i);
16396 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16397 continue;
16398 rtx subst = any_const;
16399 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16400 {
16401 /* Look in the copied vector, as more elements are const. */
16402 rtx test = XVECEXP (copy, 0, i ^ bit);
16403 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16404 {
16405 subst = test;
16406 break;
16407 }
16408 }
16409 XVECEXP (copy, 0, i) = subst;
16410 }
16411 aarch64_expand_vector_init (target, copy);
16412 }
16413
16414 /* Insert the variable lanes directly. */
16415 for (int i = 0; i < n_elts; i++)
16416 {
16417 rtx x = XVECEXP (vals, 0, i);
16418 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16419 continue;
16420 x = copy_to_mode_reg (inner_mode, x);
16421 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16422 }
16423 }
16424
16425 /* Emit RTL corresponding to:
16426 insr TARGET, ELEM. */
16427
16428 static void
16429 emit_insr (rtx target, rtx elem)
16430 {
16431 machine_mode mode = GET_MODE (target);
16432 scalar_mode elem_mode = GET_MODE_INNER (mode);
16433 elem = force_reg (elem_mode, elem);
16434
16435 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16436 gcc_assert (icode != CODE_FOR_nothing);
16437 emit_insn (GEN_FCN (icode) (target, target, elem));
16438 }
16439
16440 /* Subroutine of aarch64_sve_expand_vector_init for handling
16441 trailing constants.
16442 This function works as follows:
16443 (a) Create a new vector consisting of trailing constants.
16444 (b) Initialize TARGET with the constant vector using emit_move_insn.
16445 (c) Insert remaining elements in TARGET using insr.
16446 NELTS is the total number of elements in the original vector, while
16447 NELTS_REQD is the number of elements that are actually
16448 significant.
16449
16450 ??? The heuristic used is to do the above only if the number of trailing
16451 constants is at least half the total number of elements. May need fine-tuning. */
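/* For example, with BUILDER = { a, b, 1, 2 } and NELTS_REQD == 4 there are
   two trailing constants, which meets the >= NELTS_REQD / 2 threshold:
   steps (a)/(b) move a constant vector whose leading elements are
   { 1, 2, ... } into TARGET, and step (c) then emits "insr TARGET, b"
   followed by "insr TARGET, a", leaving TARGET = { a, b, 1, 2, ... }.  */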
16452
16453 static bool
16454 aarch64_sve_expand_vector_init_handle_trailing_constants
16455 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16456 {
16457 machine_mode mode = GET_MODE (target);
16458 scalar_mode elem_mode = GET_MODE_INNER (mode);
16459 int n_trailing_constants = 0;
16460
16461 for (int i = nelts_reqd - 1;
16462 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16463 i--)
16464 n_trailing_constants++;
16465
16466 if (n_trailing_constants >= nelts_reqd / 2)
16467 {
16468 rtx_vector_builder v (mode, 1, nelts);
16469 for (int i = 0; i < nelts; i++)
16470 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16471 rtx const_vec = v.build ();
16472 emit_move_insn (target, const_vec);
16473
16474 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16475 emit_insr (target, builder.elt (i));
16476
16477 return true;
16478 }
16479
16480 return false;
16481 }
16482
16483 /* Subroutine of aarch64_sve_expand_vector_init.
16484 Works as follows:
16485 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16486 (b) Skip trailing elements from BUILDER, which are the same as
16487 element NELTS_REQD - 1.
16488 (c) Insert earlier elements in reverse order in TARGET using insr. */
16489
16490 static void
16491 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16492 const rtx_vector_builder &builder,
16493 int nelts_reqd)
16494 {
16495 machine_mode mode = GET_MODE (target);
16496 scalar_mode elem_mode = GET_MODE_INNER (mode);
16497
16498 struct expand_operand ops[2];
16499 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16500 gcc_assert (icode != CODE_FOR_nothing);
16501
16502 create_output_operand (&ops[0], target, mode);
16503 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16504 expand_insn (icode, 2, ops);
16505
16506 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16507 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16508 emit_insr (target, builder.elt (i));
16509 }
16510
16511 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16512 when all trailing elements of builder are same.
16513 This works as follows:
16514 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16515 (b) Insert remaining elements in TARGET using insr.
16516
16517 ??? The heuristic used is to do above if number of same trailing elements
16518 is at least 3/4 of total number of elements, loosely based on
16519 heuristic from mostly_zeros_p. May need fine-tuning. */
16520
16521 static bool
16522 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16523 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16524 {
16525 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16526 if (ndups >= (3 * nelts_reqd) / 4)
16527 {
16528 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16529 nelts_reqd - ndups + 1);
16530 return true;
16531 }
16532
16533 return false;
16534 }
16535
16536 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16537 of elements in BUILDER.
16538
16539 The function tries to initialize TARGET from BUILDER if it fits one
16540 of the special cases outlined below.
16541
16542 Failing that, the function divides BUILDER into two sub-vectors:
16543 v_even = even elements of BUILDER;
16544 v_odd = odd elements of BUILDER;
16545
16546 and recursively calls itself with v_even and v_odd.
16547
16548 if (recursive call succeeded for v_even or v_odd)
16549 TARGET = zip (v_even, v_odd)
16550
16551 The function returns true if it managed to build TARGET from BUILDER
16552 with one of the special cases, false otherwise.
16553
16554 Example: {a, 1, b, 2, c, 3, d, 4}
16555
16556 The vector gets divided into:
16557 v_even = {a, b, c, d}
16558 v_odd = {1, 2, 3, 4}
16559
16560 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16561 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16562
16563 aarch64_sve_expand_vector_init (v_even) fails since v_even contains
16564 4 distinct variable elements, so we construct tmp1 from v_even using insr:
16565 tmp1 = dup(d)
16566 insr tmp1, c
16567 insr tmp1, b
16568 insr tmp1, a
16569
16570 And finally:
16571 TARGET = zip (tmp1, tmp2)
16572 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16573
16574 static bool
16575 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16576 int nelts, int nelts_reqd)
16577 {
16578 machine_mode mode = GET_MODE (target);
16579
16580 /* Case 1: Vector contains trailing constants. */
16581
16582 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16583 (target, builder, nelts, nelts_reqd))
16584 return true;
16585
16586 /* Case 2: Vector contains leading constants. */
16587
16588 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16589 for (int i = 0; i < nelts_reqd; i++)
16590 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16591 rev_builder.finalize ();
16592
16593 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16594 (target, rev_builder, nelts, nelts_reqd))
16595 {
16596 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16597 return true;
16598 }
16599
16600 /* Case 3: Vector contains trailing same element. */
16601
16602 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16603 (target, builder, nelts_reqd))
16604 return true;
16605
16606 /* Case 4: Vector contains leading same element. */
16607
16608 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16609 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16610 {
16611 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16612 return true;
16613 }
16614
16615 /* Avoid recursing below 4-elements.
16616 ??? The threshold 4 may need fine-tuning. */
16617
16618 if (nelts_reqd <= 4)
16619 return false;
16620
16621 rtx_vector_builder v_even (mode, 1, nelts);
16622 rtx_vector_builder v_odd (mode, 1, nelts);
16623
16624 for (int i = 0; i < nelts * 2; i += 2)
16625 {
16626 v_even.quick_push (builder.elt (i));
16627 v_odd.quick_push (builder.elt (i + 1));
16628 }
16629
16630 v_even.finalize ();
16631 v_odd.finalize ();
16632
16633 rtx tmp1 = gen_reg_rtx (mode);
16634 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16635 nelts, nelts_reqd / 2);
16636
16637 rtx tmp2 = gen_reg_rtx (mode);
16638 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16639 nelts, nelts_reqd / 2);
16640
16641 if (!did_even_p && !did_odd_p)
16642 return false;
16643
16644 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16645 special cases and zip v_even, v_odd. */
16646
16647 if (!did_even_p)
16648 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16649
16650 if (!did_odd_p)
16651 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16652
16653 rtvec v = gen_rtvec (2, tmp1, tmp2);
16654 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16655 return true;
16656 }
16657
16658 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16659
16660 void
16661 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16662 {
16663 machine_mode mode = GET_MODE (target);
16664 int nelts = XVECLEN (vals, 0);
16665
16666 rtx_vector_builder v (mode, 1, nelts);
16667 for (int i = 0; i < nelts; i++)
16668 v.quick_push (XVECEXP (vals, 0, i));
16669 v.finalize ();
16670
16671 /* If neither sub-vectors of v could be initialized specially,
16672 then use INSR to insert all elements from v into TARGET.
16673 ??? This might not be optimal for vectors with large
16674 initializers like 16-element or above.
16675 For nelts < 4, it probably isn't useful to handle specially. */
16676
16677 if (nelts < 4
16678 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16679 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16680 }
16681
16682 /* Check whether VALUE is a vector constant in which every element
16683 is either a power of 2 or a negated power of 2. If so, return
16684 a constant vector of log2s, and flip CODE between PLUS and MINUS
16685 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
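/* For example, a multiplier of { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with
   CODE left unchanged, while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } and
   CODE is flipped (PLUS <-> MINUS), so x * -8 + a can be rewritten as
   a - (x << 3).  */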
16686
16687 static rtx
16688 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16689 {
16690 if (GET_CODE (value) != CONST_VECTOR)
16691 return NULL_RTX;
16692
16693 rtx_vector_builder builder;
16694 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16695 return NULL_RTX;
16696
16697 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16698 /* 1 if the result of the multiplication must be negated,
16699 0 if it mustn't, or -1 if we don't yet care. */
16700 int negate = -1;
16701 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16702 for (unsigned int i = 0; i < encoded_nelts; ++i)
16703 {
16704 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16705 if (!CONST_SCALAR_INT_P (elt))
16706 return NULL_RTX;
16707 rtx_mode_t val (elt, int_mode);
16708 wide_int pow2 = wi::neg (val);
16709 if (val != pow2)
16710 {
16711 /* It matters whether we negate or not. Make that choice,
16712 and make sure that it's consistent with previous elements. */
16713 if (negate == !wi::neg_p (val))
16714 return NULL_RTX;
16715 negate = wi::neg_p (val);
16716 if (!negate)
16717 pow2 = val;
16718 }
16719 /* POW2 is now the value that we want to be a power of 2. */
16720 int shift = wi::exact_log2 (pow2);
16721 if (shift < 0)
16722 return NULL_RTX;
16723 builder.quick_push (gen_int_mode (shift, int_mode));
16724 }
16725 if (negate == -1)
16726 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16727 code = PLUS;
16728 else if (negate == 1)
16729 code = code == PLUS ? MINUS : PLUS;
16730 return builder.build ();
16731 }
16732
16733 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16734 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16735 operands array, in the same order as for fma_optab. Return true if
16736 the function emitted all the necessary instructions, false if the caller
16737 should generate the pattern normally with the new OPERANDS array. */
16738
16739 bool
16740 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16741 {
16742 machine_mode mode = GET_MODE (operands[0]);
16743 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16744 {
16745 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16746 NULL_RTX, true, OPTAB_DIRECT);
16747 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16748 operands[3], product, operands[0], true,
16749 OPTAB_DIRECT);
16750 return true;
16751 }
16752 operands[2] = force_reg (mode, operands[2]);
16753 return false;
16754 }
16755
16756 /* Likewise, but for a conditional pattern. */
16757
16758 bool
16759 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16760 {
16761 machine_mode mode = GET_MODE (operands[0]);
16762 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16763 {
16764 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16765 NULL_RTX, true, OPTAB_DIRECT);
16766 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16767 operands[4], product, operands[5]));
16768 return true;
16769 }
16770 operands[3] = force_reg (mode, operands[3]);
16771 return false;
16772 }
16773
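/* Implement TARGET_SHIFT_TRUNCATION_MASK (the hook name is inferred from
   this function's role).  Return 0 when SHIFT_COUNT_TRUNCATED is not in
   effect or MODE is a vector data mode; otherwise return
   GET_MODE_UNIT_BITSIZE (mode) - 1, e.g. 63 for DImode, so shift counts
   are known to be truncated to the element width.  */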
16774 static unsigned HOST_WIDE_INT
16775 aarch64_shift_truncation_mask (machine_mode mode)
16776 {
16777 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16778 return 0;
16779 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16780 }
16781
16782 /* Select a format to encode pointers in exception handling data. */
16783 int
16784 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16785 {
16786 int type;
16787 switch (aarch64_cmodel)
16788 {
16789 case AARCH64_CMODEL_TINY:
16790 case AARCH64_CMODEL_TINY_PIC:
16791 case AARCH64_CMODEL_SMALL:
16792 case AARCH64_CMODEL_SMALL_PIC:
16793 case AARCH64_CMODEL_SMALL_SPIC:
16794 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16795 for everything. */
16796 type = DW_EH_PE_sdata4;
16797 break;
16798 default:
16799 /* No assumptions here. 8-byte relocs required. */
16800 type = DW_EH_PE_sdata8;
16801 break;
16802 }
16803 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16804 }
16805
16806 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16807
16808 static void
16809 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16810 {
16811 if (aarch64_simd_decl_p (decl))
16812 {
16813 fprintf (stream, "\t.variant_pcs\t");
16814 assemble_name (stream, name);
16815 fprintf (stream, "\n");
16816 }
16817 }
16818
16819 /* The last .arch and .tune assembly strings that we printed. */
16820 static std::string aarch64_last_printed_arch_string;
16821 static std::string aarch64_last_printed_tune_string;
16822
16823 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16824 by the function fndecl. */
16825
16826 void
16827 aarch64_declare_function_name (FILE *stream, const char* name,
16828 tree fndecl)
16829 {
16830 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16831
16832 struct cl_target_option *targ_options;
16833 if (target_parts)
16834 targ_options = TREE_TARGET_OPTION (target_parts);
16835 else
16836 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16837 gcc_assert (targ_options);
16838
16839 const struct processor *this_arch
16840 = aarch64_get_arch (targ_options->x_explicit_arch);
16841
16842 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16843 std::string extension
16844 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16845 this_arch->flags);
16846 /* Only update the assembler .arch string if it is distinct from the last
16847 such string we printed. */
16848 std::string to_print = this_arch->name + extension;
16849 if (to_print != aarch64_last_printed_arch_string)
16850 {
16851 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16852 aarch64_last_printed_arch_string = to_print;
16853 }
16854
16855 /* Print the cpu name we're tuning for in the comments; it might be
16856 useful to readers of the generated asm. Do this only when it changes
16857 from function to function and verbose assembly is requested. */
16858 const struct processor *this_tune
16859 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16860
16861 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16862 {
16863 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16864 this_tune->name);
16865 aarch64_last_printed_tune_string = this_tune->name;
16866 }
16867
16868 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16869
16870 /* Don't forget the type directive for ELF. */
16871 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16872 ASM_OUTPUT_LABEL (stream, name);
16873 }
16874
16875 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16876
16877 void
16878 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16879 {
16880 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16881 const char *value = IDENTIFIER_POINTER (target);
16882 aarch64_asm_output_variant_pcs (stream, decl, name);
16883 ASM_OUTPUT_DEF (stream, name, value);
16884 }
16885
16886 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16887 function symbol references. */
16888
16889 void
16890 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16891 {
16892 default_elf_asm_output_external (stream, decl, name);
16893 aarch64_asm_output_variant_pcs (stream, decl, name);
16894 }
16895
16896 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16897 Used to output the .cfi_b_key_frame directive when signing the current
16898 function with the B key. */
16899
16900 void
16901 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16902 {
16903 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16904 && aarch64_ra_sign_key == AARCH64_KEY_B)
16905 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16906 }
16907
16908 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16909
16910 static void
16911 aarch64_start_file (void)
16912 {
16913 struct cl_target_option *default_options
16914 = TREE_TARGET_OPTION (target_option_default_node);
16915
16916 const struct processor *default_arch
16917 = aarch64_get_arch (default_options->x_explicit_arch);
16918 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16919 std::string extension
16920 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16921 default_arch->flags);
16922
16923 aarch64_last_printed_arch_string = default_arch->name + extension;
16924 aarch64_last_printed_tune_string = "";
16925 asm_fprintf (asm_out_file, "\t.arch %s\n",
16926 aarch64_last_printed_arch_string.c_str ());
16927
16928 default_file_start ();
16929 }
16930
16931 /* Emit load exclusive. */
16932
16933 static void
16934 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16935 rtx mem, rtx model_rtx)
16936 {
16937 if (mode == TImode)
16938 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
16939 gen_highpart (DImode, rval),
16940 mem, model_rtx));
16941 else
16942 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16943 }
16944
16945 /* Emit store exclusive. */
16946
16947 static void
16948 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16949 rtx mem, rtx rval, rtx model_rtx)
16950 {
16951 if (mode == TImode)
16952 emit_insn (gen_aarch64_store_exclusive_pair
16953 (bval, mem, operand_subword (rval, 0, 0, TImode),
16954 operand_subword (rval, 1, 0, TImode), model_rtx));
16955 else
16956 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16957 }
16958
16959 /* Mark the previous jump instruction as unlikely. */
16960
16961 static void
16962 aarch64_emit_unlikely_jump (rtx insn)
16963 {
16964 rtx_insn *jump = emit_jump_insn (insn);
16965 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16966 }
16967
16968 /* We store the names of the various atomic helpers in a 5x4 array.
16969 Return the libcall function given MODE, MODEL and NAMES. */
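/* For example, an SImode swap with MEMMODEL_ACQUIRE and
   aarch64_ool_swp_names selects mode_idx 2 and model_idx 1, giving the
   libgcc helper "__aarch64_swp4_acq".  */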
16970
16971 rtx
16972 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
16973 const atomic_ool_names *names)
16974 {
16975 memmodel model = memmodel_base (INTVAL (model_rtx));
16976 int mode_idx, model_idx;
16977
16978 switch (mode)
16979 {
16980 case E_QImode:
16981 mode_idx = 0;
16982 break;
16983 case E_HImode:
16984 mode_idx = 1;
16985 break;
16986 case E_SImode:
16987 mode_idx = 2;
16988 break;
16989 case E_DImode:
16990 mode_idx = 3;
16991 break;
16992 case E_TImode:
16993 mode_idx = 4;
16994 break;
16995 default:
16996 gcc_unreachable ();
16997 }
16998
16999 switch (model)
17000 {
17001 case MEMMODEL_RELAXED:
17002 model_idx = 0;
17003 break;
17004 case MEMMODEL_CONSUME:
17005 case MEMMODEL_ACQUIRE:
17006 model_idx = 1;
17007 break;
17008 case MEMMODEL_RELEASE:
17009 model_idx = 2;
17010 break;
17011 case MEMMODEL_ACQ_REL:
17012 case MEMMODEL_SEQ_CST:
17013 model_idx = 3;
17014 break;
17015 default:
17016 gcc_unreachable ();
17017 }
17018
17019 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
17020 VISIBILITY_HIDDEN);
17021 }
17022
17023 #define DEF0(B, N) \
17024 { "__aarch64_" #B #N "_relax", \
17025 "__aarch64_" #B #N "_acq", \
17026 "__aarch64_" #B #N "_rel", \
17027 "__aarch64_" #B #N "_acq_rel" }
17028
17029 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
17030 { NULL, NULL, NULL, NULL }
17031 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
17032
17033 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
17034 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
17035 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
17036 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
17037 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
17038 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
17039
17040 #undef DEF0
17041 #undef DEF4
17042 #undef DEF5
17043
17044 /* Expand a compare and swap pattern. */
17045
17046 void
17047 aarch64_expand_compare_and_swap (rtx operands[])
17048 {
17049 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
17050 machine_mode mode, r_mode;
17051
17052 bval = operands[0];
17053 rval = operands[1];
17054 mem = operands[2];
17055 oldval = operands[3];
17056 newval = operands[4];
17057 is_weak = operands[5];
17058 mod_s = operands[6];
17059 mod_f = operands[7];
17060 mode = GET_MODE (mem);
17061
17062 /* Normally the succ memory model must be stronger than fail, but in the
17063 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
17064 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
17065 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
17066 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
17067 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
17068
17069 r_mode = mode;
17070 if (mode == QImode || mode == HImode)
17071 {
17072 r_mode = SImode;
17073 rval = gen_reg_rtx (r_mode);
17074 }
17075
17076 if (TARGET_LSE)
17077 {
17078 /* The CAS insn requires oldval and rval overlap, but we need to
17079 have a copy of oldval saved across the operation to tell if
17080 the operation is successful. */
17081 if (reg_overlap_mentioned_p (rval, oldval))
17082 rval = copy_to_mode_reg (r_mode, oldval);
17083 else
17084 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
17085
17086 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
17087 newval, mod_s));
17088 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17089 }
17090 else if (TARGET_OUTLINE_ATOMICS)
17091 {
17092 /* Oldval must satisfy compare afterward. */
17093 if (!aarch64_plus_operand (oldval, mode))
17094 oldval = force_reg (mode, oldval);
17095 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
17096 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
17097 oldval, mode, newval, mode,
17098 XEXP (mem, 0), Pmode);
17099 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17100 }
17101 else
17102 {
17103 /* The oldval predicate varies by mode. Test it and force to reg. */
17104 insn_code code = code_for_aarch64_compare_and_swap (mode);
17105 if (!insn_data[code].operand[2].predicate (oldval, mode))
17106 oldval = force_reg (mode, oldval);
17107
17108 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
17109 is_weak, mod_s, mod_f));
17110 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
17111 }
17112
17113 if (r_mode != mode)
17114 rval = gen_lowpart (mode, rval);
17115 emit_move_insn (operands[1], rval);
17116
17117 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
17118 emit_insn (gen_rtx_SET (bval, x));
17119 }
17120
17121 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
17122 sequence implementing an atomic operation. */
17123
17124 static void
17125 aarch64_emit_post_barrier (enum memmodel model)
17126 {
17127 const enum memmodel base_model = memmodel_base (model);
17128
17129 if (is_mm_sync (model)
17130 && (base_model == MEMMODEL_ACQUIRE
17131 || base_model == MEMMODEL_ACQ_REL
17132 || base_model == MEMMODEL_SEQ_CST))
17133 {
17134 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
17135 }
17136 }
17137
17138 /* Split a compare and swap pattern. */
17139
17140 void
17141 aarch64_split_compare_and_swap (rtx operands[])
17142 {
17143 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
17144 machine_mode mode;
17145 bool is_weak;
17146 rtx_code_label *label1, *label2;
17147 enum memmodel model;
17148
17149 rval = operands[0];
17150 mem = operands[1];
17151 oldval = operands[2];
17152 newval = operands[3];
17153 is_weak = (operands[4] != const0_rtx);
17154 model_rtx = operands[5];
17155 scratch = operands[7];
17156 mode = GET_MODE (mem);
17157 model = memmodel_from_int (INTVAL (model_rtx));
17158
17159 /* When OLDVAL is zero and we want the strong version we can emit a tighter
17160 loop:
17161 .label1:
17162 LD[A]XR rval, [mem]
17163 CBNZ rval, .label2
17164 ST[L]XR scratch, newval, [mem]
17165 CBNZ scratch, .label1
17166 .label2:
17167 CMP rval, 0. */
17168 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
17169 oldval == const0_rtx && mode != TImode);
17170
17171 label1 = NULL;
17172 if (!is_weak)
17173 {
17174 label1 = gen_label_rtx ();
17175 emit_label (label1);
17176 }
17177 label2 = gen_label_rtx ();
17178
17179 /* The initial load can be relaxed for a __sync operation since a final
17180 barrier will be emitted to stop code hoisting. */
17181 if (is_mm_sync (model))
17182 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
17183 else
17184 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
17185
17186 if (strong_zero_p)
17187 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
17188 else
17189 {
17190 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
17191 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
17192 }
17193 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17194 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
17195 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17196
17197 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
17198
17199 if (!is_weak)
17200 {
17201 if (aarch64_track_speculation)
17202 {
17203 /* Emit an explicit compare instruction, so that we can correctly
17204 track the condition codes. */
17205 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17206 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17207 }
17208 else
17209 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
17210
17211 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17212 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
17213 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17214 }
17215 else
17216 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
17217
17218 emit_label (label2);
17219
17220 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
17221 to set the condition flags. If this is not used it will be removed by
17222 later passes. */
17223 if (strong_zero_p)
17224 aarch64_gen_compare_reg (NE, rval, const0_rtx);
17225
17226 /* Emit any final barrier needed for a __sync operation. */
17227 if (is_mm_sync (model))
17228 aarch64_emit_post_barrier (model);
17229 }
17230
17231 /* Split an atomic operation. */
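/* Roughly, for a non-__sync CODE == PLUS this expands to the loop

     .label:
       ld[a]xr   old, [mem]
       add       new, old, value
       st[l]xr   w_cond, new, [mem]
       cbnz      w_cond, .label

   with the acquire/release forms chosen from MODEL_RTX, and a trailing
   barrier added for __sync operations.  */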
17232
17233 void
17234 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
17235 rtx value, rtx model_rtx, rtx cond)
17236 {
17237 machine_mode mode = GET_MODE (mem);
17238 machine_mode wmode = (mode == DImode ? DImode : SImode);
17239 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
17240 const bool is_sync = is_mm_sync (model);
17241 rtx_code_label *label;
17242 rtx x;
17243
17244 /* Split the atomic operation into a sequence. */
17245 label = gen_label_rtx ();
17246 emit_label (label);
17247
17248 if (new_out)
17249 new_out = gen_lowpart (wmode, new_out);
17250 if (old_out)
17251 old_out = gen_lowpart (wmode, old_out);
17252 else
17253 old_out = new_out;
17254 value = simplify_gen_subreg (wmode, value, mode, 0);
17255
17256 /* The initial load can be relaxed for a __sync operation since a final
17257 barrier will be emitted to stop code hoisting. */
17258 if (is_sync)
17259 aarch64_emit_load_exclusive (mode, old_out, mem,
17260 GEN_INT (MEMMODEL_RELAXED));
17261 else
17262 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17263
17264 switch (code)
17265 {
17266 case SET:
17267 new_out = value;
17268 break;
17269
17270 case NOT:
17271 x = gen_rtx_AND (wmode, old_out, value);
17272 emit_insn (gen_rtx_SET (new_out, x));
17273 x = gen_rtx_NOT (wmode, new_out);
17274 emit_insn (gen_rtx_SET (new_out, x));
17275 break;
17276
17277 case MINUS:
17278 if (CONST_INT_P (value))
17279 {
17280 value = GEN_INT (-INTVAL (value));
17281 code = PLUS;
17282 }
17283 /* Fall through. */
17284
17285 default:
17286 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17287 emit_insn (gen_rtx_SET (new_out, x));
17288 break;
17289 }
17290
17291 aarch64_emit_store_exclusive (mode, cond, mem,
17292 gen_lowpart (mode, new_out), model_rtx);
17293
17294 if (aarch64_track_speculation)
17295 {
17296 /* Emit an explicit compare instruction, so that we can correctly
17297 track the condition codes. */
17298 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17299 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17300 }
17301 else
17302 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17303
17304 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17305 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17306 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17307
17308 /* Emit any final barrier needed for a __sync operation. */
17309 if (is_sync)
17310 aarch64_emit_post_barrier (model);
17311 }
17312
17313 static void
17314 aarch64_init_libfuncs (void)
17315 {
17316 /* Half-precision float operations. The compiler handles all operations
17317 with NULL libfuncs by converting to SFmode. */
17318
17319 /* Conversions. */
17320 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17321 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17322
17323 /* Arithmetic. */
17324 set_optab_libfunc (add_optab, HFmode, NULL);
17325 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17326 set_optab_libfunc (smul_optab, HFmode, NULL);
17327 set_optab_libfunc (neg_optab, HFmode, NULL);
17328 set_optab_libfunc (sub_optab, HFmode, NULL);
17329
17330 /* Comparisons. */
17331 set_optab_libfunc (eq_optab, HFmode, NULL);
17332 set_optab_libfunc (ne_optab, HFmode, NULL);
17333 set_optab_libfunc (lt_optab, HFmode, NULL);
17334 set_optab_libfunc (le_optab, HFmode, NULL);
17335 set_optab_libfunc (ge_optab, HFmode, NULL);
17336 set_optab_libfunc (gt_optab, HFmode, NULL);
17337 set_optab_libfunc (unord_optab, HFmode, NULL);
17338 }
17339
17340 /* Target hook for c_mode_for_suffix. */
17341 static machine_mode
17342 aarch64_c_mode_for_suffix (char suffix)
17343 {
17344 if (suffix == 'q')
17345 return TFmode;
17346
17347 return VOIDmode;
17348 }
17349
17350 /* We can only represent floating point constants which will fit in
17351 "quarter-precision" values. These values are characterised by
17352 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
17353 by:
17354
17355 (-1)^s * (n/16) * 2^r
17356
17357 Where:
17358 's' is the sign bit.
17359 'n' is an integer in the range 16 <= n <= 31.
17360 'r' is an integer in the range -3 <= r <= 4. */
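/* For example, 1.0 is representable as (16/16) * 2^0 and 31.0 as
   (31/16) * 2^4; the representable range of positive values therefore runs
   from 0.125 (= 16/16 * 2^-3) to 31.0, and values such as 0.1 (not an
   exact n/16 * 2^r) are rejected.  */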
17361
17362 /* Return true iff X can be represented by a quarter-precision
17363 floating point immediate operand. Note, we cannot represent 0.0. */
17364 bool
17365 aarch64_float_const_representable_p (rtx x)
17366 {
17367 /* This represents our current view of how many bits
17368 make up the mantissa. */
17369 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17370 int exponent;
17371 unsigned HOST_WIDE_INT mantissa, mask;
17372 REAL_VALUE_TYPE r, m;
17373 bool fail;
17374
17375 x = unwrap_const_vec_duplicate (x);
17376 if (!CONST_DOUBLE_P (x))
17377 return false;
17378
17379 if (GET_MODE (x) == VOIDmode
17380 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17381 return false;
17382
17383 r = *CONST_DOUBLE_REAL_VALUE (x);
17384
17385 /* We cannot represent infinities, NaNs or +/-zero. We won't
17386 know if we have +zero until we analyse the mantissa, but we
17387 can reject the other invalid values. */
17388 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17389 || REAL_VALUE_MINUS_ZERO (r))
17390 return false;
17391
17392 /* Extract exponent. */
17393 r = real_value_abs (&r);
17394 exponent = REAL_EXP (&r);
17395
17396 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17397 highest (sign) bit, with a fixed binary point at bit point_pos.
17398 The low element of W holds the low part of the mantissa, the high element the high part.
17399 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17400 bits for the mantissa, this can fail (low bits will be lost). */
17401 real_ldexp (&m, &r, point_pos - exponent);
17402 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17403
17404 /* If the low part of the mantissa has bits set we cannot represent
17405 the value. */
17406 if (w.ulow () != 0)
17407 return false;
17408 /* We have rejected the lower HOST_WIDE_INT, so update our
17409 understanding of how many bits lie in the mantissa and
17410 look only at the high HOST_WIDE_INT. */
17411 mantissa = w.elt (1);
17412 point_pos -= HOST_BITS_PER_WIDE_INT;
17413
17414 /* We can only represent values with a mantissa of the form 1.xxxx. */
17415 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17416 if ((mantissa & mask) != 0)
17417 return false;
17418
17419 /* Having filtered unrepresentable values, we may now remove all
17420 but the highest 5 bits. */
17421 mantissa >>= point_pos - 5;
17422
17423 /* We cannot represent the value 0.0, so reject it. This is handled
17424 elsewhere. */
17425 if (mantissa == 0)
17426 return false;
17427
17428 /* Then, as bit 4 is always set, we can mask it off, leaving
17429 the mantissa in the range [0, 15]. */
17430 mantissa &= ~(1 << 4);
17431 gcc_assert (mantissa <= 15);
17432
17433 /* GCC internally does not use IEEE754-like encoding (where normalized
17434 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17435 Our mantissa values are shifted 4 places to the left relative to
17436 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17437 by 5 places to correct for GCC's representation. */
17438 exponent = 5 - exponent;
17439
17440 return (exponent >= 0 && exponent <= 7);
17441 }
17442
17443 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17444 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17445 output MOVI/MVNI, ORR or BIC immediate. */
17446 char*
17447 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17448 enum simd_immediate_check which)
17449 {
17450 bool is_valid;
17451 static char templ[40];
17452 const char *mnemonic;
17453 const char *shift_op;
17454 unsigned int lane_count = 0;
17455 char element_char;
17456
17457 struct simd_immediate_info info;
17458
17459 /* This will return true to show const_vector is legal for use as either
17460 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17461 It will also update INFO to show how the immediate should be generated.
17462 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17463 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17464 gcc_assert (is_valid);
17465
17466 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17467 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17468
17469 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17470 {
17471 gcc_assert (info.insn == simd_immediate_info::MOV
17472 && info.u.mov.shift == 0);
17473 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17474 move immediate path. */
17475 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17476 info.u.mov.value = GEN_INT (0);
17477 else
17478 {
17479 const unsigned int buf_size = 20;
17480 char float_buf[buf_size] = {'\0'};
17481 real_to_decimal_for_mode (float_buf,
17482 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17483 buf_size, buf_size, 1, info.elt_mode);
17484
17485 if (lane_count == 1)
17486 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17487 else
17488 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17489 lane_count, element_char, float_buf);
17490 return templ;
17491 }
17492 }
17493
17494 gcc_assert (CONST_INT_P (info.u.mov.value));
17495
17496 if (which == AARCH64_CHECK_MOV)
17497 {
17498 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17499 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17500 ? "msl" : "lsl");
17501 if (lane_count == 1)
17502 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17503 mnemonic, UINTVAL (info.u.mov.value));
17504 else if (info.u.mov.shift)
17505 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17506 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17507 element_char, UINTVAL (info.u.mov.value), shift_op,
17508 info.u.mov.shift);
17509 else
17510 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17511 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17512 element_char, UINTVAL (info.u.mov.value));
17513 }
17514 else
17515 {
17516 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17517 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17518 if (info.u.mov.shift)
17519 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17520 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17521 element_char, UINTVAL (info.u.mov.value), "lsl",
17522 info.u.mov.shift);
17523 else
17524 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17525 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17526 element_char, UINTVAL (info.u.mov.value));
17527 }
17528 return templ;
17529 }
17530
17531 char*
17532 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17533 {
17534
17535 /* If a floating point number was passed and we desire to use it in an
17536 integer mode, do the conversion to integer. */
17537 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17538 {
17539 unsigned HOST_WIDE_INT ival;
17540 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17541 gcc_unreachable ();
17542 immediate = gen_int_mode (ival, mode);
17543 }
17544
17545 machine_mode vmode;
17546 /* Use a 64-bit mode for everything except DI/DF mode, where we use
17547 a 128-bit vector mode. */
17548 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17549
17550 vmode = aarch64_simd_container_mode (mode, width);
17551 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17552 return aarch64_output_simd_mov_immediate (v_op, width);
17553 }
17554
17555 /* Return the output string to use for moving immediate CONST_VECTOR
17556 into an SVE register. */
17557
17558 char *
17559 aarch64_output_sve_mov_immediate (rtx const_vector)
17560 {
17561 static char templ[40];
17562 struct simd_immediate_info info;
17563 char element_char;
17564
17565 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17566 gcc_assert (is_valid);
17567
17568 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17569
17570 machine_mode vec_mode = GET_MODE (const_vector);
17571 if (aarch64_sve_pred_mode_p (vec_mode))
17572 {
17573 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17574 if (info.insn == simd_immediate_info::MOV)
17575 {
17576 gcc_assert (info.u.mov.value == const0_rtx);
17577 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17578 }
17579 else
17580 {
17581 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17582 unsigned int total_bytes;
17583 if (info.u.pattern == AARCH64_SV_ALL
17584 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17585 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17586 total_bytes / GET_MODE_SIZE (info.elt_mode));
17587 else
17588 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17589 svpattern_token (info.u.pattern));
17590 }
17591 return buf;
17592 }
17593
17594 if (info.insn == simd_immediate_info::INDEX)
17595 {
17596 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17597 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17598 element_char, INTVAL (info.u.index.base),
17599 INTVAL (info.u.index.step));
17600 return templ;
17601 }
17602
17603 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17604 {
17605 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17606 info.u.mov.value = GEN_INT (0);
17607 else
17608 {
17609 const int buf_size = 20;
17610 char float_buf[buf_size] = {};
17611 real_to_decimal_for_mode (float_buf,
17612 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17613 buf_size, buf_size, 1, info.elt_mode);
17614
17615 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17616 element_char, float_buf);
17617 return templ;
17618 }
17619 }
17620
17621 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17622 element_char, INTVAL (info.u.mov.value));
17623 return templ;
17624 }
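
/* Some illustrative outputs from the routine above, assuming suitable
   immediates (register names are only examples):
     mov    z0.b, #1         - integer duplicate
     fmov   z0.s, #1.0e+0    - floating-point duplicate
     index  z0.s, #0, #1     - linear series
     ptrue  p0.b, all        - predicate constant  */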
17625
17626 /* Split operands into moves from op[1] + op[2] into op[0]. */
17627
17628 void
17629 aarch64_split_combinev16qi (rtx operands[3])
17630 {
17631 unsigned int dest = REGNO (operands[0]);
17632 unsigned int src1 = REGNO (operands[1]);
17633 unsigned int src2 = REGNO (operands[2]);
17634 machine_mode halfmode = GET_MODE (operands[1]);
17635 unsigned int halfregs = REG_NREGS (operands[1]);
17636 rtx destlo, desthi;
17637
17638 gcc_assert (halfmode == V16QImode);
17639
17640 if (src1 == dest && src2 == dest + halfregs)
17641 {
17642 /* No-op move. Can't split to nothing; emit something. */
17643 emit_note (NOTE_INSN_DELETED);
17644 return;
17645 }
17646
17647 /* Preserve register attributes for variable tracking. */
17648 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17649 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17650 GET_MODE_SIZE (halfmode));
17651
17652 /* Special case of reversed high/low parts. */
17653 if (reg_overlap_mentioned_p (operands[2], destlo)
17654 && reg_overlap_mentioned_p (operands[1], desthi))
17655 {
17656 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17657 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17658 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17659 }
17660 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17661 {
17662 /* Try to avoid unnecessary moves if part of the result
17663 is in the right place already. */
17664 if (src1 != dest)
17665 emit_move_insn (destlo, operands[1]);
17666 if (src2 != dest + halfregs)
17667 emit_move_insn (desthi, operands[2]);
17668 }
17669 else
17670 {
17671 if (src2 != dest + halfregs)
17672 emit_move_insn (desthi, operands[2]);
17673 if (src1 != dest)
17674 emit_move_insn (destlo, operands[1]);
17675 }
17676 }
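
/* The three XORs above implement the classic in-place register swap:
   after a ^= b; b ^= a; a ^= b; the two registers have exchanged values.
   This handles the reversed high/low case even though both destination
   halves are already live, without needing a scratch register. */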
17677
17678 /* vec_perm support. */
17679
17680 struct expand_vec_perm_d
17681 {
17682 rtx target, op0, op1;
17683 vec_perm_indices perm;
17684 machine_mode vmode;
17685 unsigned int vec_flags;
17686 bool one_vector_p;
17687 bool testing_p;
17688 };
17689
17690 /* Generate a variable permutation. */
17691
17692 static void
17693 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17694 {
17695 machine_mode vmode = GET_MODE (target);
17696 bool one_vector_p = rtx_equal_p (op0, op1);
17697
17698 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17699 gcc_checking_assert (GET_MODE (op0) == vmode);
17700 gcc_checking_assert (GET_MODE (op1) == vmode);
17701 gcc_checking_assert (GET_MODE (sel) == vmode);
17702 gcc_checking_assert (TARGET_SIMD);
17703
17704 if (one_vector_p)
17705 {
17706 if (vmode == V8QImode)
17707 {
17708 /* Expand the argument to a V16QI mode by duplicating it. */
17709 rtx pair = gen_reg_rtx (V16QImode);
17710 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17711 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17712 }
17713 else
17714 {
17715 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17716 }
17717 }
17718 else
17719 {
17720 rtx pair;
17721
17722 if (vmode == V8QImode)
17723 {
17724 pair = gen_reg_rtx (V16QImode);
17725 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17726 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17727 }
17728 else
17729 {
17730 pair = gen_reg_rtx (OImode);
17731 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17732 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17733 }
17734 }
17735 }
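
/* In other words, the single-vector V16QI case above becomes one
   "tbl v0.16b, {v1.16b}, v2.16b" (register names illustrative), while the
   two-vector V16QI case first combines the inputs into a register pair
   and uses the two-register form of TBL. */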
17736
17737 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17738 NELT is the number of elements in the vector. */
17739
17740 void
17741 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17742 unsigned int nelt)
17743 {
17744 machine_mode vmode = GET_MODE (target);
17745 bool one_vector_p = rtx_equal_p (op0, op1);
17746 rtx mask;
17747
17748 /* The TBL instruction does not use a modulo index, so we must take care
17749 of that ourselves. */
17750 mask = aarch64_simd_gen_const_vector_dup (vmode,
17751 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17752 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17753
17754 /* For big-endian, we also need to reverse the index within the vector
17755 (but not which vector). */
17756 if (BYTES_BIG_ENDIAN)
17757 {
17758 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17759 if (!one_vector_p)
17760 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17761 sel = expand_simple_binop (vmode, XOR, sel, mask,
17762 NULL, 0, OPTAB_LIB_WIDEN);
17763 }
17764 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17765 }
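
/* As a concrete little-endian example of the masking above: for a
   two-vector V16QI permute, an out-of-range selector byte such as 35 is
   reduced to 35 & 31 == 3, giving the wrapping behaviour that vec_perm
   requires but that TBL itself does not provide. */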
17766
17767 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17768
17769 static void
17770 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17771 {
17772 emit_insn (gen_rtx_SET (target,
17773 gen_rtx_UNSPEC (GET_MODE (target),
17774 gen_rtvec (2, op0, op1), code)));
17775 }
17776
17777 /* Expand an SVE vec_perm with the given operands. */
17778
17779 void
17780 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17781 {
17782 machine_mode data_mode = GET_MODE (target);
17783 machine_mode sel_mode = GET_MODE (sel);
17784 /* Enforced by the pattern condition. */
17785 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17786
17787 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17788 size of the two value vectors, i.e. the upper bits of the indices
17789 are effectively ignored. SVE TBL instead produces 0 for any
17790 out-of-range indices, so we need to modulo all the vec_perm indices
17791 to ensure they are all in range. */
17792 rtx sel_reg = force_reg (sel_mode, sel);
17793
17794 /* Check if the sel only references the first values vector. */
17795 if (GET_CODE (sel) == CONST_VECTOR
17796 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17797 {
17798 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17799 return;
17800 }
17801
17802 /* Check if the two values vectors are the same. */
17803 if (rtx_equal_p (op0, op1))
17804 {
17805 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17806 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17807 NULL, 0, OPTAB_DIRECT);
17808 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17809 return;
17810 }
17811
17812 /* Run TBL on each value vector and combine the results. */
17813
17814 rtx res0 = gen_reg_rtx (data_mode);
17815 rtx res1 = gen_reg_rtx (data_mode);
17816 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17817 if (GET_CODE (sel) != CONST_VECTOR
17818 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17819 {
17820 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17821 2 * nunits - 1);
17822 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17823 NULL, 0, OPTAB_DIRECT);
17824 }
17825 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17826 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17827 NULL, 0, OPTAB_DIRECT);
17828 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17829 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17830 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17831 else
17832 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17833 }
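
/* In the general two-vector case above, the result is built as
   TBL (op0, sel) IORed with TBL (op1, sel - nunits): out-of-range indices
   produce zero in each individual TBL, so every element comes from
   exactly one of the two lookups. */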
17834
17835 /* Recognize patterns suitable for the TRN instructions. */
17836 static bool
17837 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17838 {
17839 HOST_WIDE_INT odd;
17840 poly_uint64 nelt = d->perm.length ();
17841 rtx out, in0, in1, x;
17842 machine_mode vmode = d->vmode;
17843
17844 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17845 return false;
17846
17847 /* Note that these are little-endian tests.
17848 We correct for big-endian later. */
17849 if (!d->perm[0].is_constant (&odd)
17850 || (odd != 0 && odd != 1)
17851 || !d->perm.series_p (0, 2, odd, 2)
17852 || !d->perm.series_p (1, 2, nelt + odd, 2))
17853 return false;
17854
17855 /* Success! */
17856 if (d->testing_p)
17857 return true;
17858
17859 in0 = d->op0;
17860 in1 = d->op1;
17861 /* We don't need a big-endian lane correction for SVE; see the comment
17862 at the head of aarch64-sve.md for details. */
17863 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17864 {
17865 x = in0, in0 = in1, in1 = x;
17866 odd = !odd;
17867 }
17868 out = d->target;
17869
17870 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17871 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17872 return true;
17873 }
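
/* For reference, using the little-endian numbering checked above, the
   index patterns accepted for a V4SI permute are { 0, 4, 2, 6 } (TRN1)
   and { 1, 5, 3, 7 } (TRN2), i.e. pairwise transposition of the even or
   odd lanes of the two inputs. */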
17874
17875 /* Recognize patterns suitable for the UZP instructions. */
17876 static bool
17877 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17878 {
17879 HOST_WIDE_INT odd;
17880 rtx out, in0, in1, x;
17881 machine_mode vmode = d->vmode;
17882
17883 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17884 return false;
17885
17886 /* Note that these are little-endian tests.
17887 We correct for big-endian later. */
17888 if (!d->perm[0].is_constant (&odd)
17889 || (odd != 0 && odd != 1)
17890 || !d->perm.series_p (0, 1, odd, 2))
17891 return false;
17892
17893 /* Success! */
17894 if (d->testing_p)
17895 return true;
17896
17897 in0 = d->op0;
17898 in1 = d->op1;
17899 /* We don't need a big-endian lane correction for SVE; see the comment
17900 at the head of aarch64-sve.md for details. */
17901 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17902 {
17903 x = in0, in0 = in1, in1 = x;
17904 odd = !odd;
17905 }
17906 out = d->target;
17907
17908 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17909 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17910 return true;
17911 }
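
/* For reference, using the little-endian numbering checked above, the
   index patterns accepted for a V4SI permute are { 0, 2, 4, 6 } (UZP1)
   and { 1, 3, 5, 7 } (UZP2), i.e. the even-numbered or odd-numbered
   elements of the concatenated inputs. */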
17912
17913 /* Recognize patterns suitable for the ZIP instructions. */
17914 static bool
17915 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17916 {
17917 unsigned int high;
17918 poly_uint64 nelt = d->perm.length ();
17919 rtx out, in0, in1, x;
17920 machine_mode vmode = d->vmode;
17921
17922 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17923 return false;
17924
17925 /* Note that these are little-endian tests.
17926 We correct for big-endian later. */
17927 poly_uint64 first = d->perm[0];
17928 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17929 || !d->perm.series_p (0, 2, first, 1)
17930 || !d->perm.series_p (1, 2, first + nelt, 1))
17931 return false;
17932 high = maybe_ne (first, 0U);
17933
17934 /* Success! */
17935 if (d->testing_p)
17936 return true;
17937
17938 in0 = d->op0;
17939 in1 = d->op1;
17940 /* We don't need a big-endian lane correction for SVE; see the comment
17941 at the head of aarch64-sve.md for details. */
17942 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17943 {
17944 x = in0, in0 = in1, in1 = x;
17945 high = !high;
17946 }
17947 out = d->target;
17948
17949 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17950 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17951 return true;
17952 }
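
/* For reference, using the little-endian numbering checked above, the
   index patterns accepted for a V4SI permute are { 0, 4, 1, 5 } (ZIP1)
   and { 2, 6, 3, 7 } (ZIP2), i.e. the low or high halves of the two
   inputs interleaved. */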
17953
17954 /* Recognize patterns for the EXT insn. */
17955
17956 static bool
17957 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17958 {
17959 HOST_WIDE_INT location;
17960 rtx offset;
17961
17962 /* The first element always refers to the first vector.
17963 Check if the extracted indices are increasing by one. */
17964 if (d->vec_flags == VEC_SVE_PRED
17965 || !d->perm[0].is_constant (&location)
17966 || !d->perm.series_p (0, 1, location, 1))
17967 return false;
17968
17969 /* Success! */
17970 if (d->testing_p)
17971 return true;
17972
17973 /* The case where (location == 0) is a no-op for both big- and little-endian,
17974 and is removed by the mid-end at optimization levels -O1 and higher.
17975
17976 We don't need a big-endian lane correction for SVE; see the comment
17977 at the head of aarch64-sve.md for details. */
17978 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17979 {
17980 /* After setup, we want the high elements of the first vector (stored
17981 at the LSB end of the register), and the low elements of the second
17982 vector (stored at the MSB end of the register). So swap. */
17983 std::swap (d->op0, d->op1);
17984 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17985 to_constant () is safe since this is restricted to Advanced SIMD
17986 vectors. */
17987 location = d->perm.length ().to_constant () - location;
17988 }
17989
17990 offset = GEN_INT (location);
17991 emit_set_insn (d->target,
17992 gen_rtx_UNSPEC (d->vmode,
17993 gen_rtvec (3, d->op0, d->op1, offset),
17994 UNSPEC_EXT));
17995 return true;
17996 }
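
/* For reference, a V4SI selector such as { 1, 2, 3, 4 } is matched above
   with LOCATION == 1 and becomes a single EXT instruction, extracting a
   vector that starts one element into the concatenation of the two
   inputs. */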
17997
17998 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17999 within each 64-bit, 32-bit or 16-bit granule. */
18000
18001 static bool
18002 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
18003 {
18004 HOST_WIDE_INT diff;
18005 unsigned int i, size, unspec;
18006 machine_mode pred_mode;
18007
18008 if (d->vec_flags == VEC_SVE_PRED
18009 || !d->one_vector_p
18010 || !d->perm[0].is_constant (&diff))
18011 return false;
18012
18013 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
18014 if (size == 8)
18015 {
18016 unspec = UNSPEC_REV64;
18017 pred_mode = VNx2BImode;
18018 }
18019 else if (size == 4)
18020 {
18021 unspec = UNSPEC_REV32;
18022 pred_mode = VNx4BImode;
18023 }
18024 else if (size == 2)
18025 {
18026 unspec = UNSPEC_REV16;
18027 pred_mode = VNx8BImode;
18028 }
18029 else
18030 return false;
18031
18032 unsigned int step = diff + 1;
18033 for (i = 0; i < step; ++i)
18034 if (!d->perm.series_p (i, step, diff - i, step))
18035 return false;
18036
18037 /* Success! */
18038 if (d->testing_p)
18039 return true;
18040
18041 if (d->vec_flags == VEC_SVE_DATA)
18042 {
18043 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
18044 rtx target = gen_reg_rtx (int_mode);
18045 if (BYTES_BIG_ENDIAN)
18046 /* The act of taking a subreg between INT_MODE and d->vmode
18047 is itself a reversing operation on big-endian targets;
18048 see the comment at the head of aarch64-sve.md for details.
18049 First reinterpret OP0 as INT_MODE without using a subreg
18050 and without changing the contents. */
18051 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
18052 else
18053 {
18054 /* For SVE we use REV[BHW] unspecs derived from the element size
18055 of d->vmode and vector modes whose elements have SIZE bytes.
18056 This ensures that the vector modes match the predicate modes. */
18057 int unspec = aarch64_sve_rev_unspec (d->vmode);
18058 rtx pred = aarch64_ptrue_reg (pred_mode);
18059 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
18060 gen_lowpart (int_mode, d->op0)));
18061 }
18062 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
18063 return true;
18064 }
18065 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
18066 emit_set_insn (d->target, src);
18067 return true;
18068 }
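
/* For reference, a V8HI selector of { 3, 2, 1, 0, 7, 6, 5, 4 } is matched
   above with DIFF == 3 (a 64-bit granule) and maps to REV64 on Advanced
   SIMD, or to the equivalent predicated reversal handled in the
   VEC_SVE_DATA branch for SVE. */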
18069
18070 /* Recognize patterns for the REV insn, which reverses elements within
18071 a full vector. */
18072
18073 static bool
18074 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
18075 {
18076 poly_uint64 nelt = d->perm.length ();
18077
18078 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
18079 return false;
18080
18081 if (!d->perm.series_p (0, 1, nelt - 1, -1))
18082 return false;
18083
18084 /* Success! */
18085 if (d->testing_p)
18086 return true;
18087
18088 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
18089 emit_set_insn (d->target, src);
18090 return true;
18091 }
18092
18093 static bool
18094 aarch64_evpc_dup (struct expand_vec_perm_d *d)
18095 {
18096 rtx out = d->target;
18097 rtx in0;
18098 HOST_WIDE_INT elt;
18099 machine_mode vmode = d->vmode;
18100 rtx lane;
18101
18102 if (d->vec_flags == VEC_SVE_PRED
18103 || d->perm.encoding ().encoded_nelts () != 1
18104 || !d->perm[0].is_constant (&elt))
18105 return false;
18106
18107 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
18108 return false;
18109
18110 /* Success! */
18111 if (d->testing_p)
18112 return true;
18113
18114 /* The generic preparation in aarch64_expand_vec_perm_const_1
18115 swaps the operand order and the permute indices if it finds
18116 d->perm[0] to be in the second operand. Thus, we can always
18117 use d->op0 and need not do any extra arithmetic to get the
18118 correct lane number. */
18119 in0 = d->op0;
18120 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
18121
18122 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
18123 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
18124 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
18125 return true;
18126 }
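
/* For reference, a selector whose encoding is a single repeated index,
   e.g. { 2, 2, 2, 2 } for V4SI, is matched above and becomes a DUP of
   lane 2 of the first input, e.g. "dup v0.4s, v1.s[2]" (register names
   illustrative). */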
18127
18128 static bool
18129 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
18130 {
18131 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
18132 machine_mode vmode = d->vmode;
18133
18134 /* Make sure that the indices are constant. */
18135 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
18136 for (unsigned int i = 0; i < encoded_nelts; ++i)
18137 if (!d->perm[i].is_constant ())
18138 return false;
18139
18140 if (d->testing_p)
18141 return true;
18142
18143 /* Generic code will try constant permutation twice: once with the
18144 original mode and again with the elements lowered to QImode.
18145 So wait and don't do the selector expansion ourselves. */
18146 if (vmode != V8QImode && vmode != V16QImode)
18147 return false;
18148
18149 /* to_constant is safe since this routine is specific to Advanced SIMD
18150 vectors. */
18151 unsigned int nelt = d->perm.length ().to_constant ();
18152 for (unsigned int i = 0; i < nelt; ++i)
18153 /* If big-endian and two vectors, we end up with a weird mixed-endian
18154 mode on NEON. Reverse the index within each word but not the word
18155 itself. to_constant is safe because we checked is_constant above. */
18156 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
18157 ? d->perm[i].to_constant () ^ (nelt - 1)
18158 : d->perm[i].to_constant ());
18159
18160 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
18161 sel = force_reg (vmode, sel);
18162
18163 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
18164 return true;
18165 }
18166
18167 /* Try to implement D using an SVE TBL instruction. */
18168
18169 static bool
18170 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
18171 {
18172 unsigned HOST_WIDE_INT nelt;
18173
18174 /* Permuting two variable-length vectors could overflow the
18175 index range. */
18176 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
18177 return false;
18178
18179 if (d->testing_p)
18180 return true;
18181
18182 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
18183 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
18184 if (d->one_vector_p)
18185 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
18186 else
18187 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
18188 return true;
18189 }
18190
18191 /* Try to implement D using SVE SEL instruction. */
18192
18193 static bool
18194 aarch64_evpc_sel (struct expand_vec_perm_d *d)
18195 {
18196 machine_mode vmode = d->vmode;
18197 int unit_size = GET_MODE_UNIT_SIZE (vmode);
18198
18199 if (d->vec_flags != VEC_SVE_DATA
18200 || unit_size > 8)
18201 return false;
18202
18203 int n_patterns = d->perm.encoding ().npatterns ();
18204 poly_int64 vec_len = d->perm.length ();
18205
18206 for (int i = 0; i < n_patterns; ++i)
18207 if (!known_eq (d->perm[i], i)
18208 && !known_eq (d->perm[i], vec_len + i))
18209 return false;
18210
18211 for (int i = n_patterns; i < n_patterns * 2; i++)
18212 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
18213 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
18214 return false;
18215
18216 if (d->testing_p)
18217 return true;
18218
18219 machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
18220
18221 rtx_vector_builder builder (pred_mode, n_patterns, 2);
18222 for (int i = 0; i < n_patterns * 2; i++)
18223 {
18224 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
18225 : CONST0_RTX (BImode);
18226 builder.quick_push (elem);
18227 }
18228
18229 rtx const_vec = builder.build ();
18230 rtx pred = force_reg (pred_mode, const_vec);
18231 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
18232 return true;
18233 }
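
/* For reference, the checks above accept selectors in which lane I is
   either I (take element I of the first input) or LEN + I (take element I
   of the second input); that per-lane choice is exactly what a predicated
   SEL provides. */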
18234
18235 static bool
18236 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
18237 {
18238 /* The pattern matching functions above are written to look for a small
18239 number to begin the sequence (0, 1, N/2). If we begin with an index
18240 from the second operand, we can swap the operands. */
18241 poly_int64 nelt = d->perm.length ();
18242 if (known_ge (d->perm[0], nelt))
18243 {
18244 d->perm.rotate_inputs (1);
18245 std::swap (d->op0, d->op1);
18246 }
18247
18248 if ((d->vec_flags == VEC_ADVSIMD
18249 || d->vec_flags == VEC_SVE_DATA
18250 || d->vec_flags == VEC_SVE_PRED)
18251 && known_gt (nelt, 1))
18252 {
18253 if (aarch64_evpc_rev_local (d))
18254 return true;
18255 else if (aarch64_evpc_rev_global (d))
18256 return true;
18257 else if (aarch64_evpc_ext (d))
18258 return true;
18259 else if (aarch64_evpc_dup (d))
18260 return true;
18261 else if (aarch64_evpc_zip (d))
18262 return true;
18263 else if (aarch64_evpc_uzp (d))
18264 return true;
18265 else if (aarch64_evpc_trn (d))
18266 return true;
18267 else if (aarch64_evpc_sel (d))
18268 return true;
18269 if (d->vec_flags == VEC_SVE_DATA)
18270 return aarch64_evpc_sve_tbl (d);
18271 else if (d->vec_flags == VEC_ADVSIMD)
18272 return aarch64_evpc_tbl (d);
18273 }
18274 return false;
18275 }
18276
18277 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
18278
18279 static bool
18280 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
18281 rtx op1, const vec_perm_indices &sel)
18282 {
18283 struct expand_vec_perm_d d;
18284
18285 /* Check whether the mask can be applied to a single vector. */
18286 if (sel.ninputs () == 1
18287 || (op0 && rtx_equal_p (op0, op1)))
18288 d.one_vector_p = true;
18289 else if (sel.all_from_input_p (0))
18290 {
18291 d.one_vector_p = true;
18292 op1 = op0;
18293 }
18294 else if (sel.all_from_input_p (1))
18295 {
18296 d.one_vector_p = true;
18297 op0 = op1;
18298 }
18299 else
18300 d.one_vector_p = false;
18301
18302 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18303 sel.nelts_per_input ());
18304 d.vmode = vmode;
18305 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18306 d.target = target;
18307 d.op0 = op0;
18308 d.op1 = op1;
18309 d.testing_p = !target;
18310
18311 if (!d.testing_p)
18312 return aarch64_expand_vec_perm_const_1 (&d);
18313
18314 rtx_insn *last = get_last_insn ();
18315 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18316 gcc_assert (last == get_last_insn ());
18317
18318 return ret;
18319 }
18320
18321 /* Generate a byte permute mask for a register of mode MODE,
18322 which has NUNITS units. */
18323
18324 rtx
18325 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18326 {
18327 /* We have to reverse each vector because we don't have
18328 a permuted load that can reverse-load according to ABI rules. */
18329 rtx mask;
18330 rtvec v = rtvec_alloc (16);
18331 unsigned int i, j;
18332 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18333
18334 gcc_assert (BYTES_BIG_ENDIAN);
18335 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18336
18337 for (i = 0; i < nunits; i++)
18338 for (j = 0; j < usize; j++)
18339 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18340 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18341 return force_reg (V16QImode, mask);
18342 }
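
/* For example, for V8HImode (NUNITS == 8, unit size 2) the mask built
   above is the byte permutation { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, ... },
   i.e. the bytes within each element are reversed while the element
   order is preserved. */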
18343
18344 /* Expand an SVE integer comparison using the SVE equivalent of:
18345
18346 (set TARGET (CODE OP0 OP1)). */
18347
18348 void
18349 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18350 {
18351 machine_mode pred_mode = GET_MODE (target);
18352 machine_mode data_mode = GET_MODE (op0);
18353 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18354 op0, op1);
18355 if (!rtx_equal_p (target, res))
18356 emit_move_insn (target, res);
18357 }
18358
18359 /* Return the UNSPEC_COND_* code for comparison CODE. */
18360
18361 static unsigned int
18362 aarch64_unspec_cond_code (rtx_code code)
18363 {
18364 switch (code)
18365 {
18366 case NE:
18367 return UNSPEC_COND_FCMNE;
18368 case EQ:
18369 return UNSPEC_COND_FCMEQ;
18370 case LT:
18371 return UNSPEC_COND_FCMLT;
18372 case GT:
18373 return UNSPEC_COND_FCMGT;
18374 case LE:
18375 return UNSPEC_COND_FCMLE;
18376 case GE:
18377 return UNSPEC_COND_FCMGE;
18378 case UNORDERED:
18379 return UNSPEC_COND_FCMUO;
18380 default:
18381 gcc_unreachable ();
18382 }
18383 }
18384
18385 /* Emit:
18386
18387 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18388
18389 where <X> is the operation associated with comparison CODE.
18390 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18391
18392 static void
18393 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18394 bool known_ptrue_p, rtx op0, rtx op1)
18395 {
18396 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18397 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18398 gen_rtvec (4, pred, flag, op0, op1),
18399 aarch64_unspec_cond_code (code));
18400 emit_set_insn (target, unspec);
18401 }
18402
18403 /* Emit the SVE equivalent of:
18404
18405 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18406 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18407 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18408
18409 where <Xi> is the operation associated with comparison CODEi.
18410 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18411
18412 static void
18413 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18414 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18415 {
18416 machine_mode pred_mode = GET_MODE (pred);
18417 rtx tmp1 = gen_reg_rtx (pred_mode);
18418 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18419 rtx tmp2 = gen_reg_rtx (pred_mode);
18420 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18421 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18422 }
18423
18424 /* Emit the SVE equivalent of:
18425
18426 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18427 (set TARGET (not TMP))
18428
18429 where <X> is the operation associated with comparison CODE.
18430 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18431
18432 static void
18433 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18434 bool known_ptrue_p, rtx op0, rtx op1)
18435 {
18436 machine_mode pred_mode = GET_MODE (pred);
18437 rtx tmp = gen_reg_rtx (pred_mode);
18438 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18439 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18440 }
18441
18442 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18443
18444 (set TARGET (CODE OP0 OP1))
18445
18446 If CAN_INVERT_P is true, the caller can also handle inverted results;
18447 return true if the result is in fact inverted. */
18448
18449 bool
18450 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18451 rtx op0, rtx op1, bool can_invert_p)
18452 {
18453 machine_mode pred_mode = GET_MODE (target);
18454 machine_mode data_mode = GET_MODE (op0);
18455
18456 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18457 switch (code)
18458 {
18459 case UNORDERED:
18460 /* UNORDERED has no immediate form. */
18461 op1 = force_reg (data_mode, op1);
18462 /* fall through */
18463 case LT:
18464 case LE:
18465 case GT:
18466 case GE:
18467 case EQ:
18468 case NE:
18469 {
18470 /* There is native support for the comparison. */
18471 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18472 return false;
18473 }
18474
18475 case LTGT:
18476 /* This is a trapping operation (LT or GT). */
18477 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18478 return false;
18479
18480 case UNEQ:
18481 if (!flag_trapping_math)
18482 {
18483 /* This would trap for signaling NaNs. */
18484 op1 = force_reg (data_mode, op1);
18485 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18486 ptrue, true, op0, op1);
18487 return false;
18488 }
18489 /* fall through */
18490 case UNLT:
18491 case UNLE:
18492 case UNGT:
18493 case UNGE:
18494 if (flag_trapping_math)
18495 {
18496 /* Work out which elements are ordered. */
18497 rtx ordered = gen_reg_rtx (pred_mode);
18498 op1 = force_reg (data_mode, op1);
18499 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18500 ptrue, true, op0, op1);
18501
18502 /* Test the opposite condition for the ordered elements,
18503 then invert the result. */
18504 if (code == UNEQ)
18505 code = NE;
18506 else
18507 code = reverse_condition_maybe_unordered (code);
18508 if (can_invert_p)
18509 {
18510 aarch64_emit_sve_fp_cond (target, code,
18511 ordered, false, op0, op1);
18512 return true;
18513 }
18514 aarch64_emit_sve_invert_fp_cond (target, code,
18515 ordered, false, op0, op1);
18516 return false;
18517 }
18518 break;
18519
18520 case ORDERED:
18521 /* ORDERED has no immediate form. */
18522 op1 = force_reg (data_mode, op1);
18523 break;
18524
18525 default:
18526 gcc_unreachable ();
18527 }
18528
18529 /* There is native support for the inverse comparison. */
18530 code = reverse_condition_maybe_unordered (code);
18531 if (can_invert_p)
18532 {
18533 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18534 return true;
18535 }
18536 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18537 return false;
18538 }
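
/* For example, with trapping math a comparison such as UNLT is handled
   above by computing the ordered lanes with an inverted FCMUO, testing GE
   on just those lanes and then inverting the result (or returning the
   inverted form directly when CAN_INVERT_P), so that unordered inputs
   cannot raise spurious exceptions. */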
18539
18540 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18541 of the data being selected and CMP_MODE is the mode of the values being
18542 compared. */
18543
18544 void
18545 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18546 rtx *ops)
18547 {
18548 machine_mode pred_mode
18549 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18550 GET_MODE_SIZE (cmp_mode)).require ();
18551 rtx pred = gen_reg_rtx (pred_mode);
18552 if (FLOAT_MODE_P (cmp_mode))
18553 {
18554 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18555 ops[4], ops[5], true))
18556 std::swap (ops[1], ops[2]);
18557 }
18558 else
18559 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18560
18561 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18562 ops[1] = force_reg (data_mode, ops[1]);
18563 /* The "false" value can only be zero if the "true" value is a constant. */
18564 if (register_operand (ops[1], data_mode)
18565 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18566 ops[2] = force_reg (data_mode, ops[2]);
18567
18568 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18569 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18570 }
18571
18572 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18573 true. However due to issues with register allocation it is preferable
18574 to avoid tying integer scalar and FP scalar modes. Executing integer
18575 operations in general registers is better than treating them as scalar
18576 vector operations. This reduces latency and avoids redundant int<->FP
18577 moves. So tie modes if they are either the same class, or vector modes
18578 with other vector modes, vector structs or any scalar mode. */
18579
18580 static bool
18581 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18582 {
18583 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18584 return true;
18585
18586 /* We specifically want to allow elements of "structure" modes to
18587 be tieable to the structure. This more general condition allows
18588 other rarer situations too. The reason we don't extend this to
18589 predicate modes is that there are no predicate structure modes
18590 nor any specific instructions for extracting part of a predicate
18591 register. */
18592 if (aarch64_vector_data_mode_p (mode1)
18593 && aarch64_vector_data_mode_p (mode2))
18594 return true;
18595
18596 /* Also allow any scalar modes with vectors. */
18597 if (aarch64_vector_mode_supported_p (mode1)
18598 || aarch64_vector_mode_supported_p (mode2))
18599 return true;
18600
18601 return false;
18602 }
18603
18604 /* Return a new RTX holding the result of moving POINTER forward by
18605 AMOUNT bytes. */
18606
18607 static rtx
18608 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18609 {
18610 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18611
18612 return adjust_automodify_address (pointer, GET_MODE (pointer),
18613 next, amount);
18614 }
18615
18616 /* Return a new RTX holding the result of moving POINTER forward by the
18617 size of the mode it points to. */
18618
18619 static rtx
18620 aarch64_progress_pointer (rtx pointer)
18621 {
18622 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18623 }
18624
18625 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18626 MODE bytes. */
18627
18628 static void
18629 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18630 machine_mode mode)
18631 {
18632 rtx reg = gen_reg_rtx (mode);
18633
18634 /* "Cast" the pointers to the correct mode. */
18635 *src = adjust_address (*src, mode, 0);
18636 *dst = adjust_address (*dst, mode, 0);
18637 /* Emit the memcpy. */
18638 emit_move_insn (reg, *src);
18639 emit_move_insn (*dst, reg);
18640 /* Move the pointers forward. */
18641 *src = aarch64_progress_pointer (*src);
18642 *dst = aarch64_progress_pointer (*dst);
18643 }
18644
18645 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18646 we succeed, otherwise return false. */
18647
18648 bool
18649 aarch64_expand_cpymem (rtx *operands)
18650 {
18651 int n, mode_bits;
18652 rtx dst = operands[0];
18653 rtx src = operands[1];
18654 rtx base;
18655 machine_mode cur_mode = BLKmode, next_mode;
18656 bool speed_p = !optimize_function_for_size_p (cfun);
18657
18658 /* When optimizing for size, give a better estimate of the length of a
18659 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18660 will always require an even number of instructions to perform. And each
18661 operation requires both a load and a store, so divide the max number by 2. */
18662 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18663
18664 /* We can't do anything smart if the amount to copy is not constant. */
18665 if (!CONST_INT_P (operands[2]))
18666 return false;
18667
18668 n = INTVAL (operands[2]);
18669
18670 /* Try to keep the number of instructions low. For all cases we will do at
18671 most two moves for the residual amount, since we'll always overlap the
18672 remainder. */
18673 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18674 return false;
18675
18676 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18677 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18678
18679 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18680 src = adjust_automodify_address (src, VOIDmode, base, 0);
18681
18682 /* Convert n to bits to make the rest of the code simpler. */
18683 n = n * BITS_PER_UNIT;
18684
18685 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18686 larger than TImode, but we should not use them for loads/stores here. */
18687 const int copy_limit = GET_MODE_BITSIZE (TImode);
18688
18689 while (n > 0)
18690 {
18691 /* Find the largest mode in which to do the copy without over-reading
18692 or over-writing. */
18693 opt_scalar_int_mode mode_iter;
18694 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18695 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18696 cur_mode = mode_iter.require ();
18697
18698 gcc_assert (cur_mode != BLKmode);
18699
18700 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18701 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18702
18703 n -= mode_bits;
18704
18705 /* Do certain trailing copies as overlapping if it's going to be
18706 cheaper, i.e. fewer instructions to do so. For instance, for a 15-byte
18707 copy it's more efficient to do two overlapping 8-byte copies than
18708 8 + 6 + 1. */
18709 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18710 {
18711 next_mode = smallest_mode_for_size (n, MODE_INT);
18712 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18713 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18714 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18715 n = n_bits;
18716 }
18717 }
18718
18719 return true;
18720 }
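
/* As a worked example of the tail handling above: a 15-byte copy is
   expanded as one 8-byte (DImode) copy from offset 0 followed by an
   overlapping 8-byte copy from offset 7, rather than by separate smaller
   copies of the 7-byte remainder. */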
18721
18722 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18723 SImode stores. Handle the case when the constant has identical
18724 bottom and top halves. This is beneficial when the two stores can be
18725 merged into an STP and we avoid synthesising potentially expensive
18726 immediates twice. Return true if such a split is possible. */
18727
18728 bool
18729 aarch64_split_dimode_const_store (rtx dst, rtx src)
18730 {
18731 rtx lo = gen_lowpart (SImode, src);
18732 rtx hi = gen_highpart_mode (SImode, DImode, src);
18733
18734 bool size_p = optimize_function_for_size_p (cfun);
18735
18736 if (!rtx_equal_p (lo, hi))
18737 return false;
18738
18739 unsigned int orig_cost
18740 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18741 unsigned int lo_cost
18742 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18743
18744 /* We want to transform:
18745 MOV x1, 49370
18746 MOVK x1, 0x140, lsl 16
18747 MOVK x1, 0xc0da, lsl 32
18748 MOVK x1, 0x140, lsl 48
18749 STR x1, [x0]
18750 into:
18751 MOV w1, 49370
18752 MOVK w1, 0x140, lsl 16
18753 STP w1, w1, [x0]
18754 So we want to perform this only when we save two instructions
18755 or more. When optimizing for size, however, accept any code size
18756 savings we can. */
18757 if (size_p && orig_cost <= lo_cost)
18758 return false;
18759
18760 if (!size_p
18761 && (orig_cost <= lo_cost + 1))
18762 return false;
18763
18764 rtx mem_lo = adjust_address (dst, SImode, 0);
18765 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18766 return false;
18767
18768 rtx tmp_reg = gen_reg_rtx (SImode);
18769 aarch64_expand_mov_immediate (tmp_reg, lo);
18770 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18771 /* Don't emit an explicit store pair as this may not always be profitable.
18772 Let the sched-fusion logic decide whether to merge them. */
18773 emit_move_insn (mem_lo, tmp_reg);
18774 emit_move_insn (mem_hi, tmp_reg);
18775
18776 return true;
18777 }
18778
18779 /* Generate RTL for a conditional branch with rtx comparison CODE in
18780 mode CC_MODE. The destination of the unlikely conditional branch
18781 is LABEL_REF. */
18782
18783 void
18784 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18785 rtx label_ref)
18786 {
18787 rtx x;
18788 x = gen_rtx_fmt_ee (code, VOIDmode,
18789 gen_rtx_REG (cc_mode, CC_REGNUM),
18790 const0_rtx);
18791
18792 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18793 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18794 pc_rtx);
18795 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18796 }
18797
18798 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18799
18800 OP1 represents the TImode destination operand 1
18801 OP2 represents the TImode destination operand 2
18802 LOW_DEST represents the low half (DImode) of TImode operand 0
18803 LOW_IN1 represents the low half (DImode) of TImode operand 1
18804 LOW_IN2 represents the low half (DImode) of TImode operand 2
18805 HIGH_DEST represents the high half (DImode) of TImode operand 0
18806 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18807 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18808
18809 void
18810 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18811 rtx *low_in1, rtx *low_in2,
18812 rtx *high_dest, rtx *high_in1,
18813 rtx *high_in2)
18814 {
18815 *low_dest = gen_reg_rtx (DImode);
18816 *low_in1 = gen_lowpart (DImode, op1);
18817 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18818 subreg_lowpart_offset (DImode, TImode));
18819 *high_dest = gen_reg_rtx (DImode);
18820 *high_in1 = gen_highpart (DImode, op1);
18821 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18822 subreg_highpart_offset (DImode, TImode));
18823 }
18824
18825 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18826
18827 This function differs from 'aarch64_addti_scratch_regs' in that
18828 OP1 can be an immediate constant (zero). We must call
18829 subreg_highpart_offset with DImode and TImode arguments, otherwise
18830 VOIDmode will be used for the const_int which generates an internal
18831 error from subreg_size_highpart_offset which does not expect a size of zero.
18832
18833 OP1 represents the TImode destination operand 1
18834 OP2 represents the TImode destination operand 2
18835 LOW_DEST represents the low half (DImode) of TImode operand 0
18836 LOW_IN1 represents the low half (DImode) of TImode operand 1
18837 LOW_IN2 represents the low half (DImode) of TImode operand 2
18838 HIGH_DEST represents the high half (DImode) of TImode operand 0
18839 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18840 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18841
18842
18843 void
18844 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18845 rtx *low_in1, rtx *low_in2,
18846 rtx *high_dest, rtx *high_in1,
18847 rtx *high_in2)
18848 {
18849 *low_dest = gen_reg_rtx (DImode);
18850 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18851 subreg_lowpart_offset (DImode, TImode));
18852
18853 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18854 subreg_lowpart_offset (DImode, TImode));
18855 *high_dest = gen_reg_rtx (DImode);
18856
18857 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18858 subreg_highpart_offset (DImode, TImode));
18859 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18860 subreg_highpart_offset (DImode, TImode));
18861 }
18862
18863 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18864
18865 OP0 represents the TImode destination operand 0
18866 LOW_DEST represents the low half (DImode) of TImode operand 0
18867 LOW_IN1 represents the low half (DImode) of TImode operand 1
18868 LOW_IN2 represents the low half (DImode) of TImode operand 2
18869 HIGH_DEST represents the high half (DImode) of TImode operand 0
18870 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18871 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18872 UNSIGNED_P is true if the operation is being performed on unsigned
18873 values. */
18874 void
18875 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18876 rtx low_in2, rtx high_dest, rtx high_in1,
18877 rtx high_in2, bool unsigned_p)
18878 {
18879 if (low_in2 == const0_rtx)
18880 {
18881 low_dest = low_in1;
18882 high_in2 = force_reg (DImode, high_in2);
18883 if (unsigned_p)
18884 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18885 else
18886 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18887 }
18888 else
18889 {
18890 if (CONST_INT_P (low_in2))
18891 {
18892 high_in2 = force_reg (DImode, high_in2);
18893 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18894 GEN_INT (-INTVAL (low_in2))));
18895 }
18896 else
18897 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18898
18899 if (unsigned_p)
18900 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18901 else
18902 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18903 }
18904
18905 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18906 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18907
18908 }
18909
18910 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18911
18912 static unsigned HOST_WIDE_INT
18913 aarch64_asan_shadow_offset (void)
18914 {
18915 if (TARGET_ILP32)
18916 return (HOST_WIDE_INT_1 << 29);
18917 else
18918 return (HOST_WIDE_INT_1 << 36);
18919 }
18920
18921 static rtx
18922 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18923 int code, tree treeop0, tree treeop1)
18924 {
18925 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18926 rtx op0, op1;
18927 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18928 insn_code icode;
18929 struct expand_operand ops[4];
18930
18931 start_sequence ();
18932 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18933
18934 op_mode = GET_MODE (op0);
18935 if (op_mode == VOIDmode)
18936 op_mode = GET_MODE (op1);
18937
18938 switch (op_mode)
18939 {
18940 case E_QImode:
18941 case E_HImode:
18942 case E_SImode:
18943 cmp_mode = SImode;
18944 icode = CODE_FOR_cmpsi;
18945 break;
18946
18947 case E_DImode:
18948 cmp_mode = DImode;
18949 icode = CODE_FOR_cmpdi;
18950 break;
18951
18952 case E_SFmode:
18953 cmp_mode = SFmode;
18954 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18955 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18956 break;
18957
18958 case E_DFmode:
18959 cmp_mode = DFmode;
18960 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18961 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18962 break;
18963
18964 default:
18965 end_sequence ();
18966 return NULL_RTX;
18967 }
18968
18969 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18970 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18971 if (!op0 || !op1)
18972 {
18973 end_sequence ();
18974 return NULL_RTX;
18975 }
18976 *prep_seq = get_insns ();
18977 end_sequence ();
18978
18979 create_fixed_operand (&ops[0], op0);
18980 create_fixed_operand (&ops[1], op1);
18981
18982 start_sequence ();
18983 if (!maybe_expand_insn (icode, 2, ops))
18984 {
18985 end_sequence ();
18986 return NULL_RTX;
18987 }
18988 *gen_seq = get_insns ();
18989 end_sequence ();
18990
18991 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18992 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18993 }
18994
18995 static rtx
18996 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18997 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18998 {
18999 rtx op0, op1, target;
19000 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
19001 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
19002 insn_code icode;
19003 struct expand_operand ops[6];
19004 int aarch64_cond;
19005
19006 push_to_sequence (*prep_seq);
19007 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
19008
19009 op_mode = GET_MODE (op0);
19010 if (op_mode == VOIDmode)
19011 op_mode = GET_MODE (op1);
19012
19013 switch (op_mode)
19014 {
19015 case E_QImode:
19016 case E_HImode:
19017 case E_SImode:
19018 cmp_mode = SImode;
19019 icode = CODE_FOR_ccmpsi;
19020 break;
19021
19022 case E_DImode:
19023 cmp_mode = DImode;
19024 icode = CODE_FOR_ccmpdi;
19025 break;
19026
19027 case E_SFmode:
19028 cmp_mode = SFmode;
19029 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19030 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
19031 break;
19032
19033 case E_DFmode:
19034 cmp_mode = DFmode;
19035 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
19036 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
19037 break;
19038
19039 default:
19040 end_sequence ();
19041 return NULL_RTX;
19042 }
19043
19044 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
19045 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
19046 if (!op0 || !op1)
19047 {
19048 end_sequence ();
19049 return NULL_RTX;
19050 }
19051 *prep_seq = get_insns ();
19052 end_sequence ();
19053
19054 target = gen_rtx_REG (cc_mode, CC_REGNUM);
19055 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
19056
19057 if (bit_code != AND)
19058 {
19059 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
19060 GET_MODE (XEXP (prev, 0))),
19061 VOIDmode, XEXP (prev, 0), const0_rtx);
19062 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
19063 }
19064
19065 create_fixed_operand (&ops[0], XEXP (prev, 0));
19066 create_fixed_operand (&ops[1], target);
19067 create_fixed_operand (&ops[2], op0);
19068 create_fixed_operand (&ops[3], op1);
19069 create_fixed_operand (&ops[4], prev);
19070 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
19071
19072 push_to_sequence (*gen_seq);
19073 if (!maybe_expand_insn (icode, 6, ops))
19074 {
19075 end_sequence ();
19076 return NULL_RTX;
19077 }
19078
19079 *gen_seq = get_insns ();
19080 end_sequence ();
19081
19082 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
19083 }
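
/* Taken together, the two hooks above allow a combined condition such as
   (a == 0 && b > 2) to be expanded as roughly
     cmp   w0, 0
     ccmp  w1, 2, <nzcv>, eq
     b.gt  ...
   where <nzcv> encodes the flag value to substitute when the first
   comparison already settles the result (illustrative assembly only; the
   exact encoding is chosen by the ccmp patterns). */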
19084
19085 #undef TARGET_GEN_CCMP_FIRST
19086 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
19087
19088 #undef TARGET_GEN_CCMP_NEXT
19089 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
19090
19091 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
19092 instruction fusion of some sort. */
19093
19094 static bool
19095 aarch64_macro_fusion_p (void)
19096 {
19097 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
19098 }
19099
19100
19101 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
19102 should be kept together during scheduling. */
19103
19104 static bool
19105 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
19106 {
19107 rtx set_dest;
19108 rtx prev_set = single_set (prev);
19109 rtx curr_set = single_set (curr);
19110 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
19111 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
19112
19113 if (!aarch64_macro_fusion_p ())
19114 return false;
19115
19116 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
19117 {
19118 /* We are trying to match:
19119 prev (mov) == (set (reg r0) (const_int imm16))
19120 curr (movk) == (set (zero_extract (reg r0)
19121 (const_int 16)
19122 (const_int 16))
19123 (const_int imm16_1)) */
19124
19125 set_dest = SET_DEST (curr_set);
19126
19127 if (GET_CODE (set_dest) == ZERO_EXTRACT
19128 && CONST_INT_P (SET_SRC (curr_set))
19129 && CONST_INT_P (SET_SRC (prev_set))
19130 && CONST_INT_P (XEXP (set_dest, 2))
19131 && INTVAL (XEXP (set_dest, 2)) == 16
19132 && REG_P (XEXP (set_dest, 0))
19133 && REG_P (SET_DEST (prev_set))
19134 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
19135 {
19136 return true;
19137 }
19138 }
19139
19140 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
19141 {
19142
19143 /* We're trying to match:
19144 prev (adrp) == (set (reg r1)
19145 (high (symbol_ref ("SYM"))))
19146 curr (add) == (set (reg r0)
19147 (lo_sum (reg r1)
19148 (symbol_ref ("SYM"))))
19149 Note that r0 need not necessarily be the same as r1, especially
19150 during pre-regalloc scheduling. */
19151
19152 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19153 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19154 {
19155 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
19156 && REG_P (XEXP (SET_SRC (curr_set), 0))
19157 && REGNO (XEXP (SET_SRC (curr_set), 0))
19158 == REGNO (SET_DEST (prev_set))
19159 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
19160 XEXP (SET_SRC (curr_set), 1)))
19161 return true;
19162 }
19163 }
19164
19165 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
19166 {
19167
19168 /* We're trying to match:
19169 prev (movk) == (set (zero_extract (reg r0)
19170 (const_int 16)
19171 (const_int 32))
19172 (const_int imm16_1))
19173 curr (movk) == (set (zero_extract (reg r0)
19174 (const_int 16)
19175 (const_int 48))
19176 (const_int imm16_2)) */
19177
19178 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
19179 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
19180 && REG_P (XEXP (SET_DEST (prev_set), 0))
19181 && REG_P (XEXP (SET_DEST (curr_set), 0))
19182 && REGNO (XEXP (SET_DEST (prev_set), 0))
19183 == REGNO (XEXP (SET_DEST (curr_set), 0))
19184 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
19185 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
19186 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
19187 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
19188 && CONST_INT_P (SET_SRC (prev_set))
19189 && CONST_INT_P (SET_SRC (curr_set)))
19190 return true;
19191
19192 }
19193 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
19194 {
19195 /* We're trying to match:
19196 prev (adrp) == (set (reg r0)
19197 (high (symbol_ref ("SYM"))))
19198 curr (ldr) == (set (reg r1)
19199 (mem (lo_sum (reg r0)
19200 (symbol_ref ("SYM")))))
19201 or
19202 curr (ldr) == (set (reg r1)
19203 (zero_extend (mem
19204 (lo_sum (reg r0)
19205 (symbol_ref ("SYM")))))) */
19206 if (satisfies_constraint_Ush (SET_SRC (prev_set))
19207 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
19208 {
19209 rtx curr_src = SET_SRC (curr_set);
19210
19211 if (GET_CODE (curr_src) == ZERO_EXTEND)
19212 curr_src = XEXP (curr_src, 0);
19213
19214 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
19215 && REG_P (XEXP (XEXP (curr_src, 0), 0))
19216 && REGNO (XEXP (XEXP (curr_src, 0), 0))
19217 == REGNO (SET_DEST (prev_set))
19218 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
19219 XEXP (SET_SRC (prev_set), 0)))
19220 return true;
19221 }
19222 }
19223
19224 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
19225 && any_condjump_p (curr))
19226 {
19227 unsigned int condreg1, condreg2;
19228 rtx cc_reg_1;
19229 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
19230 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
19231
19232 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
19233 && prev
19234 && modified_in_p (cc_reg_1, prev))
19235 {
19236 enum attr_type prev_type = get_attr_type (prev);
19237
19238 /* FIXME: this misses some instructions that are considered simple
19239 arithmetic for ThunderX. Simple shifts are missed here. */
19240 if (prev_type == TYPE_ALUS_SREG
19241 || prev_type == TYPE_ALUS_IMM
19242 || prev_type == TYPE_LOGICS_REG
19243 || prev_type == TYPE_LOGICS_IMM)
19244 return true;
19245 }
19246 }
19247
19248 if (prev_set
19249 && curr_set
19250 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
19251 && any_condjump_p (curr))
19252 {
19253 /* We're trying to match:
19254 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
19255 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
19256 (const_int 0))
19257 (label_ref ("SYM"))
19258 (pc)) */
19259 if (SET_DEST (curr_set) == (pc_rtx)
19260 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
19261 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
19262 && REG_P (SET_DEST (prev_set))
19263 && REGNO (SET_DEST (prev_set))
19264 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
19265 {
19266 /* Fuse ALU operations followed by a conditional branch instruction. */
19267 switch (get_attr_type (prev))
19268 {
19269 case TYPE_ALU_IMM:
19270 case TYPE_ALU_SREG:
19271 case TYPE_ADC_REG:
19272 case TYPE_ADC_IMM:
19273 case TYPE_ADCS_REG:
19274 case TYPE_ADCS_IMM:
19275 case TYPE_LOGIC_REG:
19276 case TYPE_LOGIC_IMM:
19277 case TYPE_CSEL:
19278 case TYPE_ADR:
19279 case TYPE_MOV_IMM:
19280 case TYPE_SHIFT_REG:
19281 case TYPE_SHIFT_IMM:
19282 case TYPE_BFM:
19283 case TYPE_RBIT:
19284 case TYPE_REV:
19285 case TYPE_EXTEND:
19286 return true;
19287
19288 default:;
19289 }
19290 }
19291 }
19292
19293 return false;
19294 }
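
/* For reference, typical instruction pairs matched above include
     mov   w0, 49370
     movk  w0, 0x140, lsl 16       (AARCH64_FUSE_MOV_MOVK)
     adrp  x0, sym
     add   x0, x0, :lo12:sym       (AARCH64_FUSE_ADRP_ADD)
     adrp  x0, sym
     ldr   x1, [x0, :lo12:sym]     (AARCH64_FUSE_ADRP_LDR)
   (illustrative assembly; the matching itself is done on the RTL forms
   shown in the comments above). */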
19295
19296 /* Return true iff the instruction fusion described by OP is enabled. */
19297
19298 bool
19299 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
19300 {
19301 return (aarch64_tune_params.fusible_ops & op) != 0;
19302 }
19303
19304 /* If MEM is in the form of [base+offset], extract the two parts of the
19305    address and store them in BASE and OFFSET; otherwise return false
19306    after clearing BASE and OFFSET.  */
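/* Illustrative examples (assuming the usual RTL for AArch64 addresses):
     (mem (reg x1))                        -> BASE = (reg x1), OFFSET = 0
     (mem (plus (reg x1) (const_int 16)))  -> BASE = (reg x1), OFFSET = 16
   Any other address form, e.g. a pre/post-modify, clears both and
   returns false.  */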
19307
19308 bool
19309 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19310 {
19311 rtx addr;
19312
19313 gcc_assert (MEM_P (mem));
19314
19315 addr = XEXP (mem, 0);
19316
19317 if (REG_P (addr))
19318 {
19319 *base = addr;
19320 *offset = const0_rtx;
19321 return true;
19322 }
19323
19324 if (GET_CODE (addr) == PLUS
19325 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19326 {
19327 *base = XEXP (addr, 0);
19328 *offset = XEXP (addr, 1);
19329 return true;
19330 }
19331
19332 *base = NULL_RTX;
19333 *offset = NULL_RTX;
19334
19335 return false;
19336 }
19337
19338 /* Types for scheduling fusion. */
19339 enum sched_fusion_type
19340 {
19341 SCHED_FUSION_NONE = 0,
19342 SCHED_FUSION_LD_SIGN_EXTEND,
19343 SCHED_FUSION_LD_ZERO_EXTEND,
19344 SCHED_FUSION_LD,
19345 SCHED_FUSION_ST,
19346 SCHED_FUSION_NUM
19347 };
19348
19349 /* If INSN is a load or store whose address has the form [base+offset],
19350    extract the two parts and store them in BASE and OFFSET.  Return the
19351    scheduling fusion type of this INSN.  */
19352
19353 static enum sched_fusion_type
19354 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19355 {
19356 rtx x, dest, src;
19357 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19358
19359 gcc_assert (INSN_P (insn));
19360 x = PATTERN (insn);
19361 if (GET_CODE (x) != SET)
19362 return SCHED_FUSION_NONE;
19363
19364 src = SET_SRC (x);
19365 dest = SET_DEST (x);
19366
19367 machine_mode dest_mode = GET_MODE (dest);
19368
19369 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19370 return SCHED_FUSION_NONE;
19371
19372 if (GET_CODE (src) == SIGN_EXTEND)
19373 {
19374 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19375 src = XEXP (src, 0);
19376 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19377 return SCHED_FUSION_NONE;
19378 }
19379 else if (GET_CODE (src) == ZERO_EXTEND)
19380 {
19381 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19382 src = XEXP (src, 0);
19383 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19384 return SCHED_FUSION_NONE;
19385 }
19386
19387 if (GET_CODE (src) == MEM && REG_P (dest))
19388 extract_base_offset_in_addr (src, base, offset);
19389 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19390 {
19391 fusion = SCHED_FUSION_ST;
19392 extract_base_offset_in_addr (dest, base, offset);
19393 }
19394 else
19395 return SCHED_FUSION_NONE;
19396
19397 if (*base == NULL_RTX || *offset == NULL_RTX)
19398 fusion = SCHED_FUSION_NONE;
19399
19400 return fusion;
19401 }
19402
19403 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19404
19405    Currently we only support fusing ldr and str instructions, so FUSION_PRI
19406    and PRI are only calculated for these instructions.  For other instructions,
19407    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
19408    types of instruction fusion can be added by returning different priorities.
19409
19410 It's important that irrelevant instructions get the largest FUSION_PRI. */
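/* For instance (illustrative), the two stores
     str w1, [x2, 4]
     str w3, [x2, 8]
   get the same FUSION_PRI (same fusion type and base register) but
   different PRI values; the one with the smaller offset gets the larger
   PRI and is therefore scheduled first, keeping the pair adjacent.  */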
19411
19412 static void
19413 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19414 int *fusion_pri, int *pri)
19415 {
19416 int tmp, off_val;
19417 rtx base, offset;
19418 enum sched_fusion_type fusion;
19419
19420 gcc_assert (INSN_P (insn));
19421
19422 tmp = max_pri - 1;
19423 fusion = fusion_load_store (insn, &base, &offset);
19424 if (fusion == SCHED_FUSION_NONE)
19425 {
19426 *pri = tmp;
19427 *fusion_pri = tmp;
19428 return;
19429 }
19430
19431 /* Set FUSION_PRI according to fusion type and base register. */
19432 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19433
19434 /* Calculate PRI. */
19435 tmp /= 2;
19436
19437 /* INSN with smaller offset goes first. */
19438 off_val = (int)(INTVAL (offset));
19439 if (off_val >= 0)
19440 tmp -= (off_val & 0xfffff);
19441 else
19442 tmp += ((- off_val) & 0xfffff);
19443
19444 *pri = tmp;
19445 return;
19446 }
19447
19448 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19449 Adjust priority of sha1h instructions so they are scheduled before
19450 other SHA1 instructions. */
19451
19452 static int
19453 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19454 {
19455 rtx x = PATTERN (insn);
19456
19457 if (GET_CODE (x) == SET)
19458 {
19459 x = SET_SRC (x);
19460
19461 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19462 return priority + 10;
19463 }
19464
19465 return priority;
19466 }
19467
19468 /* Given OPERANDS of consecutive load/store, check if we can merge
19469 them into ldp/stp. LOAD is true if they are load instructions.
19470 MODE is the mode of memory operands. */
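/* For example (illustrative), the consecutive loads
     ldr x0, [x3, 8]
     ldr x1, [x3, 16]
   share the same base, have offsets that differ by the access size and use
   distinct destination registers of the same class, so this function
   returns true and they can become ldp x0, x1, [x3, 8].  */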
19471
19472 bool
19473 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19474 machine_mode mode)
19475 {
19476 HOST_WIDE_INT offval_1, offval_2, msize;
19477 enum reg_class rclass_1, rclass_2;
19478 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19479
19480 if (load)
19481 {
19482 mem_1 = operands[1];
19483 mem_2 = operands[3];
19484 reg_1 = operands[0];
19485 reg_2 = operands[2];
19486 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19487 if (REGNO (reg_1) == REGNO (reg_2))
19488 return false;
19489 }
19490 else
19491 {
19492 mem_1 = operands[0];
19493 mem_2 = operands[2];
19494 reg_1 = operands[1];
19495 reg_2 = operands[3];
19496 }
19497
19498 /* The mems cannot be volatile. */
19499 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19500 return false;
19501
19502 /* If we have SImode and slow unaligned ldp,
19503      check that the alignment is at least 8 bytes.  */
19504 if (mode == SImode
19505 && (aarch64_tune_params.extra_tuning_flags
19506 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19507 && !optimize_size
19508 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19509 return false;
19510
19511 /* Check if the addresses are in the form of [base+offset]. */
19512 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19513 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19514 return false;
19515 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19516 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19517 return false;
19518
19519   /* Check if the bases are the same.  */
19520 if (!rtx_equal_p (base_1, base_2))
19521 return false;
19522
19523 /* The operands must be of the same size. */
19524 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19525 GET_MODE_SIZE (GET_MODE (mem_2))));
19526
19527 offval_1 = INTVAL (offset_1);
19528 offval_2 = INTVAL (offset_2);
19529 /* We should only be trying this for fixed-sized modes. There is no
19530 SVE LDP/STP instruction. */
19531 msize = GET_MODE_SIZE (mode).to_constant ();
19532 /* Check if the offsets are consecutive. */
19533 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19534 return false;
19535
19536 /* Check if the addresses are clobbered by load. */
19537 if (load)
19538 {
19539 if (reg_mentioned_p (reg_1, mem_1))
19540 return false;
19541
19542 /* In increasing order, the last load can clobber the address. */
19543 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19544 return false;
19545 }
19546
19547 /* One of the memory accesses must be a mempair operand.
19548 If it is not the first one, they need to be swapped by the
19549 peephole. */
19550 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19551 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19552 return false;
19553
19554 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19555 rclass_1 = FP_REGS;
19556 else
19557 rclass_1 = GENERAL_REGS;
19558
19559 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19560 rclass_2 = FP_REGS;
19561 else
19562 rclass_2 = GENERAL_REGS;
19563
19564   /* Check if the registers are of the same class.  */
19565 if (rclass_1 != rclass_2)
19566 return false;
19567
19568 return true;
19569 }
19570
19571 /* Given OPERANDS of consecutive load/store that can be merged,
19572 swap them if they are not in ascending order. */
19573 void
19574 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19575 {
19576 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19577 HOST_WIDE_INT offval_1, offval_2;
19578
19579 if (load)
19580 {
19581 mem_1 = operands[1];
19582 mem_2 = operands[3];
19583 }
19584 else
19585 {
19586 mem_1 = operands[0];
19587 mem_2 = operands[2];
19588 }
19589
19590 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19591 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19592
19593 offval_1 = INTVAL (offset_1);
19594 offval_2 = INTVAL (offset_2);
19595
19596 if (offval_1 > offval_2)
19597 {
19598 /* Irrespective of whether this is a load or a store,
19599 we do the same swap. */
19600 std::swap (operands[0], operands[2]);
19601 std::swap (operands[1], operands[3]);
19602 }
19603 }
19604
19605 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19606 comparison between the two. */
19607 int
19608 aarch64_host_wide_int_compare (const void *x, const void *y)
19609 {
19610 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19611 * ((const HOST_WIDE_INT *) y));
19612 }
19613
19614 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19615 other pointing to a REG rtx containing an offset, compare the offsets
19616 of the two pairs.
19617
19618 Return:
19619
19620 1 iff offset (X) > offset (Y)
19621 0 iff offset (X) == offset (Y)
19622 -1 iff offset (X) < offset (Y) */
19623 int
19624 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19625 {
19626 const rtx * operands_1 = (const rtx *) x;
19627 const rtx * operands_2 = (const rtx *) y;
19628 rtx mem_1, mem_2, base, offset_1, offset_2;
19629
19630 if (MEM_P (operands_1[0]))
19631 mem_1 = operands_1[0];
19632 else
19633 mem_1 = operands_1[1];
19634
19635 if (MEM_P (operands_2[0]))
19636 mem_2 = operands_2[0];
19637 else
19638 mem_2 = operands_2[1];
19639
19640 /* Extract the offsets. */
19641 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19642 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19643
19644 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19645
19646 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19647 }
19648
19649 /* Given OPERANDS of consecutive load/store, check if we can merge
19650 them into ldp/stp by adjusting the offset. LOAD is true if they
19651 are load instructions. MODE is the mode of memory operands.
19652
19653 Given below consecutive stores:
19654
19655 str w1, [xb, 0x100]
19656 str w1, [xb, 0x104]
19657 str w1, [xb, 0x108]
19658 str w1, [xb, 0x10c]
19659
19660 Though the offsets are out of the range supported by stp, we can
19661 still pair them after adjusting the offset, like:
19662
19663 add scratch, xb, 0x100
19664 stp w1, w1, [scratch]
19665 stp w1, w1, [scratch, 0x8]
19666
19667 The peephole patterns detecting this opportunity should guarantee
19668    the scratch register is available.  */
19669
19670 bool
19671 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19672 scalar_mode mode)
19673 {
19674 const int num_insns = 4;
19675 enum reg_class rclass;
19676 HOST_WIDE_INT offvals[num_insns], msize;
19677 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19678
19679 if (load)
19680 {
19681 for (int i = 0; i < num_insns; i++)
19682 {
19683 reg[i] = operands[2 * i];
19684 mem[i] = operands[2 * i + 1];
19685
19686 gcc_assert (REG_P (reg[i]));
19687 }
19688
19689 /* Do not attempt to merge the loads if the loads clobber each other. */
19690 for (int i = 0; i < 8; i += 2)
19691 for (int j = i + 2; j < 8; j += 2)
19692 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19693 return false;
19694 }
19695 else
19696 for (int i = 0; i < num_insns; i++)
19697 {
19698 mem[i] = operands[2 * i];
19699 reg[i] = operands[2 * i + 1];
19700 }
19701
19702 /* Skip if memory operand is by itself valid for ldp/stp. */
19703 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19704 return false;
19705
19706 for (int i = 0; i < num_insns; i++)
19707 {
19708 /* The mems cannot be volatile. */
19709 if (MEM_VOLATILE_P (mem[i]))
19710 return false;
19711
19712 /* Check if the addresses are in the form of [base+offset]. */
19713 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19714 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19715 return false;
19716 }
19717
19718   /* Check if the registers are of the same class.  */
19719 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19720 ? FP_REGS : GENERAL_REGS;
19721
19722 for (int i = 1; i < num_insns; i++)
19723 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19724 {
19725 if (rclass != FP_REGS)
19726 return false;
19727 }
19728 else
19729 {
19730 if (rclass != GENERAL_REGS)
19731 return false;
19732 }
19733
19734 /* Only the last register in the order in which they occur
19735 may be clobbered by the load. */
19736 if (rclass == GENERAL_REGS && load)
19737 for (int i = 0; i < num_insns - 1; i++)
19738 if (reg_mentioned_p (reg[i], mem[i]))
19739 return false;
19740
19741   /* Check if the bases are the same.  */
19742 for (int i = 0; i < num_insns - 1; i++)
19743 if (!rtx_equal_p (base[i], base[i + 1]))
19744 return false;
19745
19746 for (int i = 0; i < num_insns; i++)
19747 offvals[i] = INTVAL (offset[i]);
19748
19749 msize = GET_MODE_SIZE (mode);
19750
19751 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19752 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19753 aarch64_host_wide_int_compare);
19754
19755 if (!(offvals[1] == offvals[0] + msize
19756 && offvals[3] == offvals[2] + msize))
19757 return false;
19758
19759 /* Check that offsets are within range of each other. The ldp/stp
19760      instructions have 7-bit immediate offsets, so use 0x80.  */
19761 if (offvals[2] - offvals[0] >= msize * 0x80)
19762 return false;
19763
19764 /* The offsets must be aligned with respect to each other. */
19765 if (offvals[0] % msize != offvals[2] % msize)
19766 return false;
19767
19768 /* If we have SImode and slow unaligned ldp,
19769      check that the alignment is at least 8 bytes.  */
19770 if (mode == SImode
19771 && (aarch64_tune_params.extra_tuning_flags
19772 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19773 && !optimize_size
19774 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19775 return false;
19776
19777 return true;
19778 }
19779
19780 /* Given OPERANDS of consecutive load/store, this function pairs them
19781 into LDP/STP after adjusting the offset. It depends on the fact
19782 that the operands can be sorted so the offsets are correct for STP.
19783 MODE is the mode of memory operands. CODE is the rtl operator
19784 which should be applied to all memory operands, it's SIGN_EXTEND,
19785 ZERO_EXTEND or UNKNOWN. */
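/* Worked example (illustrative), continuing the str example shown above
   aarch64_operands_adjust_ok_for_ldpstp, with SImode stores at offsets
   0x100, 0x104, 0x108 and 0x10c from base xb: the midpoint of the two
   pair offsets is 0x104, which is then bumped by the element size to
   0x108, so the code emits
     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]
   with both new pair offsets (-8 and 0) comfortably inside the LDP/STP
   range.  */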
19786
19787 bool
19788 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19789 scalar_mode mode, RTX_CODE code)
19790 {
19791 rtx base, offset_1, offset_3, t1, t2;
19792 rtx mem_1, mem_2, mem_3, mem_4;
19793 rtx temp_operands[8];
19794 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19795 stp_off_upper_limit, stp_off_lower_limit, msize;
19796
19797 /* We make changes on a copy as we may still bail out. */
19798 for (int i = 0; i < 8; i ++)
19799 temp_operands[i] = operands[i];
19800
19801 /* Sort the operands. */
19802 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19803
19804 /* Copy the memory operands so that if we have to bail for some
19805 reason the original addresses are unchanged. */
19806 if (load)
19807 {
19808 mem_1 = copy_rtx (temp_operands[1]);
19809 mem_2 = copy_rtx (temp_operands[3]);
19810 mem_3 = copy_rtx (temp_operands[5]);
19811 mem_4 = copy_rtx (temp_operands[7]);
19812 }
19813 else
19814 {
19815 mem_1 = copy_rtx (temp_operands[0]);
19816 mem_2 = copy_rtx (temp_operands[2]);
19817 mem_3 = copy_rtx (temp_operands[4]);
19818 mem_4 = copy_rtx (temp_operands[6]);
19819 gcc_assert (code == UNKNOWN);
19820 }
19821
19822 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19823 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19824 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19825 && offset_3 != NULL_RTX);
19826
19827 /* Adjust offset so it can fit in LDP/STP instruction. */
19828 msize = GET_MODE_SIZE (mode);
19829 stp_off_upper_limit = msize * (0x40 - 1);
19830 stp_off_lower_limit = - msize * 0x40;
19831
19832 off_val_1 = INTVAL (offset_1);
19833 off_val_3 = INTVAL (offset_3);
19834
19835 /* The base offset is optimally half way between the two STP/LDP offsets. */
19836 if (msize <= 4)
19837 base_off = (off_val_1 + off_val_3) / 2;
19838 else
19839 /* However, due to issues with negative LDP/STP offset generation for
19840        larger modes such as DF, DI and vector modes, we must not use negative
19841 addresses smaller than 9 signed unadjusted bits can store. This
19842 provides the most range in this case. */
19843 base_off = off_val_1;
19844
19845 /* Adjust the base so that it is aligned with the addresses but still
19846 optimal. */
19847 if (base_off % msize != off_val_1 % msize)
19848 /* Fix the offset, bearing in mind we want to make it bigger not
19849 smaller. */
19850 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19851 else if (msize <= 4)
19852 /* The negative range of LDP/STP is one larger than the positive range. */
19853 base_off += msize;
19854
19855 /* Check if base offset is too big or too small. We can attempt to resolve
19856 this issue by setting it to the maximum value and seeing if the offsets
19857 still fit. */
19858 if (base_off >= 0x1000)
19859 {
19860 base_off = 0x1000 - 1;
19861 /* We must still make sure that the base offset is aligned with respect
19862 	 to the address.  But it may not be made any bigger.  */
19863 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19864 }
19865
19866 /* Likewise for the case where the base is too small. */
19867 if (base_off <= -0x1000)
19868 {
19869 base_off = -0x1000 + 1;
19870 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19871 }
19872
19873 /* Offset of the first STP/LDP. */
19874 new_off_1 = off_val_1 - base_off;
19875
19876 /* Offset of the second STP/LDP. */
19877 new_off_3 = off_val_3 - base_off;
19878
19879 /* The offsets must be within the range of the LDP/STP instructions. */
19880 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19881 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19882 return false;
19883
19884 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19885 new_off_1), true);
19886 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19887 new_off_1 + msize), true);
19888 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19889 new_off_3), true);
19890 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19891 new_off_3 + msize), true);
19892
19893 if (!aarch64_mem_pair_operand (mem_1, mode)
19894 || !aarch64_mem_pair_operand (mem_3, mode))
19895 return false;
19896
19897 if (code == ZERO_EXTEND)
19898 {
19899 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19900 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19901 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19902 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19903 }
19904 else if (code == SIGN_EXTEND)
19905 {
19906 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19907 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19908 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19909 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19910 }
19911
19912 if (load)
19913 {
19914 operands[0] = temp_operands[0];
19915 operands[1] = mem_1;
19916 operands[2] = temp_operands[2];
19917 operands[3] = mem_2;
19918 operands[4] = temp_operands[4];
19919 operands[5] = mem_3;
19920 operands[6] = temp_operands[6];
19921 operands[7] = mem_4;
19922 }
19923 else
19924 {
19925 operands[0] = mem_1;
19926 operands[1] = temp_operands[1];
19927 operands[2] = mem_2;
19928 operands[3] = temp_operands[3];
19929 operands[4] = mem_3;
19930 operands[5] = temp_operands[5];
19931 operands[6] = mem_4;
19932 operands[7] = temp_operands[7];
19933 }
19934
19935 /* Emit adjusting instruction. */
19936 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19937 /* Emit ldp/stp instructions. */
19938 t1 = gen_rtx_SET (operands[0], operands[1]);
19939 t2 = gen_rtx_SET (operands[2], operands[3]);
19940 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19941 t1 = gen_rtx_SET (operands[4], operands[5]);
19942 t2 = gen_rtx_SET (operands[6], operands[7]);
19943 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19944 return true;
19945 }
19946
19947 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19948 it isn't worth branching around empty masked ops (including masked
19949 stores). */
19950
19951 static bool
19952 aarch64_empty_mask_is_expensive (unsigned)
19953 {
19954 return false;
19955 }
19956
19957 /* Return 1 if pseudo register should be created and used to hold
19958 GOT address for PIC code. */
19959
19960 bool
19961 aarch64_use_pseudo_pic_reg (void)
19962 {
19963 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19964 }
19965
19966 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19967
19968 static int
19969 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19970 {
19971 switch (XINT (x, 1))
19972 {
19973 case UNSPEC_GOTSMALLPIC:
19974 case UNSPEC_GOTSMALLPIC28K:
19975 case UNSPEC_GOTTINYPIC:
19976 return 0;
19977 default:
19978 break;
19979 }
19980
19981 return default_unspec_may_trap_p (x, flags);
19982 }
19983
19984
19985 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
19986 return the log2 of that value. Otherwise return -1. */
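/* Illustrative values: 4.0 -> 2, 1.0 -> 0, while 0.5 (not an integer)
   and -8.0 (negative) both return -1.  */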
19987
19988 int
19989 aarch64_fpconst_pow_of_2 (rtx x)
19990 {
19991 const REAL_VALUE_TYPE *r;
19992
19993 if (!CONST_DOUBLE_P (x))
19994 return -1;
19995
19996 r = CONST_DOUBLE_REAL_VALUE (x);
19997
19998 if (REAL_VALUE_NEGATIVE (*r)
19999 || REAL_VALUE_ISNAN (*r)
20000 || REAL_VALUE_ISINF (*r)
20001 || !real_isinteger (r, DFmode))
20002 return -1;
20003
20004 return exact_log2 (real_to_integer (r));
20005 }
20006
20007 /* If X is a positive CONST_DOUBLE whose value is the reciprocal of a
20008    power of 2 (i.e. 1/2^n), return the exponent n; e.g. for x == 1/2^n
20009    return n.  Otherwise return -1.  */
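/* Illustrative values: 0.25 -> 2 and 0.125 -> 3, whereas 1.0 returns -1
   because the computed exponent (0) is outside the accepted 1..32 range.  */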
20010
20011 int
20012 aarch64_fpconst_pow2_recip (rtx x)
20013 {
20014 REAL_VALUE_TYPE r0;
20015
20016 if (!CONST_DOUBLE_P (x))
20017 return -1;
20018
20019 r0 = *CONST_DOUBLE_REAL_VALUE (x);
20020 if (exact_real_inverse (DFmode, &r0)
20021 && !REAL_VALUE_NEGATIVE (r0))
20022 {
20023 int ret = exact_log2 (real_to_integer (&r0));
20024 if (ret >= 1 && ret <= 32)
20025 return ret;
20026 }
20027 return -1;
20028 }
20029
20030 /* If X is a vector of equal CONST_DOUBLE values and that value is
20031 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
20032
20033 int
20034 aarch64_vec_fpconst_pow_of_2 (rtx x)
20035 {
20036 int nelts;
20037 if (GET_CODE (x) != CONST_VECTOR
20038 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
20039 return -1;
20040
20041 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
20042 return -1;
20043
20044 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
20045 if (firstval <= 0)
20046 return -1;
20047
20048 for (int i = 1; i < nelts; i++)
20049 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
20050 return -1;
20051
20052 return firstval;
20053 }
20054
20055 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
20056 to float.
20057
20058 __fp16 always promotes through this hook.
20059 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
20060 through the generic excess precision logic rather than here. */
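/* For example, given __fp16 a, b, the sum a + b is evaluated in float
   because both operands promote through this hook; the result is only
   converted back to __fp16 if it is stored in one.  */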
20061
20062 static tree
20063 aarch64_promoted_type (const_tree t)
20064 {
20065 if (SCALAR_FLOAT_TYPE_P (t)
20066 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
20067 return float_type_node;
20068
20069 return NULL_TREE;
20070 }
20071
20072 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
20073
20074 static bool
20075 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
20076 optimization_type opt_type)
20077 {
20078 switch (op)
20079 {
20080 case rsqrt_optab:
20081 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
20082
20083 default:
20084 return true;
20085 }
20086 }
20087
20088 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
20089
20090 static unsigned int
20091 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
20092 int *offset)
20093 {
20094 /* Polynomial invariant 1 == (VG / 2) - 1. */
20095 gcc_assert (i == 1);
20096 *factor = 2;
20097 *offset = 1;
20098 return AARCH64_DWARF_VG;
20099 }
20100
20101 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
20102 if MODE is HFmode, and punt to the generic implementation otherwise. */
20103
20104 static bool
20105 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
20106 {
20107 return (mode == HFmode
20108 ? true
20109 : default_libgcc_floating_mode_supported_p (mode));
20110 }
20111
20112 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
20113 if MODE is HFmode, and punt to the generic implementation otherwise. */
20114
20115 static bool
20116 aarch64_scalar_mode_supported_p (scalar_mode mode)
20117 {
20118 return (mode == HFmode
20119 ? true
20120 : default_scalar_mode_supported_p (mode));
20121 }
20122
20123 /* Set the value of FLT_EVAL_METHOD.
20124 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
20125
20126 0: evaluate all operations and constants, whose semantic type has at
20127 most the range and precision of type float, to the range and
20128 precision of float; evaluate all other operations and constants to
20129 the range and precision of the semantic type;
20130
20131 N, where _FloatN is a supported interchange floating type
20132 evaluate all operations and constants, whose semantic type has at
20133 most the range and precision of _FloatN type, to the range and
20134 precision of the _FloatN type; evaluate all other operations and
20135 constants to the range and precision of the semantic type;
20136
20137 If we have the ARMv8.2-A extensions then we support _Float16 in native
20138 precision, so we should set this to 16. Otherwise, we support the type,
20139 but want to evaluate expressions in float precision, so set this to
20140 0. */
20141
20142 static enum flt_eval_method
20143 aarch64_excess_precision (enum excess_precision_type type)
20144 {
20145 switch (type)
20146 {
20147 case EXCESS_PRECISION_TYPE_FAST:
20148 case EXCESS_PRECISION_TYPE_STANDARD:
20149 /* We can calculate either in 16-bit range and precision or
20150 32-bit range and precision. Make that decision based on whether
20151 we have native support for the ARMv8.2-A 16-bit floating-point
20152 instructions or not. */
20153 return (TARGET_FP_F16INST
20154 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
20155 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
20156 case EXCESS_PRECISION_TYPE_IMPLICIT:
20157 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
20158 default:
20159 gcc_unreachable ();
20160 }
20161 return FLT_EVAL_METHOD_UNPREDICTABLE;
20162 }
20163
20164 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
20165 scheduled for speculative execution. Reject the long-running division
20166 and square-root instructions. */
20167
20168 static bool
20169 aarch64_sched_can_speculate_insn (rtx_insn *insn)
20170 {
20171 switch (get_attr_type (insn))
20172 {
20173 case TYPE_SDIV:
20174 case TYPE_UDIV:
20175 case TYPE_FDIVS:
20176 case TYPE_FDIVD:
20177 case TYPE_FSQRTS:
20178 case TYPE_FSQRTD:
20179 case TYPE_NEON_FP_SQRT_S:
20180 case TYPE_NEON_FP_SQRT_D:
20181 case TYPE_NEON_FP_SQRT_S_Q:
20182 case TYPE_NEON_FP_SQRT_D_Q:
20183 case TYPE_NEON_FP_DIV_S:
20184 case TYPE_NEON_FP_DIV_D:
20185 case TYPE_NEON_FP_DIV_S_Q:
20186 case TYPE_NEON_FP_DIV_D_Q:
20187 return false;
20188 default:
20189 return true;
20190 }
20191 }
20192
20193 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
20194
20195 static int
20196 aarch64_compute_pressure_classes (reg_class *classes)
20197 {
20198 int i = 0;
20199 classes[i++] = GENERAL_REGS;
20200 classes[i++] = FP_REGS;
20201 /* PR_REGS isn't a useful pressure class because many predicate pseudo
20202 registers need to go in PR_LO_REGS at some point during their
20203 lifetime. Splitting it into two halves has the effect of making
20204 all predicates count against PR_LO_REGS, so that we try whenever
20205 possible to restrict the number of live predicates to 8. This
20206 greatly reduces the amount of spilling in certain loops. */
20207 classes[i++] = PR_LO_REGS;
20208 classes[i++] = PR_HI_REGS;
20209 return i;
20210 }
20211
20212 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
20213
20214 static bool
20215 aarch64_can_change_mode_class (machine_mode from,
20216 machine_mode to, reg_class_t)
20217 {
20218 if (BYTES_BIG_ENDIAN)
20219 {
20220 bool from_sve_p = aarch64_sve_data_mode_p (from);
20221 bool to_sve_p = aarch64_sve_data_mode_p (to);
20222
20223 /* Don't allow changes between SVE data modes and non-SVE modes.
20224 See the comment at the head of aarch64-sve.md for details. */
20225 if (from_sve_p != to_sve_p)
20226 return false;
20227
20228 /* Don't allow changes in element size: lane 0 of the new vector
20229 would not then be lane 0 of the old vector. See the comment
20230 above aarch64_maybe_expand_sve_subreg_move for a more detailed
20231 description.
20232
20233 In the worst case, this forces a register to be spilled in
20234 one mode and reloaded in the other, which handles the
20235 endianness correctly. */
20236 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
20237 return false;
20238 }
20239 return true;
20240 }
20241
20242 /* Implement TARGET_EARLY_REMAT_MODES. */
20243
20244 static void
20245 aarch64_select_early_remat_modes (sbitmap modes)
20246 {
20247 /* SVE values are not normally live across a call, so it should be
20248 worth doing early rematerialization even in VL-specific mode. */
20249 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
20250 if (aarch64_sve_mode_p ((machine_mode) i))
20251 bitmap_set_bit (modes, i);
20252 }
20253
20254 /* Override the default target speculation_safe_value. */
20255 static rtx
20256 aarch64_speculation_safe_value (machine_mode mode,
20257 rtx result, rtx val, rtx failval)
20258 {
20259 /* Maybe we should warn if falling back to hard barriers. They are
20260      likely to be noticeably more expensive than the alternative below.  */
20261 if (!aarch64_track_speculation)
20262 return default_speculation_safe_value (mode, result, val, failval);
20263
20264 if (!REG_P (val))
20265 val = copy_to_mode_reg (mode, val);
20266
20267 if (!aarch64_reg_or_zero (failval, mode))
20268 failval = copy_to_mode_reg (mode, failval);
20269
20270 emit_insn (gen_despeculate_copy (mode, result, val, failval));
20271 return result;
20272 }
20273
20274 /* Implement TARGET_ESTIMATED_POLY_VALUE.
20275 Look into the tuning structure for an estimate.
20276 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
20277 Advanced SIMD 128 bits. */
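/* For instance (illustrative), with a tuned sve_width of SVE_256 a
   poly_int64 of 2 + 2x (coeffs {2, 2}) is estimated as
   2 + 2 * (256 - 128) / 128 = 4.  */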
20278
20279 static HOST_WIDE_INT
20280 aarch64_estimated_poly_value (poly_int64 val)
20281 {
20282 enum aarch64_sve_vector_bits_enum width_source
20283 = aarch64_tune_params.sve_width;
20284
20285 /* If we still don't have an estimate, use the default. */
20286 if (width_source == SVE_SCALABLE)
20287 return default_estimated_poly_value (val);
20288
20289 HOST_WIDE_INT over_128 = width_source - 128;
20290 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
20291 }
20292
20293
20294 /* Return true for types that could be supported as SIMD return or
20295 argument types. */
20296
20297 static bool
20298 supported_simd_type (tree t)
20299 {
20300 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
20301 {
20302 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
20303 return s == 1 || s == 2 || s == 4 || s == 8;
20304 }
20305 return false;
20306 }
20307
20308 /* Return true for types that currently are supported as SIMD return
20309 or argument types. */
20310
20311 static bool
20312 currently_supported_simd_type (tree t, tree b)
20313 {
20314 if (COMPLEX_FLOAT_TYPE_P (t))
20315 return false;
20316
20317 if (TYPE_SIZE (t) != TYPE_SIZE (b))
20318 return false;
20319
20320 return supported_simd_type (t);
20321 }
20322
20323 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
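/* For example (illustrative), a "declare simd" clone with a float base
   type and no explicit simdlen yields two variants: a 64-bit one with
   simdlen 2 and a 128-bit one with simdlen 4.  An explicit simdlen of 4
   instead selects just the single 128-bit variant.  */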
20324
20325 static int
20326 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20327 struct cgraph_simd_clone *clonei,
20328 tree base_type, int num)
20329 {
20330 tree t, ret_type, arg_type;
20331 unsigned int elt_bits, vec_bits, count;
20332
20333 if (!TARGET_SIMD)
20334 return 0;
20335
20336 if (clonei->simdlen
20337 && (clonei->simdlen < 2
20338 || clonei->simdlen > 1024
20339 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20340 {
20341 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20342 "unsupported simdlen %d", clonei->simdlen);
20343 return 0;
20344 }
20345
20346 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20347 if (TREE_CODE (ret_type) != VOID_TYPE
20348 && !currently_supported_simd_type (ret_type, base_type))
20349 {
20350 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20351 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20352 "GCC does not currently support mixed size types "
20353 "for %<simd%> functions");
20354 else if (supported_simd_type (ret_type))
20355 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20356 "GCC does not currently support return type %qT "
20357 "for %<simd%> functions", ret_type);
20358 else
20359 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20360 "unsupported return type %qT for %<simd%> functions",
20361 ret_type);
20362 return 0;
20363 }
20364
20365 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20366 {
20367 arg_type = TREE_TYPE (t);
20368
20369 if (!currently_supported_simd_type (arg_type, base_type))
20370 {
20371 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20372 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20373 "GCC does not currently support mixed size types "
20374 "for %<simd%> functions");
20375 else
20376 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20377 "GCC does not currently support argument type %qT "
20378 "for %<simd%> functions", arg_type);
20379 return 0;
20380 }
20381 }
20382
20383 clonei->vecsize_mangle = 'n';
20384 clonei->mask_mode = VOIDmode;
20385 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20386 if (clonei->simdlen == 0)
20387 {
20388 count = 2;
20389 vec_bits = (num == 0 ? 64 : 128);
20390 clonei->simdlen = vec_bits / elt_bits;
20391 }
20392 else
20393 {
20394 count = 1;
20395 vec_bits = clonei->simdlen * elt_bits;
20396 if (vec_bits != 64 && vec_bits != 128)
20397 {
20398 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20399 "GCC does not currently support simdlen %d for type %qT",
20400 clonei->simdlen, base_type);
20401 return 0;
20402 }
20403 }
20404 clonei->vecsize_int = vec_bits;
20405 clonei->vecsize_float = vec_bits;
20406 return count;
20407 }
20408
20409 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20410
20411 static void
20412 aarch64_simd_clone_adjust (struct cgraph_node *node)
20413 {
20414 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20415 use the correct ABI. */
20416
20417 tree t = TREE_TYPE (node->decl);
20418 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20419 TYPE_ATTRIBUTES (t));
20420 }
20421
20422 /* Implement TARGET_SIMD_CLONE_USABLE. */
20423
20424 static int
20425 aarch64_simd_clone_usable (struct cgraph_node *node)
20426 {
20427 switch (node->simdclone->vecsize_mangle)
20428 {
20429 case 'n':
20430 if (!TARGET_SIMD)
20431 return -1;
20432 return 0;
20433 default:
20434 gcc_unreachable ();
20435 }
20436 }
20437
20438 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
20439
20440 static int
20441 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20442 {
20443 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20444 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20445 return 0;
20446 return 1;
20447 }
20448
20449 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
20450
20451 static const char *
20452 aarch64_get_multilib_abi_name (void)
20453 {
20454 if (TARGET_BIG_END)
20455 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20456 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20457 }
20458
20459 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
20460    global-variable-based guard, use the default; otherwise
20461    return a null tree.  */
20462 static tree
20463 aarch64_stack_protect_guard (void)
20464 {
20465 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20466 return default_stack_protect_guard ();
20467
20468 return NULL_TREE;
20469 }
20470
20471 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20472 section at the end if needed. */
20473 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20474 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20475 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20476 void
20477 aarch64_file_end_indicate_exec_stack ()
20478 {
20479 file_end_indicate_exec_stack ();
20480
20481 unsigned feature_1_and = 0;
20482 if (aarch64_bti_enabled ())
20483 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20484
20485 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20486 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20487
20488 if (feature_1_and)
20489 {
20490 /* Generate .note.gnu.property section. */
20491 switch_to_section (get_section (".note.gnu.property",
20492 SECTION_NOTYPE, NULL));
20493
20494 /* PT_NOTE header: namesz, descsz, type.
20495 namesz = 4 ("GNU\0")
20496 descsz = 16 (Size of the program property array)
20497 [(12 + padding) * Number of array elements]
20498 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20499 assemble_align (POINTER_SIZE);
20500 assemble_integer (GEN_INT (4), 4, 32, 1);
20501 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20502 assemble_integer (GEN_INT (5), 4, 32, 1);
20503
20504 /* PT_NOTE name. */
20505 assemble_string ("GNU", 4);
20506
20507 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20508 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20509 datasz = 4
20510 data = feature_1_and. */
20511 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20512 assemble_integer (GEN_INT (4), 4, 32, 1);
20513 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20514
20515 /* Pad the size of the note to the required alignment. */
20516 assemble_align (POINTER_SIZE);
20517 }
20518 }
20519 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20520 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20521 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20522
20523 /* Target-specific selftests. */
20524
20525 #if CHECKING_P
20526
20527 namespace selftest {
20528
20529 /* Selftest for the RTL loader.
20530 Verify that the RTL loader copes with a dump from
20531 print_rtx_function. This is essentially just a test that class
20532 function_reader can handle a real dump, but it also verifies
20533 that lookup_reg_by_dump_name correctly handles hard regs.
20534 The presence of hard reg names in the dump means that the test is
20535 target-specific, hence it is in this file. */
20536
20537 static void
20538 aarch64_test_loading_full_dump ()
20539 {
20540 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20541
20542 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20543
20544 rtx_insn *insn_1 = get_insn_by_uid (1);
20545 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20546
20547 rtx_insn *insn_15 = get_insn_by_uid (15);
20548 ASSERT_EQ (INSN, GET_CODE (insn_15));
20549 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20550
20551 /* Verify crtl->return_rtx. */
20552 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20553 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20554 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20555 }
20556
20557 /* Run all target-specific selftests. */
20558
20559 static void
20560 aarch64_run_selftests (void)
20561 {
20562 aarch64_test_loading_full_dump ();
20563 }
20564
20565 } // namespace selftest
20566
20567 #endif /* #if CHECKING_P */
20568
20569 #undef TARGET_STACK_PROTECT_GUARD
20570 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20571
20572 #undef TARGET_ADDRESS_COST
20573 #define TARGET_ADDRESS_COST aarch64_address_cost
20574
20575 /* This hook determines whether unnamed bitfields affect the alignment
20576 of the containing structure. The hook returns true if the structure
20577 should inherit the alignment requirements of an unnamed bitfield's
20578 type. */
20579 #undef TARGET_ALIGN_ANON_BITFIELD
20580 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20581
20582 #undef TARGET_ASM_ALIGNED_DI_OP
20583 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20584
20585 #undef TARGET_ASM_ALIGNED_HI_OP
20586 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20587
20588 #undef TARGET_ASM_ALIGNED_SI_OP
20589 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20590
20591 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20592 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20593 hook_bool_const_tree_hwi_hwi_const_tree_true
20594
20595 #undef TARGET_ASM_FILE_START
20596 #define TARGET_ASM_FILE_START aarch64_start_file
20597
20598 #undef TARGET_ASM_OUTPUT_MI_THUNK
20599 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20600
20601 #undef TARGET_ASM_SELECT_RTX_SECTION
20602 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20603
20604 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20605 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20606
20607 #undef TARGET_BUILD_BUILTIN_VA_LIST
20608 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20609
20610 #undef TARGET_CALLEE_COPIES
20611 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
20612
20613 #undef TARGET_CAN_ELIMINATE
20614 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20615
20616 #undef TARGET_CAN_INLINE_P
20617 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20618
20619 #undef TARGET_CANNOT_FORCE_CONST_MEM
20620 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20621
20622 #undef TARGET_CASE_VALUES_THRESHOLD
20623 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20624
20625 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20626 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20627
20628 /* Only the least significant bit is used for initialization guard
20629 variables. */
20630 #undef TARGET_CXX_GUARD_MASK_BIT
20631 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20632
20633 #undef TARGET_C_MODE_FOR_SUFFIX
20634 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20635
20636 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20637 #undef TARGET_DEFAULT_TARGET_FLAGS
20638 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20639 #endif
20640
20641 #undef TARGET_CLASS_MAX_NREGS
20642 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20643
20644 #undef TARGET_BUILTIN_DECL
20645 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20646
20647 #undef TARGET_BUILTIN_RECIPROCAL
20648 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20649
20650 #undef TARGET_C_EXCESS_PRECISION
20651 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20652
20653 #undef TARGET_EXPAND_BUILTIN
20654 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20655
20656 #undef TARGET_EXPAND_BUILTIN_VA_START
20657 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20658
20659 #undef TARGET_FOLD_BUILTIN
20660 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20661
20662 #undef TARGET_FUNCTION_ARG
20663 #define TARGET_FUNCTION_ARG aarch64_function_arg
20664
20665 #undef TARGET_FUNCTION_ARG_ADVANCE
20666 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20667
20668 #undef TARGET_FUNCTION_ARG_BOUNDARY
20669 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20670
20671 #undef TARGET_FUNCTION_ARG_PADDING
20672 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20673
20674 #undef TARGET_GET_RAW_RESULT_MODE
20675 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20676 #undef TARGET_GET_RAW_ARG_MODE
20677 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20678
20679 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20680 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20681
20682 #undef TARGET_FUNCTION_VALUE
20683 #define TARGET_FUNCTION_VALUE aarch64_function_value
20684
20685 #undef TARGET_FUNCTION_VALUE_REGNO_P
20686 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20687
20688 #undef TARGET_GIMPLE_FOLD_BUILTIN
20689 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20690
20691 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20692 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20693
20694 #undef TARGET_INIT_BUILTINS
20695 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20696
20697 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20698 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20699 aarch64_ira_change_pseudo_allocno_class
20700
20701 #undef TARGET_LEGITIMATE_ADDRESS_P
20702 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20703
20704 #undef TARGET_LEGITIMATE_CONSTANT_P
20705 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20706
20707 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20708 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20709 aarch64_legitimize_address_displacement
20710
20711 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20712 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20713
20714 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20715 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20716 aarch64_libgcc_floating_mode_supported_p
20717
20718 #undef TARGET_MANGLE_TYPE
20719 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20720
20721 #undef TARGET_MEMORY_MOVE_COST
20722 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20723
20724 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20725 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20726
20727 #undef TARGET_MUST_PASS_IN_STACK
20728 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20729
20730 /* This target hook should return true if accesses to volatile bitfields
20731 should use the narrowest mode possible. It should return false if these
20732 accesses should use the bitfield container type. */
20733 #undef TARGET_NARROW_VOLATILE_BITFIELD
20734 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20735
20736 #undef TARGET_OPTION_OVERRIDE
20737 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20738
20739 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20740 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20741 aarch64_override_options_after_change
20742
20743 #undef TARGET_OPTION_SAVE
20744 #define TARGET_OPTION_SAVE aarch64_option_save
20745
20746 #undef TARGET_OPTION_RESTORE
20747 #define TARGET_OPTION_RESTORE aarch64_option_restore
20748
20749 #undef TARGET_OPTION_PRINT
20750 #define TARGET_OPTION_PRINT aarch64_option_print
20751
20752 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20753 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20754
20755 #undef TARGET_SET_CURRENT_FUNCTION
20756 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20757
20758 #undef TARGET_PASS_BY_REFERENCE
20759 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20760
20761 #undef TARGET_PREFERRED_RELOAD_CLASS
20762 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20763
20764 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20765 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20766
20767 #undef TARGET_PROMOTED_TYPE
20768 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20769
20770 #undef TARGET_SECONDARY_RELOAD
20771 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20772
20773 #undef TARGET_SHIFT_TRUNCATION_MASK
20774 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20775
20776 #undef TARGET_SETUP_INCOMING_VARARGS
20777 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20778
20779 #undef TARGET_STRUCT_VALUE_RTX
20780 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20781
20782 #undef TARGET_REGISTER_MOVE_COST
20783 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20784
20785 #undef TARGET_RETURN_IN_MEMORY
20786 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20787
20788 #undef TARGET_RETURN_IN_MSB
20789 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20790
20791 #undef TARGET_RTX_COSTS
20792 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20793
20794 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20795 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20796
20797 #undef TARGET_SCHED_ISSUE_RATE
20798 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20799
20800 #undef TARGET_SCHED_VARIABLE_ISSUE
20801 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
20802
20803 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20804 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20805 aarch64_sched_first_cycle_multipass_dfa_lookahead
20806
20807 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20808 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20809 aarch64_first_cycle_multipass_dfa_lookahead_guard
20810
20811 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20812 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20813 aarch64_get_separate_components
20814
20815 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20816 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20817 aarch64_components_for_bb
20818
20819 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20820 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20821 aarch64_disqualify_components
20822
20823 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20824 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20825 aarch64_emit_prologue_components
20826
20827 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20828 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20829 aarch64_emit_epilogue_components
20830
20831 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20832 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20833 aarch64_set_handled_components
20834
20835 #undef TARGET_TRAMPOLINE_INIT
20836 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20837
20838 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20839 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20840
20841 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20842 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20843
20844 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20845 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20846 aarch64_builtin_support_vector_misalignment
20847
20848 #undef TARGET_ARRAY_MODE
20849 #define TARGET_ARRAY_MODE aarch64_array_mode
20850
20851 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20852 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20853
20854 #undef TARGET_VECTORIZE_ADD_STMT_COST
20855 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20856
20857 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20858 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20859 aarch64_builtin_vectorization_cost
20860
20861 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20862 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20863
20864 #undef TARGET_VECTORIZE_BUILTINS
20865 #define TARGET_VECTORIZE_BUILTINS
20866
20867 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20868 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20869 aarch64_builtin_vectorized_function
20870
20871 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20872 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20873 aarch64_autovectorize_vector_sizes
20874
20875 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20876 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20877 aarch64_atomic_assign_expand_fenv
20878
20879 /* Section anchor support. */
20880
20881 #undef TARGET_MIN_ANCHOR_OFFSET
20882 #define TARGET_MIN_ANCHOR_OFFSET -256
20883
20884 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20885 byte offset; we can do much more for larger data types, but have no way
20886 to determine the size of the access. We assume accesses are aligned. */
20887 #undef TARGET_MAX_ANCHOR_OFFSET
20888 #define TARGET_MAX_ANCHOR_OFFSET 4095
20889
20890 #undef TARGET_VECTOR_ALIGNMENT
20891 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20892
20893 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20894 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20895 aarch64_vectorize_preferred_vector_alignment
20896 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20897 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20898 aarch64_simd_vector_alignment_reachable
20899
20900 /* vec_perm support. */
20901
20902 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20903 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20904 aarch64_vectorize_vec_perm_const
20905
20906 #undef TARGET_VECTORIZE_GET_MASK_MODE
20907 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20908 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20909 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20910 aarch64_empty_mask_is_expensive
20911 #undef TARGET_PREFERRED_ELSE_VALUE
20912 #define TARGET_PREFERRED_ELSE_VALUE \
20913 aarch64_preferred_else_value
20914
20915 #undef TARGET_INIT_LIBFUNCS
20916 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20917
20918 #undef TARGET_FIXED_CONDITION_CODE_REGS
20919 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20920
20921 #undef TARGET_FLAGS_REGNUM
20922 #define TARGET_FLAGS_REGNUM CC_REGNUM
20923
20924 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20925 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20926
20927 #undef TARGET_ASAN_SHADOW_OFFSET
20928 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20929
20930 #undef TARGET_LEGITIMIZE_ADDRESS
20931 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20932
20933 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20934 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20935
20936 #undef TARGET_CAN_USE_DOLOOP_P
20937 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20938
20939 #undef TARGET_SCHED_ADJUST_PRIORITY
20940 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20941
20942 #undef TARGET_SCHED_MACRO_FUSION_P
20943 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20944
20945 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20946 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20947
20948 #undef TARGET_SCHED_FUSION_PRIORITY
20949 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20950
20951 #undef TARGET_UNSPEC_MAY_TRAP_P
20952 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20953
20954 #undef TARGET_USE_PSEUDO_PIC_REG
20955 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20956
20957 #undef TARGET_PRINT_OPERAND
20958 #define TARGET_PRINT_OPERAND aarch64_print_operand
20959
20960 #undef TARGET_PRINT_OPERAND_ADDRESS
20961 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20962
20963 #undef TARGET_OPTAB_SUPPORTED_P
20964 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20965
20966 #undef TARGET_OMIT_STRUCT_RETURN_REG
20967 #define TARGET_OMIT_STRUCT_RETURN_REG true
20968
20969 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20970 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20971 aarch64_dwarf_poly_indeterminate_value
20972
20973 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20974 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20975 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20976
20977 #undef TARGET_HARD_REGNO_NREGS
20978 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20979 #undef TARGET_HARD_REGNO_MODE_OK
20980 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20981
20982 #undef TARGET_MODES_TIEABLE_P
20983 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20984
20985 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20986 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20987 aarch64_hard_regno_call_part_clobbered
20988
20989 #undef TARGET_INSN_CALLEE_ABI
20990 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
20991
20992 #undef TARGET_CONSTANT_ALIGNMENT
20993 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20994
20995 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20996 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20997 aarch64_stack_clash_protection_alloca_probe_range
20998
20999 #undef TARGET_COMPUTE_PRESSURE_CLASSES
21000 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
21001
21002 #undef TARGET_CAN_CHANGE_MODE_CLASS
21003 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
21004
21005 #undef TARGET_SELECT_EARLY_REMAT_MODES
21006 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
21007
21008 #undef TARGET_SPECULATION_SAFE_VALUE
21009 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
21010
21011 #undef TARGET_ESTIMATED_POLY_VALUE
21012 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
21013
21014 #undef TARGET_ATTRIBUTE_TABLE
21015 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
21016
21017 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
21018 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
21019 aarch64_simd_clone_compute_vecsize_and_simdlen
21020
21021 #undef TARGET_SIMD_CLONE_ADJUST
21022 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
21023
21024 #undef TARGET_SIMD_CLONE_USABLE
21025 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
21026
21027 #undef TARGET_COMP_TYPE_ATTRIBUTES
21028 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
21029
21030 #undef TARGET_GET_MULTILIB_ABI_NAME
21031 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
21032
21033 #undef TARGET_FNTYPE_ABI
21034 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
21035
21036 #if CHECKING_P
21037 #undef TARGET_RUN_TARGET_SELFTESTS
21038 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
21039 #endif /* #if CHECKING_P */
21040
21041 #undef TARGET_ASM_POST_CFI_STARTPROC
21042 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
21043
21044 struct gcc_target targetm = TARGET_INITIALIZER;
21045
21046 #include "gt-aarch64.h"