[AArch64] Use SVE binary immediate instructions for conditional arithmetic
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
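/* For example, under the default LP64 ABI POINTER_SIZE is 64 and
   BITS_PER_UNIT is 8, so POINTER_BYTES evaluates to 8; under -mabi=ilp32
   it evaluates to 4.  */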
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
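/* A rough illustration of how these constructors are meant to be used
   (hypothetical values, not taken from the real immediate-validation code):
   a vector whose 16-bit elements all equal 0x1200 could be described as

     simd_immediate_info (HImode, 0x12, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8);

   i.e. "MOVI with element value 0x12 shifted left by 8 bits", while an SVE
   INDEX constant { 0, 2, 4, ... } of 32-bit elements could be described as

     simd_immediate_info (SImode, const0_rtx, GEN_INT (2));  */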
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
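/* A sketch of how the X-macro above expands (the real entries live in
   aarch64-fusion-pairs.def); a definition along the lines of

     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)

   becomes the table element

     { "mov+movk", AARCH64_FUSE_MOV_MOVK },

   so each fusion name accepted on the command line maps onto exactly one
   AARCH64_FUSE_* bit.  */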
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
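/* To illustrate how the table is built (hypothetical core, since the real
   entries live in aarch64-cores.def): a definition such as

     AARCH64_CORE ("example-core", examplecore, cortexa57, 8A,
                   AARCH64_FL_FOR_ARCH8, cortexa57, 0x00, 0x000, -1)

   would expand to

     {"example-core", examplecore, cortexa57, AARCH64_ARCH_8A,
      all_architectures[AARCH64_ARCH_8A].architecture_version,
      AARCH64_FL_FOR_ARCH8, &cortexa57_tunings},

   mirroring the explicit "generic" entry above.  */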
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
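/* The A64 condition encoding places each condition next to its inverse,
   so flipping bit 0 is sufficient.  For example:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE   (0 ^ 1 == 1)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  (10 ^ 1 == 11)
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GT) == AARCH64_LE  (12 ^ 1 == 13)  */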
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
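/* A walk-through of how these tables drive parsing (no new behaviour):
   for -mbranch-protection=pac-ret+leaf, the "pac-ret" entry's handler sets
   aarch64_ra_sign_scope to AARCH64_FUNCTION_NON_LEAF and the signing key to
   AARCH64_KEY_A; the parser then looks up "leaf" among
   aarch64_pac_ret_subtypes, whose handler widens the scope to
   AARCH64_FUNCTION_ALL.  */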
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
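/* A sketch of the resulting code (illustrative labels and condition):
   if a conditional branch such as

	b.eq	.Lfar_away	// target more than 1 MiB away

   cannot encode its offset, the caller passes the inverted condition as
   BRANCH_FORMAT and this function emits roughly

	b.ne	.Lbcond7	// short branch around the long one
	b	.Lfar_away	// unconditional B has a +/-128 MiB range
   .Lbcond7:

   so only the limited-range conditional branch targets a nearby label.  */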
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1486 integer, otherwise return X unmodified. */
1487 static rtx
1488 aarch64_bit_representation (rtx x)
1489 {
1490 if (CONST_DOUBLE_P (x))
1491 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1492 return x;
1493 }
1494
1495 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1496 static bool
1497 aarch64_advsimd_struct_mode_p (machine_mode mode)
1498 {
1499 return (TARGET_SIMD
1500 && (mode == OImode || mode == CImode || mode == XImode));
1501 }
1502
1503 /* Return true if MODE is an SVE predicate mode. */
1504 static bool
1505 aarch64_sve_pred_mode_p (machine_mode mode)
1506 {
1507 return (TARGET_SVE
1508 && (mode == VNx16BImode
1509 || mode == VNx8BImode
1510 || mode == VNx4BImode
1511 || mode == VNx2BImode));
1512 }
1513
1514 /* Three mutually-exclusive flags describing a vector or predicate type. */
1515 const unsigned int VEC_ADVSIMD = 1;
1516 const unsigned int VEC_SVE_DATA = 2;
1517 const unsigned int VEC_SVE_PRED = 4;
1518 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1519 a structure of 2, 3 or 4 vectors. */
1520 const unsigned int VEC_STRUCT = 8;
1521 /* Useful combinations of the above. */
1522 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1523 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1524
1525 /* Return a set of flags describing the vector properties of mode MODE.
1526 Ignore modes that are not supported by the current target. */
1527 static unsigned int
1528 aarch64_classify_vector_mode (machine_mode mode)
1529 {
1530 if (aarch64_advsimd_struct_mode_p (mode))
1531 return VEC_ADVSIMD | VEC_STRUCT;
1532
1533 if (aarch64_sve_pred_mode_p (mode))
1534 return VEC_SVE_PRED;
1535
1536 /* Make the decision based on the mode's enum value rather than its
1537 properties, so that we keep the correct classification regardless
1538 of -msve-vector-bits. */
1539 switch (mode)
1540 {
1541 /* Single SVE vectors. */
1542 case E_VNx16QImode:
1543 case E_VNx8HImode:
1544 case E_VNx4SImode:
1545 case E_VNx2DImode:
1546 case E_VNx8HFmode:
1547 case E_VNx4SFmode:
1548 case E_VNx2DFmode:
1549 return TARGET_SVE ? VEC_SVE_DATA : 0;
1550
1551 /* x2 SVE vectors. */
1552 case E_VNx32QImode:
1553 case E_VNx16HImode:
1554 case E_VNx8SImode:
1555 case E_VNx4DImode:
1556 case E_VNx16HFmode:
1557 case E_VNx8SFmode:
1558 case E_VNx4DFmode:
1559 /* x3 SVE vectors. */
1560 case E_VNx48QImode:
1561 case E_VNx24HImode:
1562 case E_VNx12SImode:
1563 case E_VNx6DImode:
1564 case E_VNx24HFmode:
1565 case E_VNx12SFmode:
1566 case E_VNx6DFmode:
1567 /* x4 SVE vectors. */
1568 case E_VNx64QImode:
1569 case E_VNx32HImode:
1570 case E_VNx16SImode:
1571 case E_VNx8DImode:
1572 case E_VNx32HFmode:
1573 case E_VNx16SFmode:
1574 case E_VNx8DFmode:
1575 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1576
1577 /* 64-bit Advanced SIMD vectors. */
1578 case E_V8QImode:
1579 case E_V4HImode:
1580 case E_V2SImode:
1581 /* ...E_V1DImode doesn't exist. */
1582 case E_V4HFmode:
1583 case E_V2SFmode:
1584 case E_V1DFmode:
1585 /* 128-bit Advanced SIMD vectors. */
1586 case E_V16QImode:
1587 case E_V8HImode:
1588 case E_V4SImode:
1589 case E_V2DImode:
1590 case E_V8HFmode:
1591 case E_V4SFmode:
1592 case E_V2DFmode:
1593 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1594
1595 default:
1596 return 0;
1597 }
1598 }
1599
1600 /* Return true if MODE is any of the data vector modes, including
1601 structure modes. */
1602 static bool
1603 aarch64_vector_data_mode_p (machine_mode mode)
1604 {
1605 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1606 }
1607
1608 /* Return true if MODE is an SVE data vector mode; either a single vector
1609 or a structure of vectors. */
1610 static bool
1611 aarch64_sve_data_mode_p (machine_mode mode)
1612 {
1613 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1614 }
1615
1616 /* Implement target hook TARGET_ARRAY_MODE. */
1617 static opt_machine_mode
1618 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1619 {
1620 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1621 && IN_RANGE (nelems, 2, 4))
1622 return mode_for_vector (GET_MODE_INNER (mode),
1623 GET_MODE_NUNITS (mode) * nelems);
1624
1625 return opt_machine_mode ();
1626 }
1627
1628 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1629 static bool
1630 aarch64_array_mode_supported_p (machine_mode mode,
1631 unsigned HOST_WIDE_INT nelems)
1632 {
1633 if (TARGET_SIMD
1634 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1635 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1636 && (nelems >= 2 && nelems <= 4))
1637 return true;
1638
1639 return false;
1640 }
1641
1642 /* Return the SVE predicate mode to use for elements that have
1643 ELEM_NBYTES bytes, if such a mode exists. */
1644
1645 opt_machine_mode
1646 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1647 {
1648 if (TARGET_SVE)
1649 {
1650 if (elem_nbytes == 1)
1651 return VNx16BImode;
1652 if (elem_nbytes == 2)
1653 return VNx8BImode;
1654 if (elem_nbytes == 4)
1655 return VNx4BImode;
1656 if (elem_nbytes == 8)
1657 return VNx2BImode;
1658 }
1659 return opt_machine_mode ();
1660 }
1661
1662 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1663
1664 static opt_machine_mode
1665 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1666 {
1667 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1668 {
1669 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1670 machine_mode pred_mode;
1671 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1672 return pred_mode;
1673 }
1674
1675 return default_get_mask_mode (nunits, nbytes);
1676 }
1677
1678 /* Return the integer element mode associated with SVE mode MODE. */
1679
1680 static scalar_int_mode
1681 aarch64_sve_element_int_mode (machine_mode mode)
1682 {
1683 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1684 GET_MODE_NUNITS (mode));
1685 return int_mode_for_size (elt_bits, 0).require ();
1686 }
1687
1688 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1689 prefer to use the first arithmetic operand as the else value if
1690 the else value doesn't matter, since that exactly matches the SVE
1691 destructive merging form. For ternary operations we could either
1692 pick the first operand and use FMAD-like instructions or the last
1693 operand and use FMLA-like instructions; the latter seems more
1694 natural. */
1695
1696 static tree
1697 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1698 {
1699 return nops == 3 ? ops[2] : ops[0];
1700 }
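/* For example, with a conditional fused multiply-add computing
   OPS[0] * OPS[1] + OPS[2], returning OPS[2] means inactive lanes keep the
   addend, which matches the destructive FMLA form (the accumulator doubles
   as the destination); returning OPS[0] would instead match FMAD, where a
   multiplicand is the destructive operand.  */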
1701
1702 /* Implement TARGET_HARD_REGNO_NREGS. */
1703
1704 static unsigned int
1705 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1706 {
1707 /* ??? Logically we should only need to provide a value when
1708 HARD_REGNO_MODE_OK says that the combination is valid,
1709 but at the moment we need to handle all modes. Just ignore
1710 any runtime parts for registers that can't store them. */
1711 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1712 switch (aarch64_regno_regclass (regno))
1713 {
1714 case FP_REGS:
1715 case FP_LO_REGS:
1716 case FP_LO8_REGS:
1717 if (aarch64_sve_data_mode_p (mode))
1718 return exact_div (GET_MODE_SIZE (mode),
1719 BYTES_PER_SVE_VECTOR).to_constant ();
1720 return CEIL (lowest_size, UNITS_PER_VREG);
1721 case PR_REGS:
1722 case PR_LO_REGS:
1723 case PR_HI_REGS:
1724 return 1;
1725 default:
1726 return CEIL (lowest_size, UNITS_PER_WORD);
1727 }
1728 gcc_unreachable ();
1729 }
1730
1731 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1732
1733 static bool
1734 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1735 {
1736 if (GET_MODE_CLASS (mode) == MODE_CC)
1737 return regno == CC_REGNUM;
1738
1739 if (regno == VG_REGNUM)
1740 /* This must have the same size as _Unwind_Word. */
1741 return mode == DImode;
1742
1743 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1744 if (vec_flags & VEC_SVE_PRED)
1745 return PR_REGNUM_P (regno);
1746
1747 if (PR_REGNUM_P (regno))
1748 return 0;
1749
1750 if (regno == SP_REGNUM)
1751 /* The purpose of comparing with ptr_mode is to support the
1752 global register variable associated with the stack pointer
1753 register via the syntax of asm ("wsp") in ILP32. */
1754 return mode == Pmode || mode == ptr_mode;
1755
1756 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1757 return mode == Pmode;
1758
1759 if (GP_REGNUM_P (regno))
1760 {
1761 if (known_le (GET_MODE_SIZE (mode), 8))
1762 return true;
1763 else if (known_le (GET_MODE_SIZE (mode), 16))
1764 return (regno & 1) == 0;
1765 }
1766 else if (FP_REGNUM_P (regno))
1767 {
1768 if (vec_flags & VEC_STRUCT)
1769 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1770 else
1771 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1772 }
1773
1774 return false;
1775 }
1776
1777 /* Return true if this is a definition of a vectorized simd function. */
1778
1779 static bool
1780 aarch64_simd_decl_p (tree fndecl)
1781 {
1782 tree fntype;
1783
1784 if (fndecl == NULL)
1785 return false;
1786 fntype = TREE_TYPE (fndecl);
1787 if (fntype == NULL)
1788 return false;
1789
1790 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1791 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1792 return true;
1793
1794 return false;
1795 }
1796
1797 /* Return the mode a register save/restore should use. DImode for integer
1798 registers, DFmode for FP registers in non-SIMD functions (they only save
1799 the bottom half of a 128 bit register), or TFmode for FP registers in
1800 SIMD functions. */
1801
1802 static machine_mode
1803 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1804 {
1805 return GP_REGNUM_P (regno)
1806 ? E_DImode
1807 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1808 }
1809
1810 /* Return true if the instruction is a call to a SIMD function, false
1811 if it is not a SIMD function or if we do not know anything about
1812 the function. */
1813
1814 static bool
1815 aarch64_simd_call_p (rtx_insn *insn)
1816 {
1817 rtx symbol;
1818 rtx call;
1819 tree fndecl;
1820
1821 gcc_assert (CALL_P (insn));
1822 call = get_call_rtx_from (insn);
1823 symbol = XEXP (XEXP (call, 0), 0);
1824 if (GET_CODE (symbol) != SYMBOL_REF)
1825 return false;
1826 fndecl = SYMBOL_REF_DECL (symbol);
1827 if (!fndecl)
1828 return false;
1829
1830 return aarch64_simd_decl_p (fndecl);
1831 }
1832
1833 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1834 a function that uses the SIMD ABI, take advantage of the extra
1835 call-preserved registers that the ABI provides. */
1836
1837 void
1838 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1839 HARD_REG_SET *return_set)
1840 {
1841 if (aarch64_simd_call_p (insn))
1842 {
1843 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1844 if (FP_SIMD_SAVED_REGNUM_P (regno))
1845 CLEAR_HARD_REG_BIT (*return_set, regno);
1846 }
1847 }
1848
1849 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1850 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1851 clobbers the top 64 bits when restoring the bottom 64 bits. */
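/* For example, a 16-byte TFmode value in V8 is part-clobbered by a call
   to a function that uses the base PCS, since such calls preserve only
   the low 64 bits of V8-V15, but it is not part-clobbered by a call to
   a function that uses the aarch64_vector_pcs ABI, which preserves the
   full 128-bit registers.  */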
1852
1853 static bool
1854 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1855 machine_mode mode)
1856 {
1857 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1858 return FP_REGNUM_P (regno)
1859 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1860 }
1861
1862 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1863
1864 rtx_insn *
1865 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1866 {
1867 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1868
1869 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1870 return call_1;
1871 else
1872 return call_2;
1873 }
1874
1875 /* Implement REGMODE_NATURAL_SIZE. */
1876 poly_uint64
1877 aarch64_regmode_natural_size (machine_mode mode)
1878 {
1879 /* The natural size for SVE data modes is one SVE data vector,
1880 and similarly for predicates. We can't independently modify
1881 anything smaller than that. */
1882 /* ??? For now, only do this for variable-width SVE registers.
1883 Doing it for constant-sized registers breaks lower-subreg.c. */
1884 /* ??? And once that's fixed, we should probably have similar
1885 code for Advanced SIMD. */
1886 if (!aarch64_sve_vg.is_constant ())
1887 {
1888 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1889 if (vec_flags & VEC_SVE_PRED)
1890 return BYTES_PER_SVE_PRED;
1891 if (vec_flags & VEC_SVE_DATA)
1892 return BYTES_PER_SVE_VECTOR;
1893 }
1894 return UNITS_PER_WORD;
1895 }
1896
1897 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1898 machine_mode
1899 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1900 machine_mode mode)
1901 {
1902 /* The predicate mode determines which bits are significant and
1903 which are "don't care". Decreasing the number of lanes would
1904 lose data while increasing the number of lanes would make bits
1905 unnecessarily significant. */
1906 if (PR_REGNUM_P (regno))
1907 return mode;
1908 if (known_ge (GET_MODE_SIZE (mode), 4))
1909 return mode;
1910 else
1911 return SImode;
1912 }
1913
1914 /* Return true if I's bits are consecutive ones from the MSB. */
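/* For example, -1 and 0xffff000000000000 (i.e. -(1 << 48)) are accepted,
   since negating them gives a power of 2, whereas 0 and
   0xffff000000000001 are rejected.  */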
1915 bool
1916 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1917 {
1918 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1919 }
1920
1921 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1922 that strcpy from constants will be faster. */
1923
1924 static HOST_WIDE_INT
1925 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1926 {
1927 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1928 return MAX (align, BITS_PER_WORD);
1929 return align;
1930 }
1931
1932 /* Return true if calls to DECL should be treated as
1933 long-calls (ie called via a register). */
1934 static bool
1935 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1936 {
1937 return false;
1938 }
1939
1940 /* Return true if calls to symbol-ref SYM should be treated as
1941 long-calls (ie called via a register). */
1942 bool
1943 aarch64_is_long_call_p (rtx sym)
1944 {
1945 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1946 }
1947
1948 /* Return true if calls to symbol-ref SYM should not go through
1949 plt stubs. */
1950
1951 bool
1952 aarch64_is_noplt_call_p (rtx sym)
1953 {
1954 const_tree decl = SYMBOL_REF_DECL (sym);
1955
1956 if (flag_pic
1957 && decl
1958 && (!flag_plt
1959 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1960 && !targetm.binds_local_p (decl))
1961 return true;
1962
1963 return false;
1964 }
1965
1966 /* Return true if the offsets to a zero/sign-extract operation
1967 represent an expression that matches an extend operation. The
1968 operands represent the parameters from
1969
1970 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
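/* For example, a MULT_IMM of 4 and an EXTRACT_IMM of 34 are accepted
   for DImode: extracting the low 34 bits of (reg * 4) is equivalent to
   extending the low 32 bits of reg and shifting the result left by 2,
   i.e. the kind of operand accepted by the UXTW/SXTW #2
   extended-register forms.  */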
1971 bool
1972 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1973 rtx extract_imm)
1974 {
1975 HOST_WIDE_INT mult_val, extract_val;
1976
1977 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1978 return false;
1979
1980 mult_val = INTVAL (mult_imm);
1981 extract_val = INTVAL (extract_imm);
1982
1983 if (extract_val > 8
1984 && extract_val < GET_MODE_BITSIZE (mode)
1985 && exact_log2 (extract_val & ~7) > 0
1986 && (extract_val & 7) <= 4
1987 && mult_val == (1 << (extract_val & 7)))
1988 return true;
1989
1990 return false;
1991 }
1992
1993 /* Emit an insn that's a simple single-set. Both the operands must be
1994 known to be valid. */
1995 inline static rtx_insn *
1996 emit_set_insn (rtx x, rtx y)
1997 {
1998 return emit_insn (gen_rtx_SET (x, y));
1999 }
2000
2001 /* X and Y are two things to compare using CODE. Emit the compare insn and
2002 return the rtx for register 0 in the proper mode. */
2003 rtx
2004 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2005 {
2006 machine_mode mode = SELECT_CC_MODE (code, x, y);
2007 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2008
2009 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2010 return cc_reg;
2011 }
2012
2013 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2014
2015 static rtx
2016 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2017 machine_mode y_mode)
2018 {
2019 if (y_mode == E_QImode || y_mode == E_HImode)
2020 {
2021 if (CONST_INT_P (y))
2022 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2023 else
2024 {
2025 rtx t, cc_reg;
2026 machine_mode cc_mode;
2027
2028 t = gen_rtx_ZERO_EXTEND (SImode, y);
2029 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2030 cc_mode = CC_SWPmode;
2031 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2032 emit_set_insn (cc_reg, t);
2033 return cc_reg;
2034 }
2035 }
2036
2037 return aarch64_gen_compare_reg (code, x, y);
2038 }
2039
2040 /* Build the SYMBOL_REF for __tls_get_addr. */
2041
2042 static GTY(()) rtx tls_get_addr_libfunc;
2043
2044 rtx
2045 aarch64_tls_get_addr (void)
2046 {
2047 if (!tls_get_addr_libfunc)
2048 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2049 return tls_get_addr_libfunc;
2050 }
2051
2052 /* Return the TLS model to use for ADDR. */
2053
2054 static enum tls_model
2055 tls_symbolic_operand_type (rtx addr)
2056 {
2057 enum tls_model tls_kind = TLS_MODEL_NONE;
2058 if (GET_CODE (addr) == CONST)
2059 {
2060 poly_int64 addend;
2061 rtx sym = strip_offset (addr, &addend);
2062 if (GET_CODE (sym) == SYMBOL_REF)
2063 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2064 }
2065 else if (GET_CODE (addr) == SYMBOL_REF)
2066 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2067
2068 return tls_kind;
2069 }
2070
2071 /* We allow lo_sum expressions in our legitimate addresses so that
2072 combine can take care of combining addresses where necessary, but
2073 for generation purposes we generate the address as:
2074
2075 RTL Absolute
2076 tmp = hi (symbol_ref); adrp x1, foo
2077 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2078 nop
2079
2080 PIC TLS
2081 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2082 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2083 bl __tls_get_addr
2084 nop
2085
2086 Load TLS symbol, depending on TLS mechanism and TLS access model.
2087
2088 Global Dynamic - Traditional TLS:
2089 adrp tmp, :tlsgd:imm
2090 add dest, tmp, #:tlsgd_lo12:imm
2091 bl __tls_get_addr
2092
2093 Global Dynamic - TLS Descriptors:
2094 adrp dest, :tlsdesc:imm
2095 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2096 add dest, dest, #:tlsdesc_lo12:imm
2097 blr tmp
2098 mrs tp, tpidr_el0
2099 add dest, dest, tp
2100
2101 Initial Exec:
2102 mrs tp, tpidr_el0
2103 adrp tmp, :gottprel:imm
2104 ldr dest, [tmp, #:gottprel_lo12:imm]
2105 add dest, dest, tp
2106
2107 Local Exec:
2108 mrs tp, tpidr_el0
2109 add t0, tp, #:tprel_hi12:imm, lsl #12
2110 add t0, t0, #:tprel_lo12_nc:imm
2111 */
2112
2113 static void
2114 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2115 enum aarch64_symbol_type type)
2116 {
2117 switch (type)
2118 {
2119 case SYMBOL_SMALL_ABSOLUTE:
2120 {
2121 /* In ILP32, the mode of dest can be either SImode or DImode. */
2122 rtx tmp_reg = dest;
2123 machine_mode mode = GET_MODE (dest);
2124
2125 gcc_assert (mode == Pmode || mode == ptr_mode);
2126
2127 if (can_create_pseudo_p ())
2128 tmp_reg = gen_reg_rtx (mode);
2129
2130 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2131 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2132 return;
2133 }
2134
2135 case SYMBOL_TINY_ABSOLUTE:
2136 emit_insn (gen_rtx_SET (dest, imm));
2137 return;
2138
2139 case SYMBOL_SMALL_GOT_28K:
2140 {
2141 machine_mode mode = GET_MODE (dest);
2142 rtx gp_rtx = pic_offset_table_rtx;
2143 rtx insn;
2144 rtx mem;
2145
2146 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2147 here before rtl expansion.  Tree IVOPTs will generate rtl patterns to
2148 decide rtx costs, in which case pic_offset_table_rtx is not
2149 initialized.  In that case there is no need to generate the first adrp
2150 instruction, as the final cost for global variable access is
2151 one instruction.  */
2152 if (gp_rtx != NULL)
2153 {
2154 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since we
2155 use the page base as the GOT base, the first page may be wasted; in
2156 the worst case there is only 28K of space for the GOT).
2157
2158 The generated instruction sequence for accessing a global variable
2159 is:
2160
2161 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2162
2163 Only one instruction is needed, but we must initialize
2164 pic_offset_table_rtx properly.  We generate an initialization insn
2165 for every global access and rely on CSE to remove the redundant ones.
2166
2167 The final instruction sequence will look like the following when
2168 multiple global variables are accessed:
2169
2170 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2171
2172 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2173 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2174 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2175 ... */
2176
2177 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2178 crtl->uses_pic_offset_table = 1;
2179 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2180
2181 if (mode != GET_MODE (gp_rtx))
2182 gp_rtx = gen_lowpart (mode, gp_rtx);
2183
2184 }
2185
2186 if (mode == ptr_mode)
2187 {
2188 if (mode == DImode)
2189 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2190 else
2191 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2192
2193 mem = XVECEXP (SET_SRC (insn), 0, 0);
2194 }
2195 else
2196 {
2197 gcc_assert (mode == Pmode);
2198
2199 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2200 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2201 }
2202
2203 /* The operand is expected to be a MEM.  Whenever the related insn
2204 pattern changes, the above code that calculates mem should be
2205 updated.  */
2206 gcc_assert (GET_CODE (mem) == MEM);
2207 MEM_READONLY_P (mem) = 1;
2208 MEM_NOTRAP_P (mem) = 1;
2209 emit_insn (insn);
2210 return;
2211 }
2212
2213 case SYMBOL_SMALL_GOT_4G:
2214 {
2215 /* In ILP32, the mode of dest can be either SImode or DImode,
2216 while the got entry is always of SImode size. The mode of
2217 dest depends on how dest is used: if dest is assigned to a
2218 pointer (e.g. stored in memory), it has SImode; it may have
2219 DImode if dest is dereferenced to access memory.
2220 This is why we have to handle three different ldr_got_small
2221 patterns here (two patterns for ILP32). */
2222
2223 rtx insn;
2224 rtx mem;
2225 rtx tmp_reg = dest;
2226 machine_mode mode = GET_MODE (dest);
2227
2228 if (can_create_pseudo_p ())
2229 tmp_reg = gen_reg_rtx (mode);
2230
2231 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2232 if (mode == ptr_mode)
2233 {
2234 if (mode == DImode)
2235 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2236 else
2237 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2238
2239 mem = XVECEXP (SET_SRC (insn), 0, 0);
2240 }
2241 else
2242 {
2243 gcc_assert (mode == Pmode);
2244
2245 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2246 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2247 }
2248
2249 gcc_assert (GET_CODE (mem) == MEM);
2250 MEM_READONLY_P (mem) = 1;
2251 MEM_NOTRAP_P (mem) = 1;
2252 emit_insn (insn);
2253 return;
2254 }
2255
2256 case SYMBOL_SMALL_TLSGD:
2257 {
2258 rtx_insn *insns;
2259 machine_mode mode = GET_MODE (dest);
2260 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2261
2262 start_sequence ();
2263 if (TARGET_ILP32)
2264 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2265 else
2266 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2267 insns = get_insns ();
2268 end_sequence ();
2269
2270 RTL_CONST_CALL_P (insns) = 1;
2271 emit_libcall_block (insns, dest, result, imm);
2272 return;
2273 }
2274
2275 case SYMBOL_SMALL_TLSDESC:
2276 {
2277 machine_mode mode = GET_MODE (dest);
2278 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2279 rtx tp;
2280
2281 gcc_assert (mode == Pmode || mode == ptr_mode);
2282
2283 /* In ILP32, the got entry is always of SImode size. Unlike
2284 small GOT, the dest is fixed at reg 0. */
2285 if (TARGET_ILP32)
2286 emit_insn (gen_tlsdesc_small_si (imm));
2287 else
2288 emit_insn (gen_tlsdesc_small_di (imm));
2289 tp = aarch64_load_tp (NULL);
2290
2291 if (mode != Pmode)
2292 tp = gen_lowpart (mode, tp);
2293
2294 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2295 if (REG_P (dest))
2296 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2297 return;
2298 }
2299
2300 case SYMBOL_SMALL_TLSIE:
2301 {
2302 /* In ILP32, the mode of dest can be either SImode or DImode,
2303 while the got entry is always of SImode size. The mode of
2304 dest depends on how dest is used: if dest is assigned to a
2305 pointer (e.g. stored in memory), it has SImode; it may have
2306 DImode if dest is dereferenced to access memory.
2307 This is why we have to handle three different tlsie_small
2308 patterns here (two patterns for ILP32). */
2309 machine_mode mode = GET_MODE (dest);
2310 rtx tmp_reg = gen_reg_rtx (mode);
2311 rtx tp = aarch64_load_tp (NULL);
2312
2313 if (mode == ptr_mode)
2314 {
2315 if (mode == DImode)
2316 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2317 else
2318 {
2319 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2320 tp = gen_lowpart (mode, tp);
2321 }
2322 }
2323 else
2324 {
2325 gcc_assert (mode == Pmode);
2326 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2327 }
2328
2329 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2330 if (REG_P (dest))
2331 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2332 return;
2333 }
2334
2335 case SYMBOL_TLSLE12:
2336 case SYMBOL_TLSLE24:
2337 case SYMBOL_TLSLE32:
2338 case SYMBOL_TLSLE48:
2339 {
2340 machine_mode mode = GET_MODE (dest);
2341 rtx tp = aarch64_load_tp (NULL);
2342
2343 if (mode != Pmode)
2344 tp = gen_lowpart (mode, tp);
2345
2346 switch (type)
2347 {
2348 case SYMBOL_TLSLE12:
2349 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2350 (dest, tp, imm));
2351 break;
2352 case SYMBOL_TLSLE24:
2353 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2354 (dest, tp, imm));
2355 break;
2356 case SYMBOL_TLSLE32:
2357 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2358 (dest, imm));
2359 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2360 (dest, dest, tp));
2361 break;
2362 case SYMBOL_TLSLE48:
2363 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2364 (dest, imm));
2365 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2366 (dest, dest, tp));
2367 break;
2368 default:
2369 gcc_unreachable ();
2370 }
2371
2372 if (REG_P (dest))
2373 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2374 return;
2375 }
2376
2377 case SYMBOL_TINY_GOT:
2378 emit_insn (gen_ldr_got_tiny (dest, imm));
2379 return;
2380
2381 case SYMBOL_TINY_TLSIE:
2382 {
2383 machine_mode mode = GET_MODE (dest);
2384 rtx tp = aarch64_load_tp (NULL);
2385
2386 if (mode == ptr_mode)
2387 {
2388 if (mode == DImode)
2389 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2390 else
2391 {
2392 tp = gen_lowpart (mode, tp);
2393 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2394 }
2395 }
2396 else
2397 {
2398 gcc_assert (mode == Pmode);
2399 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2400 }
2401
2402 if (REG_P (dest))
2403 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2404 return;
2405 }
2406
2407 default:
2408 gcc_unreachable ();
2409 }
2410 }
2411
2412 /* Emit a move from SRC to DEST. Assume that the move expanders can
2413 handle all moves if !can_create_pseudo_p (). The distinction is
2414 important because, unlike emit_move_insn, the move expanders know
2415 how to force Pmode objects into the constant pool even when the
2416 constant pool address is not itself legitimate. */
2417 static rtx
2418 aarch64_emit_move (rtx dest, rtx src)
2419 {
2420 return (can_create_pseudo_p ()
2421 ? emit_move_insn (dest, src)
2422 : emit_move_insn_1 (dest, src));
2423 }
2424
2425 /* Apply UNOPTAB to OP and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2429 {
2430 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2431 if (dest != tmp)
2432 emit_move_insn (dest, tmp);
2433 }
2434
2435 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2436
2437 static void
2438 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2439 {
2440 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2441 OPTAB_DIRECT);
2442 if (dest != tmp)
2443 emit_move_insn (dest, tmp);
2444 }
2445
2446 /* Split a 128-bit move operation into two 64-bit move operations,
2447 taking care to handle partial overlap of register to register
2448 copies. Special cases are needed when moving between GP regs and
2449 FP regs. SRC can be a register, constant or memory; DST a register
2450 or memory. If either operand is memory it must not have any side
2451 effects. */
2452 void
2453 aarch64_split_128bit_move (rtx dst, rtx src)
2454 {
2455 rtx dst_lo, dst_hi;
2456 rtx src_lo, src_hi;
2457
2458 machine_mode mode = GET_MODE (dst);
2459
2460 gcc_assert (mode == TImode || mode == TFmode);
2461 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2462 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2463
2464 if (REG_P (dst) && REG_P (src))
2465 {
2466 int src_regno = REGNO (src);
2467 int dst_regno = REGNO (dst);
2468
2469 /* Handle FP <-> GP regs. */
2470 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2471 {
2472 src_lo = gen_lowpart (word_mode, src);
2473 src_hi = gen_highpart (word_mode, src);
2474
2475 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2476 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2477 return;
2478 }
2479 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2480 {
2481 dst_lo = gen_lowpart (word_mode, dst);
2482 dst_hi = gen_highpart (word_mode, dst);
2483
2484 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2485 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2486 return;
2487 }
2488 }
2489
2490 dst_lo = gen_lowpart (word_mode, dst);
2491 dst_hi = gen_highpart (word_mode, dst);
2492 src_lo = gen_lowpart (word_mode, src);
2493 src_hi = gen_highpart_mode (word_mode, mode, src);
2494
2495 /* At most one pairing may overlap. */
2496 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2497 {
2498 aarch64_emit_move (dst_hi, src_hi);
2499 aarch64_emit_move (dst_lo, src_lo);
2500 }
2501 else
2502 {
2503 aarch64_emit_move (dst_lo, src_lo);
2504 aarch64_emit_move (dst_hi, src_hi);
2505 }
2506 }
2507
2508 bool
2509 aarch64_split_128bit_move_p (rtx dst, rtx src)
2510 {
2511 return (! REG_P (src)
2512 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2513 }
2514
2515 /* Split a complex SIMD combine. */
2516
2517 void
2518 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2519 {
2520 machine_mode src_mode = GET_MODE (src1);
2521 machine_mode dst_mode = GET_MODE (dst);
2522
2523 gcc_assert (VECTOR_MODE_P (dst_mode));
2524 gcc_assert (register_operand (dst, dst_mode)
2525 && register_operand (src1, src_mode)
2526 && register_operand (src2, src_mode));
2527
2528 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2529 return;
2530 }
2531
2532 /* Split a complex SIMD move. */
2533
2534 void
2535 aarch64_split_simd_move (rtx dst, rtx src)
2536 {
2537 machine_mode src_mode = GET_MODE (src);
2538 machine_mode dst_mode = GET_MODE (dst);
2539
2540 gcc_assert (VECTOR_MODE_P (dst_mode));
2541
2542 if (REG_P (dst) && REG_P (src))
2543 {
2544 gcc_assert (VECTOR_MODE_P (src_mode));
2545 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2546 }
2547 }
2548
2549 bool
2550 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2551 machine_mode ymode, rtx y)
2552 {
2553 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2554 gcc_assert (r != NULL);
2555 return rtx_equal_p (x, r);
2556 }
2557
2558
2559 /* Return TARGET if it is nonnull and a register of mode MODE.
2560 Otherwise, return a fresh register of mode MODE if we can,
2561 or TARGET reinterpreted as MODE if we can't. */
2562
2563 static rtx
2564 aarch64_target_reg (rtx target, machine_mode mode)
2565 {
2566 if (target && REG_P (target) && GET_MODE (target) == mode)
2567 return target;
2568 if (!can_create_pseudo_p ())
2569 {
2570 gcc_assert (target);
2571 return gen_lowpart (mode, target);
2572 }
2573 return gen_reg_rtx (mode);
2574 }
2575
2576 /* Return a register that contains the constant in BUILDER, given that
2577 the constant is a legitimate move operand. Use TARGET as the register
2578 if it is nonnull and convenient. */
2579
2580 static rtx
2581 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2582 {
2583 rtx src = builder.build ();
2584 target = aarch64_target_reg (target, GET_MODE (src));
2585 emit_insn (gen_rtx_SET (target, src));
2586 return target;
2587 }
2588
2589 static rtx
2590 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2591 {
2592 if (can_create_pseudo_p ())
2593 return force_reg (mode, value);
2594 else
2595 {
2596 gcc_assert (x);
2597 aarch64_emit_move (x, value);
2598 return x;
2599 }
2600 }
2601
2602 /* Return true if predicate value X is a constant in which every element
2603 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2604 value, i.e. as a predicate in which all bits are significant. */
2605
2606 static bool
2607 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2608 {
2609 if (GET_CODE (x) != CONST_VECTOR)
2610 return false;
2611
2612 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2613 GET_MODE_NUNITS (GET_MODE (x)));
2614 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2615 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2616 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2617
2618 unsigned int nelts = const_vector_encoded_nelts (x);
2619 for (unsigned int i = 0; i < nelts; ++i)
2620 {
2621 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2622 if (!CONST_INT_P (elt))
2623 return false;
2624
2625 builder.quick_push (elt);
2626 for (unsigned int j = 1; j < factor; ++j)
2627 builder.quick_push (const0_rtx);
2628 }
2629 builder.finalize ();
2630 return true;
2631 }
2632
2633 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2634 widest predicate element size it can have (that is, the largest size
2635 for which each element would still be 0 or 1). */
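/* For example, a VNx16BI constant built from four one-element patterns
   { 1, 0, 0, 0 } (every fourth bit set) has a widest element size of 4:
   it can be interpreted as a .S predicate in which every element is
   active.  */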
2636
2637 unsigned int
2638 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2639 {
2640 /* Start with the most optimistic assumption: that we only need
2641 one bit per pattern. This is what we will use if only the first
2642 bit in each pattern is ever set. */
2643 unsigned int mask = GET_MODE_SIZE (DImode);
2644 mask |= builder.npatterns ();
2645
2646 /* Look for set bits. */
2647 unsigned int nelts = builder.encoded_nelts ();
2648 for (unsigned int i = 1; i < nelts; ++i)
2649 if (INTVAL (builder.elt (i)) != 0)
2650 {
2651 if (i & 1)
2652 return 1;
2653 mask |= i;
2654 }
2655 return mask & -mask;
2656 }
2657
2658 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2659 that the constant would have with predicate element size ELT_SIZE
2660 (ignoring the upper bits in each element) and return:
2661
2662 * -1 if all bits are set
2663 * N if the predicate has N leading set bits followed by all clear bits
2664 * 0 if the predicate does not have any of these forms. */
2665
2666 int
2667 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2668 unsigned int elt_size)
2669 {
2670 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2671 followed by set bits. */
2672 if (builder.nelts_per_pattern () == 3)
2673 return 0;
2674
2675 /* Skip over leading set bits. */
2676 unsigned int nelts = builder.encoded_nelts ();
2677 unsigned int i = 0;
2678 for (; i < nelts; i += elt_size)
2679 if (INTVAL (builder.elt (i)) == 0)
2680 break;
2681 unsigned int vl = i / elt_size;
2682
2683 /* Check for the all-true case. */
2684 if (i == nelts)
2685 return -1;
2686
2687 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2688 repeating pattern of set bits followed by clear bits. */
2689 if (builder.nelts_per_pattern () != 2)
2690 return 0;
2691
2692 /* We have a "foreground" value and a duplicated "background" value.
2693 If the background might repeat and the last set bit belongs to it,
2694 we might have set bits followed by clear bits followed by set bits. */
2695 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2696 return 0;
2697
2698 /* Make sure that the rest are all clear. */
2699 for (; i < nelts; i += elt_size)
2700 if (INTVAL (builder.elt (i)) != 0)
2701 return 0;
2702
2703 return vl;
2704 }
2705
2706 /* See if there is an svpattern that encodes an SVE predicate of mode
2707 PRED_MODE in which the first VL bits are set and the rest are clear.
2708 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2709 A VL of -1 indicates an all-true vector. */
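/* For example, a VL of 7 maps to AARCH64_SV_VL7 and a VL of 64 maps to
   AARCH64_SV_VL64, whereas a VL of 9 only has an encoding if the number
   of elements in PRED_MODE is known at compile time and 9 happens to
   match its POW2, MUL3 or MUL4 pattern.  */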
2710
2711 aarch64_svpattern
2712 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2713 {
2714 if (vl < 0)
2715 return AARCH64_SV_ALL;
2716
2717 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2718 return AARCH64_NUM_SVPATTERNS;
2719
2720 if (vl >= 1 && vl <= 8)
2721 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2722
2723 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2724 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2725
2726 int max_vl;
2727 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2728 {
2729 if (vl == (max_vl / 3) * 3)
2730 return AARCH64_SV_MUL3;
2731 /* These would only trigger for non-power-of-2 lengths. */
2732 if (vl == (max_vl & -4))
2733 return AARCH64_SV_MUL4;
2734 if (vl == (1 << floor_log2 (max_vl)))
2735 return AARCH64_SV_POW2;
2736 if (vl == max_vl)
2737 return AARCH64_SV_ALL;
2738 }
2739 return AARCH64_NUM_SVPATTERNS;
2740 }
2741
2742 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2743 bits has the lowest bit set and the upper bits clear. This is the
2744 VNx16BImode equivalent of a PTRUE for controlling elements of
2745 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2746 all bits are significant, even the upper zeros. */
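/* For example, aarch64_ptrue_all (2) is the VNx16BImode constant
   { 1, 0, 1, 0, ... }, which is the value that "ptrue pN.h, all"
   produces when viewed as a full byte-level predicate.  */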
2747
2748 rtx
2749 aarch64_ptrue_all (unsigned int elt_size)
2750 {
2751 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2752 builder.quick_push (const1_rtx);
2753 for (unsigned int i = 1; i < elt_size; ++i)
2754 builder.quick_push (const0_rtx);
2755 return builder.build ();
2756 }
2757
2758 /* Return an all-true predicate register of mode MODE. */
2759
2760 rtx
2761 aarch64_ptrue_reg (machine_mode mode)
2762 {
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2766 }
2767
2768 /* Return an all-false predicate register of mode MODE. */
2769
2770 rtx
2771 aarch64_pfalse_reg (machine_mode mode)
2772 {
2773 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2774 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2775 return gen_lowpart (mode, reg);
2776 }
2777
2778 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2779 true, or alternatively if we know that the operation predicated by
2780 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2781 aarch64_sve_gp_strictness operand that describes the operation
2782 predicated by PRED1[0]. */
2783
2784 bool
2785 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2786 {
2787 machine_mode mode = GET_MODE (pred2);
2788 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2789 && mode == GET_MODE (pred1[0])
2790 && aarch64_sve_gp_strictness (pred1[1], SImode));
2791 return (pred1[0] == CONSTM1_RTX (mode)
2792 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2793 || rtx_equal_p (pred1[0], pred2));
2794 }
2795
2796 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2797 for it. PRED2[0] is the predicate for the instruction whose result
2798 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2799 for it. Return true if we can prove that the two predicates are
2800 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2801 with PRED1[0] without changing behavior. */
2802
2803 bool
2804 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2805 {
2806 machine_mode mode = GET_MODE (pred1[0]);
2807 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2808 && mode == GET_MODE (pred2[0])
2809 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2810 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2811
2812 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2813 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2814 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2815 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2816 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2817 }
2818
2819 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2820 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2821 Use TARGET as the target register if nonnull and convenient. */
2822
2823 static rtx
2824 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2825 machine_mode data_mode, rtx op1, rtx op2)
2826 {
2827 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2828 expand_operand ops[5];
2829 create_output_operand (&ops[0], target, pred_mode);
2830 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2831 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2832 create_input_operand (&ops[3], op1, data_mode);
2833 create_input_operand (&ops[4], op2, data_mode);
2834 expand_insn (icode, 5, ops);
2835 return ops[0].value;
2836 }
2837
2838 /* Use a comparison to convert integer vector SRC into MODE, which is
2839 the corresponding SVE predicate mode. Use TARGET for the result
2840 if it's nonnull and convenient. */
2841
2842 static rtx
2843 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2844 {
2845 machine_mode src_mode = GET_MODE (src);
2846 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2847 src, CONST0_RTX (src_mode));
2848 }
2849
2850 /* Return true if we can move VALUE into a register using a single
2851 CNT[BHWD] instruction. */
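/* For example, poly_int64 (2, 2) is the value of CNTD (two doublewords
   per 128-bit quadword) and poly_int64 (240, 240) can be loaded with
   "cntb x0, all, mul #15", whereas poly_int64 (34, 34) would need a
   multiplier of 17 and so is rejected.  */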
2852
2853 static bool
2854 aarch64_sve_cnt_immediate_p (poly_int64 value)
2855 {
2856 HOST_WIDE_INT factor = value.coeffs[0];
2857 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2858 return (value.coeffs[1] == factor
2859 && IN_RANGE (factor, 2, 16 * 16)
2860 && (factor & 1) == 0
2861 && factor <= 16 * (factor & -factor));
2862 }
2863
2864 /* Likewise for rtx X. */
2865
2866 bool
2867 aarch64_sve_cnt_immediate_p (rtx x)
2868 {
2869 poly_int64 value;
2870 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2871 }
2872
2873 /* Return the asm string for an instruction with a CNT-like vector size
2874 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2875 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2876 first part of the operands template (the part that comes before the
2877 vector size itself). FACTOR is the number of quadwords.
2878 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2879 If it is zero, we can use any element size. */
2880
2881 static char *
2882 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2883 unsigned int factor,
2884 unsigned int nelts_per_vq)
2885 {
2886 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2887
2888 if (nelts_per_vq == 0)
2889 /* There is some overlap in the ranges of the four CNT instructions.
2890 Here we always use the smallest possible element size, so that the
2891 multiplier is 1 wherever possible. */
2892 nelts_per_vq = factor & -factor;
2893 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2894 gcc_assert (IN_RANGE (shift, 1, 4));
2895 char suffix = "dwhb"[shift - 1];
2896
2897 factor >>= shift;
2898 unsigned int written;
2899 if (factor == 1)
2900 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2901 prefix, suffix, operands);
2902 else
2903 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2904 prefix, suffix, operands, factor);
2905 gcc_assert (written < sizeof (buffer));
2906 return buffer;
2907 }
2908
2909 /* Return the asm string for an instruction with a CNT-like vector size
2910 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2911 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2912 first part of the operands template (the part that comes before the
2913 vector size itself). X is the value of the vector size operand,
2914 as a polynomial integer rtx. */
2915
2916 char *
2917 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2918 rtx x)
2919 {
2920 poly_int64 value = rtx_to_poly_int64 (x);
2921 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2922 return aarch64_output_sve_cnt_immediate (prefix, operands,
2923 value.coeffs[1], 0);
2924 }
2925
2926 /* Return true if we can add VALUE to a register using a single ADDVL
2927 or ADDPL instruction. */
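/* For example, poly_int64 (16, 16) is one full vector and can be added
   with "addvl x0, x1, #1", poly_int64 (6, 6) is three predicate widths
   and can be added with "addpl x0, x1, #3", whereas poly_int64 (3, 3)
   is not a whole number of predicate widths and is rejected.  */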
2928
2929 static bool
2930 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2931 {
2932 HOST_WIDE_INT factor = value.coeffs[0];
2933 if (factor == 0 || value.coeffs[1] != factor)
2934 return false;
2935 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2936 and a value of 16 is one vector width. */
2937 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2938 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2939 }
2940
2941 /* Likewise for rtx X. */
2942
2943 bool
2944 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2945 {
2946 poly_int64 value;
2947 return (poly_int_rtx_p (x, &value)
2948 && aarch64_sve_addvl_addpl_immediate_p (value));
2949 }
2950
2951 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2952 and storing the result in operand 0. */
2953
2954 char *
2955 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2956 {
2957 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2958 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2959 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2960
2961 /* Use INC or DEC if possible. */
2962 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2963 {
2964 if (aarch64_sve_cnt_immediate_p (offset_value))
2965 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2966 offset_value.coeffs[1], 0);
2967 if (aarch64_sve_cnt_immediate_p (-offset_value))
2968 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2969 -offset_value.coeffs[1], 0);
2970 }
2971
2972 int factor = offset_value.coeffs[1];
2973 if ((factor & 15) == 0)
2974 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2975 else
2976 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2977 return buffer;
2978 }
2979
2980 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2981 instruction. If it is, store the number of elements in each vector
2982 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2983 factor in *FACTOR_OUT (if nonnull). */
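/* For example, a VNx4SI constant in which every element is
   poly_int64 (4, 4) (the number of 32-bit elements in an SVE vector)
   can be added with a single INCW, whereas a duplicate of
   poly_int64 (68, 68) is rejected because it would need a multiplier
   of 17.  */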
2984
2985 bool
2986 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2987 unsigned int *nelts_per_vq_out)
2988 {
2989 rtx elt;
2990 poly_int64 value;
2991
2992 if (!const_vec_duplicate_p (x, &elt)
2993 || !poly_int_rtx_p (elt, &value))
2994 return false;
2995
2996 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2997 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2998 /* There's no vector INCB. */
2999 return false;
3000
3001 HOST_WIDE_INT factor = value.coeffs[0];
3002 if (value.coeffs[1] != factor)
3003 return false;
3004
3005 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3006 if ((factor % nelts_per_vq) != 0
3007 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3008 return false;
3009
3010 if (factor_out)
3011 *factor_out = factor;
3012 if (nelts_per_vq_out)
3013 *nelts_per_vq_out = nelts_per_vq;
3014 return true;
3015 }
3016
3017 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3018 instruction. */
3019
3020 bool
3021 aarch64_sve_inc_dec_immediate_p (rtx x)
3022 {
3023 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3024 }
3025
3026 /* Return the asm template for an SVE vector INC or DEC instruction.
3027 OPERANDS gives the operands before the vector count and X is the
3028 value of the vector count operand itself. */
3029
3030 char *
3031 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3032 {
3033 int factor;
3034 unsigned int nelts_per_vq;
3035 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3036 gcc_unreachable ();
3037 if (factor < 0)
3038 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3039 nelts_per_vq);
3040 else
3041 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3042 nelts_per_vq);
3043 }
3044
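/* Set DEST to immediate IMM of mode MODE, returning the number of
   instructions required; only emit the instructions if GENERATE is true.
   For example, 0x1234cafe00000000 takes two instructions: a move of
   0xcafe00000000 followed by a MOVK of 0x1234 into bits [48, 63].  */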
3045 static int
3046 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3047 scalar_int_mode mode)
3048 {
3049 int i;
3050 unsigned HOST_WIDE_INT val, val2, mask;
3051 int one_match, zero_match;
3052 int num_insns;
3053
3054 val = INTVAL (imm);
3055
3056 if (aarch64_move_imm (val, mode))
3057 {
3058 if (generate)
3059 emit_insn (gen_rtx_SET (dest, imm));
3060 return 1;
3061 }
3062
3063 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3064 (with XXXX non-zero). In that case check to see if the move can be done in
3065 a smaller mode. */
3066 val2 = val & 0xffffffff;
3067 if (mode == DImode
3068 && aarch64_move_imm (val2, SImode)
3069 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3070 {
3071 if (generate)
3072 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3073
3074 /* Check if we have to emit a second instruction by checking to see
3075 if any of the upper 32 bits of the original DI mode value is set. */
3076 if (val == val2)
3077 return 1;
3078
3079 i = (val >> 48) ? 48 : 32;
3080
3081 if (generate)
3082 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3083 GEN_INT ((val >> i) & 0xffff)));
3084
3085 return 2;
3086 }
3087
3088 if ((val >> 32) == 0 || mode == SImode)
3089 {
3090 if (generate)
3091 {
3092 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3093 if (mode == SImode)
3094 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3095 GEN_INT ((val >> 16) & 0xffff)));
3096 else
3097 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3098 GEN_INT ((val >> 16) & 0xffff)));
3099 }
3100 return 2;
3101 }
3102
3103 /* Remaining cases are all for DImode. */
3104
3105 mask = 0xffff;
3106 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3107 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3108 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3109 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3110
3111 if (zero_match != 2 && one_match != 2)
3112 {
3113 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3114 For a 64-bit bitmask try whether changing 16 bits to all ones or
3115 zeroes creates a valid bitmask. To check any repeated bitmask,
3116 try using 16 bits from the other 32-bit half of val. */
3117
3118 for (i = 0; i < 64; i += 16, mask <<= 16)
3119 {
3120 val2 = val & ~mask;
3121 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3122 break;
3123 val2 = val | mask;
3124 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3125 break;
3126 val2 = val2 & ~mask;
3127 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3128 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3129 break;
3130 }
3131 if (i != 64)
3132 {
3133 if (generate)
3134 {
3135 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3136 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3137 GEN_INT ((val >> i) & 0xffff)));
3138 }
3139 return 2;
3140 }
3141 }
3142
3143 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3144 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3145 otherwise skip zero bits. */
3146
3147 num_insns = 1;
3148 mask = 0xffff;
3149 val2 = one_match > zero_match ? ~val : val;
3150 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3151
3152 if (generate)
3153 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3154 ? (val | ~(mask << i))
3155 : (val & (mask << i)))));
3156 for (i += 16; i < 64; i += 16)
3157 {
3158 if ((val2 & (mask << i)) == 0)
3159 continue;
3160 if (generate)
3161 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3162 GEN_INT ((val >> i) & 0xffff)));
3163 num_insns ++;
3164 }
3165
3166 return num_insns;
3167 }
3168
3169 /* Return whether imm is a 128-bit immediate which is simple enough to
3170 expand inline. */
3171 bool
3172 aarch64_mov128_immediate (rtx imm)
3173 {
3174 if (GET_CODE (imm) == CONST_INT)
3175 return true;
3176
3177 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3178
3179 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3180 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3181
3182 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3183 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3184 }
3185
3186
3187 /* Return the number of temporary registers that aarch64_add_offset_1
3188 would need to add OFFSET to a register. */
3189
3190 static unsigned int
3191 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3192 {
3193 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3194 }
3195
3196 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3197 a non-polynomial OFFSET. MODE is the mode of the addition.
3198 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3199 be set and CFA adjustments added to the generated instructions.
3200
3201 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3202 temporary if register allocation is already complete. This temporary
3203 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3204 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3205 the immediate again.
3206
3207 Since this function may be used to adjust the stack pointer, we must
3208 ensure that it cannot cause transient stack deallocation (for example
3209 by first incrementing SP and then decrementing when adjusting by a
3210 large immediate). */
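/* For example, an OFFSET of 0x123456 is neither a valid 12-bit
   (optionally shifted) ADD immediate nor a single MOV immediate, so it
   is handled as an ADD of #0x456 followed by an ADD of #0x123000.  */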
3211
3212 static void
3213 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3214 rtx src, HOST_WIDE_INT offset, rtx temp1,
3215 bool frame_related_p, bool emit_move_imm)
3216 {
3217 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3218 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3219
3220 HOST_WIDE_INT moffset = abs_hwi (offset);
3221 rtx_insn *insn;
3222
3223 if (!moffset)
3224 {
3225 if (!rtx_equal_p (dest, src))
3226 {
3227 insn = emit_insn (gen_rtx_SET (dest, src));
3228 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3229 }
3230 return;
3231 }
3232
3233 /* Single instruction adjustment. */
3234 if (aarch64_uimm12_shift (moffset))
3235 {
3236 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3237 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3238 return;
3239 }
3240
3241 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3242 and either:
3243
3244 a) the offset cannot be loaded by a 16-bit move or
3245 b) there is no spare register into which we can move it. */
3246 if (moffset < 0x1000000
3247 && ((!temp1 && !can_create_pseudo_p ())
3248 || !aarch64_move_imm (moffset, mode)))
3249 {
3250 HOST_WIDE_INT low_off = moffset & 0xfff;
3251
3252 low_off = offset < 0 ? -low_off : low_off;
3253 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3254 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3255 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3256 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3257 return;
3258 }
3259
3260 /* Emit a move immediate if required and an addition/subtraction. */
3261 if (emit_move_imm)
3262 {
3263 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3264 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3265 }
3266 insn = emit_insn (offset < 0
3267 ? gen_sub3_insn (dest, src, temp1)
3268 : gen_add3_insn (dest, src, temp1));
3269 if (frame_related_p)
3270 {
3271 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3272 rtx adj = plus_constant (mode, src, offset);
3273 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3274 }
3275 }
3276
3277 /* Return the number of temporary registers that aarch64_add_offset
3278 would need to move OFFSET into a register or add OFFSET to a register;
3279 ADD_P is true if we want the latter rather than the former. */
3280
3281 static unsigned int
3282 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3283 {
3284 /* This follows the same structure as aarch64_add_offset. */
3285 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3286 return 0;
3287
3288 unsigned int count = 0;
3289 HOST_WIDE_INT factor = offset.coeffs[1];
3290 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3291 poly_int64 poly_offset (factor, factor);
3292 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3293 /* Need one register for the ADDVL/ADDPL result. */
3294 count += 1;
3295 else if (factor != 0)
3296 {
3297 factor = abs (factor);
3298 if (factor > 16 * (factor & -factor))
3299 /* Need one register for the CNT result and one for the multiplication
3300 factor. If necessary, the second temporary can be reused for the
3301 constant part of the offset. */
3302 return 2;
3303 /* Need one register for the CNT result (which might then
3304 be shifted). */
3305 count += 1;
3306 }
3307 return count + aarch64_add_offset_1_temporaries (constant);
3308 }
3309
3310 /* If X can be represented as a poly_int64, return the number
3311 of temporaries that are required to add it to a register.
3312 Return -1 otherwise. */
3313
3314 int
3315 aarch64_add_offset_temporaries (rtx x)
3316 {
3317 poly_int64 offset;
3318 if (!poly_int_rtx_p (x, &offset))
3319 return -1;
3320 return aarch64_offset_temporaries (true, offset);
3321 }
3322
3323 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3324 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3325 be set and CFA adjustments added to the generated instructions.
3326
3327 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3328 temporary if register allocation is already complete. This temporary
3329 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3330 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3331 false to avoid emitting the immediate again.
3332
3333 TEMP2, if nonnull, is a second temporary register that doesn't
3334 overlap either DEST or REG.
3335
3336 Since this function may be used to adjust the stack pointer, we must
3337 ensure that it cannot cause transient stack deallocation (for example
3338 by first incrementing SP and then decrementing when adjusting by a
3339 large immediate). */
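/* For example, an OFFSET of poly_int64 (24, 16), i.e. one full SVE
   vector plus 8 bytes, is added using ADDVL #1 for the vector-length
   part followed by an ADD of #8 for the constant part.  */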
3340
3341 static void
3342 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3343 poly_int64 offset, rtx temp1, rtx temp2,
3344 bool frame_related_p, bool emit_move_imm = true)
3345 {
3346 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3347 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3348 gcc_assert (temp1 == NULL_RTX
3349 || !frame_related_p
3350 || !reg_overlap_mentioned_p (temp1, dest));
3351 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3352
3353 /* Try using ADDVL or ADDPL to add the whole value. */
3354 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3355 {
3356 rtx offset_rtx = gen_int_mode (offset, mode);
3357 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3358 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3359 return;
3360 }
3361
3362 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3363 SVE vector register, over and above the minimum size of 128 bits.
3364 This is equivalent to half the value returned by CNTD with a
3365 vector shape of ALL. */
3366 HOST_WIDE_INT factor = offset.coeffs[1];
3367 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3368
3369 /* Try using ADDVL or ADDPL to add the VG-based part. */
3370 poly_int64 poly_offset (factor, factor);
3371 if (src != const0_rtx
3372 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3373 {
3374 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3375 if (frame_related_p)
3376 {
3377 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3378 RTX_FRAME_RELATED_P (insn) = true;
3379 src = dest;
3380 }
3381 else
3382 {
3383 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3384 src = aarch64_force_temporary (mode, temp1, addr);
3385 temp1 = temp2;
3386 temp2 = NULL_RTX;
3387 }
3388 }
3389 /* Otherwise use a CNT-based sequence. */
3390 else if (factor != 0)
3391 {
3392 /* Use a subtraction if we have a negative factor. */
3393 rtx_code code = PLUS;
3394 if (factor < 0)
3395 {
3396 factor = -factor;
3397 code = MINUS;
3398 }
3399
3400 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3401 into the multiplication. */
3402 rtx val;
3403 int shift = 0;
3404 if (factor & 1)
3405 /* Use a right shift by 1. */
3406 shift = -1;
3407 else
3408 factor /= 2;
3409 HOST_WIDE_INT low_bit = factor & -factor;
3410 if (factor <= 16 * low_bit)
3411 {
3412 if (factor > 16 * 8)
3413 {
3414 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3415 the value with the minimum multiplier and shift it into
3416 position. */
3417 int extra_shift = exact_log2 (low_bit);
3418 shift += extra_shift;
3419 factor >>= extra_shift;
3420 }
3421 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3422 }
3423 else
3424 {
3425 /* Use CNTD, then multiply it by FACTOR. */
3426 val = gen_int_mode (poly_int64 (2, 2), mode);
3427 val = aarch64_force_temporary (mode, temp1, val);
3428
3429 /* Go back to using a negative multiplication factor if we have
3430 no register from which to subtract. */
3431 if (code == MINUS && src == const0_rtx)
3432 {
3433 factor = -factor;
3434 code = PLUS;
3435 }
3436 rtx coeff1 = gen_int_mode (factor, mode);
3437 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3438 val = gen_rtx_MULT (mode, val, coeff1);
3439 }
3440
3441 if (shift > 0)
3442 {
3443 /* Multiply by 1 << SHIFT. */
3444 val = aarch64_force_temporary (mode, temp1, val);
3445 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3446 }
3447 else if (shift == -1)
3448 {
3449 /* Divide by 2. */
3450 val = aarch64_force_temporary (mode, temp1, val);
3451 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3452 }
3453
3454 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3455 if (src != const0_rtx)
3456 {
3457 val = aarch64_force_temporary (mode, temp1, val);
3458 val = gen_rtx_fmt_ee (code, mode, src, val);
3459 }
3460 else if (code == MINUS)
3461 {
3462 val = aarch64_force_temporary (mode, temp1, val);
3463 val = gen_rtx_NEG (mode, val);
3464 }
3465
3466 if (constant == 0 || frame_related_p)
3467 {
3468 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3469 if (frame_related_p)
3470 {
3471 RTX_FRAME_RELATED_P (insn) = true;
3472 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3473 gen_rtx_SET (dest, plus_constant (Pmode, src,
3474 poly_offset)));
3475 }
3476 src = dest;
3477 if (constant == 0)
3478 return;
3479 }
3480 else
3481 {
3482 src = aarch64_force_temporary (mode, temp1, val);
3483 temp1 = temp2;
3484 temp2 = NULL_RTX;
3485 }
3486
3487 emit_move_imm = true;
3488 }
3489
3490 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3491 frame_related_p, emit_move_imm);
3492 }
3493
3494 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3495 than a poly_int64. */
3496
3497 void
3498 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3499 rtx offset_rtx, rtx temp1, rtx temp2)
3500 {
3501 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3502 temp1, temp2, false);
3503 }
3504
3505 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3506 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3507 if TEMP1 already contains abs (DELTA). */
3508
3509 static inline void
3510 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3511 {
3512 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3513 temp1, temp2, true, emit_move_imm);
3514 }
3515
3516 /* Subtract DELTA from the stack pointer, marking the instructions
3517 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3518 if nonnull. */
3519
3520 static inline void
3521 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3522 bool emit_move_imm = true)
3523 {
3524 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3525 temp1, temp2, frame_related_p, emit_move_imm);
3526 }
3527
3528 /* Set DEST to (vec_series BASE STEP). */
3529
3530 static void
3531 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3532 {
3533 machine_mode mode = GET_MODE (dest);
3534 scalar_mode inner = GET_MODE_INNER (mode);
3535
3536 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3537 if (!aarch64_sve_index_immediate_p (base))
3538 base = force_reg (inner, base);
3539 if (!aarch64_sve_index_immediate_p (step))
3540 step = force_reg (inner, step);
3541
3542 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3543 }
3544
3545 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3546 register of mode MODE. Use TARGET for the result if it's nonnull
3547 and convenient.
3548
3549 The two vector modes must have the same element mode. The behavior
3550 is to duplicate architectural lane N of SRC into architectural lanes
3551 N + I * STEP of the result. On big-endian targets, architectural
3552 lane 0 of an Advanced SIMD vector is the last element of the vector
3553 in memory layout, so for big-endian targets this operation has the
3554 effect of reversing SRC before duplicating it. Callers need to
3555 account for this. */
3556
3557 rtx
3558 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3559 {
3560 machine_mode src_mode = GET_MODE (src);
3561 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3562 insn_code icode = (BYTES_BIG_ENDIAN
3563 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3564 : code_for_aarch64_vec_duplicate_vq_le (mode));
3565
3566 unsigned int i = 0;
3567 expand_operand ops[3];
3568 create_output_operand (&ops[i++], target, mode);
3569 create_output_operand (&ops[i++], src, src_mode);
3570 if (BYTES_BIG_ENDIAN)
3571 {
3572 /* Create a PARALLEL describing the reversal of SRC. */
3573 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3574 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3575 nelts_per_vq - 1, -1);
3576 create_fixed_operand (&ops[i++], sel);
3577 }
3578 expand_insn (icode, i, ops);
3579 return ops[0].value;
3580 }
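/* For illustration (register numbers are arbitrary): on little-endian
   targets the expansion above typically reduces to a single quadword
   duplicate:

       dup     z0.q, z1.q[0]

   while on big-endian targets the _be pattern also applies the lane
   reversal described in the comment above.  */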
3581
3582 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3583 the memory image into DEST. Return true on success. */
3584
3585 static bool
3586 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3587 {
3588 src = force_const_mem (GET_MODE (src), src);
3589 if (!src)
3590 return false;
3591
3592 /* Make sure that the address is legitimate. */
3593 if (!aarch64_sve_ld1rq_operand_p (src))
3594 {
3595 rtx addr = force_reg (Pmode, XEXP (src, 0));
3596 src = replace_equiv_address (src, addr);
3597 }
3598
3599 machine_mode mode = GET_MODE (dest);
3600 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3601 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3602 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3603 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3604 return true;
3605 }
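/* For illustration (register numbers are arbitrary): for .S elements the
   load emitted above is a replicating quadword load such as

       ld1rqw  z0.s, p0/z, [x0]

   which broadcasts the 128-bit memory value to every quadword of z0.  */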
3606
3607 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3608 SVE data mode and isn't a legitimate constant. Use TARGET for the
3609 result if convenient.
3610
3611 The returned register can have whatever mode seems most natural
3612 given the contents of SRC. */
3613
3614 static rtx
3615 aarch64_expand_sve_const_vector (rtx target, rtx src)
3616 {
3617 machine_mode mode = GET_MODE (src);
3618 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3619 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3620 scalar_mode elt_mode = GET_MODE_INNER (mode);
3621 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3622 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3623
3624 if (nelts_per_pattern == 1 && encoded_bits == 128)
3625 {
3626 /* The constant is a duplicated quadword but can't be narrowed
3627 beyond a quadword. Get the memory image of the first quadword
3628 as a 128-bit vector and try using LD1RQ to load it from memory.
3629
3630 The effect for both endiannesses is to load memory lane N into
3631 architectural lanes N + I * STEP of the result. On big-endian
3632 targets, the layout of the 128-bit vector in an Advanced SIMD
3633 register would be different from its layout in an SVE register,
3634 but this 128-bit vector is a memory value only. */
3635 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3636 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3637 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3638 return target;
3639 }
3640
3641 if (nelts_per_pattern == 1 && encoded_bits < 128)
3642 {
3643 /* The vector is a repeating sequence of 64 bits or fewer.
3644 See if we can load them using an Advanced SIMD move and then
3645 duplicate it to fill a vector. This is better than using a GPR
3646 move because it keeps everything in the same register file. */
3647 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3648 rtx_vector_builder builder (vq_mode, npatterns, 1);
3649 for (unsigned int i = 0; i < npatterns; ++i)
3650 {
3651 /* We want memory lane N to go into architectural lane N,
3652 so reverse for big-endian targets. The DUP .Q pattern
3653 has a compensating reverse built-in. */
3654 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3655 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3656 }
3657 rtx vq_src = builder.build ();
3658 if (aarch64_simd_valid_immediate (vq_src, NULL))
3659 {
3660 vq_src = force_reg (vq_mode, vq_src);
3661 return aarch64_expand_sve_dupq (target, mode, vq_src);
3662 }
3663
3664 /* Get an integer representation of the repeating part of Advanced
3665 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3666 which for big-endian targets is lane-swapped wrt a normal
3667 Advanced SIMD vector. This means that for both endiannesses,
3668 memory lane N of SVE vector SRC corresponds to architectural
3669 lane N of a register holding VQ_SRC. This in turn means that
3670 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3671 as a single 128-bit value) and thus that memory lane 0 of SRC is
3672 in the lsb of the integer. Duplicating the integer therefore
3673 ensures that memory lane N of SRC goes into architectural lane
3674 N + I * INDEX of the SVE register. */
3675 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3676 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3677 if (elt_value)
3678 {
3679 /* Pretend that we had a vector of INT_MODE to start with. */
3680 elt_mode = int_mode;
3681 mode = aarch64_full_sve_mode (int_mode).require ();
3682
3683 /* If the integer can be moved into a general register by a
3684 single instruction, do that and duplicate the result. */
3685 if (CONST_INT_P (elt_value)
3686 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3687 {
3688 elt_value = force_reg (elt_mode, elt_value);
3689 return expand_vector_broadcast (mode, elt_value);
3690 }
3691 }
3692 else if (npatterns == 1)
3693 /* We're duplicating a single value, but can't do better than
3694 force it to memory and load from there. This handles things
3695 like symbolic constants. */
3696 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3697
3698 if (elt_value)
3699 {
3700 /* Load the element from memory if we can, otherwise move it into
3701 a register and use a DUP. */
3702 rtx op = force_const_mem (elt_mode, elt_value);
3703 if (!op)
3704 op = force_reg (elt_mode, elt_value);
3705 return expand_vector_broadcast (mode, op);
3706 }
3707 }
3708
3709 /* Try using INDEX. */
3710 rtx base, step;
3711 if (const_vec_series_p (src, &base, &step))
3712 {
3713 aarch64_expand_vec_series (target, base, step);
3714 return target;
3715 }
3716
3717 /* From here on, it's better to force the whole constant to memory
3718 if we can. */
3719 if (GET_MODE_NUNITS (mode).is_constant ())
3720 return NULL_RTX;
3721
3722 /* Expand each pattern individually. */
3723 gcc_assert (npatterns > 1);
3724 rtx_vector_builder builder;
3725 auto_vec<rtx, 16> vectors (npatterns);
3726 for (unsigned int i = 0; i < npatterns; ++i)
3727 {
3728 builder.new_vector (mode, 1, nelts_per_pattern);
3729 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3730 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3731 vectors.quick_push (force_reg (mode, builder.build ()));
3732 }
3733
3734 /* Use permutes to interleave the separate vectors. */
3735 while (npatterns > 1)
3736 {
3737 npatterns /= 2;
3738 for (unsigned int i = 0; i < npatterns; ++i)
3739 {
3740 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3741 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3742 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3743 vectors[i] = tmp;
3744 }
3745 }
3746 gcc_assert (vectors[0] == target);
3747 return target;
3748 }
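/* For illustration: with NPATTERNS == 4 the interleaving loop above first
   forms ZIP1 (V0, V2) and ZIP1 (V1, V3), then zips those two results
   together, so that pattern I ends up supplying elements I, I + 4,
   I + 8, ... of TARGET, as the encoding requires.  */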
3749
3750 /* Use WHILE to set a predicate register of mode MODE in which the first
3751 VL bits are set and the rest are clear. Use TARGET for the register
3752 if it's nonnull and convenient. */
3753
3754 static rtx
3755 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3756 unsigned int vl)
3757 {
3758 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3759 target = aarch64_target_reg (target, mode);
3760 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3761 return target;
3762 }
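/* For illustration (register numbers are arbitrary): for a .B predicate
   and VL == 3 the sequence above is roughly

       mov     x0, 3
       whilelo p0.b, xzr, x0

   which sets the first three predicate lanes and clears the rest.  */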
3763
3764 static rtx
3765 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3766
3767 /* BUILDER is a constant predicate in which the index of every set bit
3768 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3769 by inverting every element at a multiple of ELT_SIZE and EORing the
3770 result with an ELT_SIZE PTRUE.
3771
3772 Return a register that contains the constant on success, otherwise
3773 return null. Use TARGET as the register if it is nonnull and
3774 convenient. */
3775
3776 static rtx
3777 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3778 unsigned int elt_size)
3779 {
3780 /* Invert every element at a multiple of ELT_SIZE, keeping the
3781 other bits zero. */
3782 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3783 builder.nelts_per_pattern ());
3784 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3785 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3786 inv_builder.quick_push (const1_rtx);
3787 else
3788 inv_builder.quick_push (const0_rtx);
3789 inv_builder.finalize ();
3790
3791 /* See if we can load the constant cheaply. */
3792 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3793 if (!inv)
3794 return NULL_RTX;
3795
3796 /* EOR the result with an ELT_SIZE PTRUE. */
3797 rtx mask = aarch64_ptrue_all (elt_size);
3798 mask = force_reg (VNx16BImode, mask);
3799 target = aarch64_target_reg (target, VNx16BImode);
3800 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3801 return target;
3802 }
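/* For illustration, with ELT_SIZE == 2 (.H elements shown as bytes):

     constant:        { 1,0, 0,0, 1,0, 1,0, ... }
     inverted (INV):  { 0,0, 1,0, 0,0, 0,0, ... }
     PTRUE .H mask:   { 1,0, 1,0, 1,0, 1,0, ... }

   EORing INV with the mask restores the original constant, and INV may be
   cheaper to construct (it may itself be a PTRUE pattern, for example).  */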
3803
3804 /* BUILDER is a constant predicate in which the index of every set bit
3805 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3806 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3807 register on success, otherwise return null. Use TARGET as the register
3808 if nonnull and convenient. */
3809
3810 static rtx
3811 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3812 unsigned int elt_size,
3813 unsigned int permute_size)
3814 {
3815 /* We're going to split the constant into two new constants A and B,
3816 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3817 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3818
3819 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3820 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3821
3822 where _ indicates elements that will be discarded by the permute.
3823
3824 First calculate the ELT_SIZEs for A and B. */
3825 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3826 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3827 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3828 if (INTVAL (builder.elt (i)) != 0)
3829 {
3830 if (i & permute_size)
3831 b_elt_size |= i - permute_size;
3832 else
3833 a_elt_size |= i;
3834 }
3835 a_elt_size &= -a_elt_size;
3836 b_elt_size &= -b_elt_size;
3837
3838 /* Now construct the vectors themselves. */
3839 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3840 builder.nelts_per_pattern ());
3841 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3842 builder.nelts_per_pattern ());
3843 unsigned int nelts = builder.encoded_nelts ();
3844 for (unsigned int i = 0; i < nelts; ++i)
3845 if (i & (elt_size - 1))
3846 {
3847 a_builder.quick_push (const0_rtx);
3848 b_builder.quick_push (const0_rtx);
3849 }
3850 else if ((i & permute_size) == 0)
3851 {
3852 /* The A and B elements are significant. */
3853 a_builder.quick_push (builder.elt (i));
3854 b_builder.quick_push (builder.elt (i + permute_size));
3855 }
3856 else
3857 {
3858 /* The A and B elements are going to be discarded, so pick whatever
3859 is likely to give a nice constant. We are targeting element
3860 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3861 with the aim of each being a sequence of ones followed by
3862 a sequence of zeros. So:
3863
3864 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3865 duplicate the last X_ELT_SIZE element, to extend the
3866 current sequence of ones or zeros.
3867
3868 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3869 zero, so that the constant really does have X_ELT_SIZE and
3870 not a smaller size. */
3871 if (a_elt_size > permute_size)
3872 a_builder.quick_push (const0_rtx);
3873 else
3874 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3875 if (b_elt_size > permute_size)
3876 b_builder.quick_push (const0_rtx);
3877 else
3878 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3879 }
3880 a_builder.finalize ();
3881 b_builder.finalize ();
3882
3883 /* Try loading A into a register. */
3884 rtx_insn *last = get_last_insn ();
3885 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3886 if (!a)
3887 return NULL_RTX;
3888
3889 /* Try loading B into a register. */
3890 rtx b = a;
3891 if (a_builder != b_builder)
3892 {
3893 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3894 if (!b)
3895 {
3896 delete_insns_since (last);
3897 return NULL_RTX;
3898 }
3899 }
3900
3901 /* Emit the TRN1 itself. */
3902 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3903 target = aarch64_target_reg (target, mode);
3904 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3905 gen_lowpart (mode, a),
3906 gen_lowpart (mode, b)));
3907 return target;
3908 }
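/* For illustration (register numbers are arbitrary): with ELT_SIZE == 1
   and PERMUTE_SIZE == 4 the final instruction above is a predicate permute
   such as

       trn1    p0.s, p1.s, p2.s

   which puts the even-numbered .S groups of A into the even result
   positions and the even-numbered .S groups of B into the odd positions,
   reassembling the original constant from the two simpler halves.  */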
3909
3910 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3911 constant in BUILDER into an SVE predicate register. Return the register
3912 on success, otherwise return null. Use TARGET for the register if
3913 nonnull and convenient.
3914
3915 ALLOW_RECURSE_P is true if we can use methods that would call this
3916 function recursively. */
3917
3918 static rtx
3919 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3920 bool allow_recurse_p)
3921 {
3922 if (builder.encoded_nelts () == 1)
3923 /* A PFALSE or a PTRUE .B ALL. */
3924 return aarch64_emit_set_immediate (target, builder);
3925
3926 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3927 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3928 {
3929 /* If we can load the constant using PTRUE, use it as-is. */
3930 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3931 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3932 return aarch64_emit_set_immediate (target, builder);
3933
3934 /* Otherwise use WHILE to set the first VL bits. */
3935 return aarch64_sve_move_pred_via_while (target, mode, vl);
3936 }
3937
3938 if (!allow_recurse_p)
3939 return NULL_RTX;
3940
3941 /* Try inverting the vector in element size ELT_SIZE and then EORing
3942 the result with an ELT_SIZE PTRUE. */
3943 if (INTVAL (builder.elt (0)) == 0)
3944 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3945 elt_size))
3946 return res;
3947
3948 /* Try using TRN1 to permute two simpler constants. */
3949 for (unsigned int i = elt_size; i <= 8; i *= 2)
3950 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3951 elt_size, i))
3952 return res;
3953
3954 return NULL_RTX;
3955 }
3956
3957 /* Return an SVE predicate register that contains the VNx16BImode
3958 constant in BUILDER, without going through the move expanders.
3959
3960 The returned register can have whatever mode seems most natural
3961 given the contents of BUILDER. Use TARGET for the result if
3962 convenient. */
3963
3964 static rtx
3965 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3966 {
3967 /* Try loading the constant using pure predicate operations. */
3968 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3969 return res;
3970
3971 /* Try forcing the constant to memory. */
3972 if (builder.full_nelts ().is_constant ())
3973 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3974 {
3975 target = aarch64_target_reg (target, VNx16BImode);
3976 emit_move_insn (target, mem);
3977 return target;
3978 }
3979
3980 /* The last resort is to load the constant as an integer and then
3981 compare it against zero. Use -1 for set bits in order to increase
3982 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3983 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3984 builder.nelts_per_pattern ());
3985 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3986 int_builder.quick_push (INTVAL (builder.elt (i))
3987 ? constm1_rtx : const0_rtx);
3988 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3989 int_builder.build ());
3990 }
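/* For illustration (register numbers are arbitrary): the data-to-predicate
   fallback above typically ends up as a vector constant move followed by a
   predicated compare with zero, e.g.

       ptrue   p1.b
       cmpne   p0.b, p1/z, z0.b, #0

   Using -1 rather than 1 for the set bits makes it more likely that the
   vector constant can be built with SVE DUPM or an Advanced SIMD byte
   mask.  */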
3991
3992 /* Set DEST to immediate IMM. */
3993
3994 void
3995 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3996 {
3997 machine_mode mode = GET_MODE (dest);
3998
3999 /* Check on what type of symbol it is. */
4000 scalar_int_mode int_mode;
4001 if ((GET_CODE (imm) == SYMBOL_REF
4002 || GET_CODE (imm) == LABEL_REF
4003 || GET_CODE (imm) == CONST
4004 || GET_CODE (imm) == CONST_POLY_INT)
4005 && is_a <scalar_int_mode> (mode, &int_mode))
4006 {
4007 rtx mem;
4008 poly_int64 offset;
4009 HOST_WIDE_INT const_offset;
4010 enum aarch64_symbol_type sty;
4011
4012 /* If we have (const (plus symbol offset)), separate out the offset
4013 before we start classifying the symbol. */
4014 rtx base = strip_offset (imm, &offset);
4015
4016 /* We must always add an offset involving VL separately, rather than
4017 folding it into the relocation. */
4018 if (!offset.is_constant (&const_offset))
4019 {
4020 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4021 emit_insn (gen_rtx_SET (dest, imm));
4022 else
4023 {
4024 /* Do arithmetic on 32-bit values if the result is smaller
4025 than that. */
4026 if (partial_subreg_p (int_mode, SImode))
4027 {
4028 /* It is invalid to do symbol calculations in modes
4029 narrower than SImode. */
4030 gcc_assert (base == const0_rtx);
4031 dest = gen_lowpart (SImode, dest);
4032 int_mode = SImode;
4033 }
4034 if (base != const0_rtx)
4035 {
4036 base = aarch64_force_temporary (int_mode, dest, base);
4037 aarch64_add_offset (int_mode, dest, base, offset,
4038 NULL_RTX, NULL_RTX, false);
4039 }
4040 else
4041 aarch64_add_offset (int_mode, dest, base, offset,
4042 dest, NULL_RTX, false);
4043 }
4044 return;
4045 }
4046
4047 sty = aarch64_classify_symbol (base, const_offset);
4048 switch (sty)
4049 {
4050 case SYMBOL_FORCE_TO_MEM:
4051 if (const_offset != 0
4052 && targetm.cannot_force_const_mem (int_mode, imm))
4053 {
4054 gcc_assert (can_create_pseudo_p ());
4055 base = aarch64_force_temporary (int_mode, dest, base);
4056 aarch64_add_offset (int_mode, dest, base, const_offset,
4057 NULL_RTX, NULL_RTX, false);
4058 return;
4059 }
4060
4061 mem = force_const_mem (ptr_mode, imm);
4062 gcc_assert (mem);
4063
4064 /* If we aren't generating PC relative literals, then
4065 we need to expand the literal pool access carefully.
4066 This is something that needs to be done in a number
4067 of places, so could well live as a separate function. */
4068 if (!aarch64_pcrelative_literal_loads)
4069 {
4070 gcc_assert (can_create_pseudo_p ());
4071 base = gen_reg_rtx (ptr_mode);
4072 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4073 if (ptr_mode != Pmode)
4074 base = convert_memory_address (Pmode, base);
4075 mem = gen_rtx_MEM (ptr_mode, base);
4076 }
4077
4078 if (int_mode != ptr_mode)
4079 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4080
4081 emit_insn (gen_rtx_SET (dest, mem));
4082
4083 return;
4084
4085 case SYMBOL_SMALL_TLSGD:
4086 case SYMBOL_SMALL_TLSDESC:
4087 case SYMBOL_SMALL_TLSIE:
4088 case SYMBOL_SMALL_GOT_28K:
4089 case SYMBOL_SMALL_GOT_4G:
4090 case SYMBOL_TINY_GOT:
4091 case SYMBOL_TINY_TLSIE:
4092 if (const_offset != 0)
4093 {
4094 gcc_assert (can_create_pseudo_p ());
4095 base = aarch64_force_temporary (int_mode, dest, base);
4096 aarch64_add_offset (int_mode, dest, base, const_offset,
4097 NULL_RTX, NULL_RTX, false);
4098 return;
4099 }
4100 /* FALLTHRU */
4101
4102 case SYMBOL_SMALL_ABSOLUTE:
4103 case SYMBOL_TINY_ABSOLUTE:
4104 case SYMBOL_TLSLE12:
4105 case SYMBOL_TLSLE24:
4106 case SYMBOL_TLSLE32:
4107 case SYMBOL_TLSLE48:
4108 aarch64_load_symref_appropriately (dest, imm, sty);
4109 return;
4110
4111 default:
4112 gcc_unreachable ();
4113 }
4114 }
4115
4116 if (!CONST_INT_P (imm))
4117 {
4118 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4119 {
4120 /* Only the low bit of each .H, .S and .D element is defined,
4121 so we can set the upper bits to whatever we like. If the
4122 predicate is all-true in MODE, prefer to set all the undefined
4123 bits as well, so that we can share a single .B predicate for
4124 all modes. */
4125 if (imm == CONSTM1_RTX (mode))
4126 imm = CONSTM1_RTX (VNx16BImode);
4127
4128 /* All methods for constructing predicate modes wider than VNx16BI
4129 will set the upper bits of each element to zero. Expose this
4130 by moving such constants as a VNx16BI, so that all bits are
4131 significant and so that constants for different modes can be
4132 shared. The wider constant will still be available as a
4133 REG_EQUAL note. */
4134 rtx_vector_builder builder;
4135 if (aarch64_get_sve_pred_bits (builder, imm))
4136 {
4137 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4138 if (dest != res)
4139 emit_move_insn (dest, gen_lowpart (mode, res));
4140 return;
4141 }
4142 }
4143
4144 if (GET_CODE (imm) == HIGH
4145 || aarch64_simd_valid_immediate (imm, NULL))
4146 {
4147 emit_insn (gen_rtx_SET (dest, imm));
4148 return;
4149 }
4150
4151 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4152 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4153 {
4154 if (dest != res)
4155 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4156 return;
4157 }
4158
4159 rtx mem = force_const_mem (mode, imm);
4160 gcc_assert (mem);
4161 emit_move_insn (dest, mem);
4162 return;
4163 }
4164
4165 aarch64_internal_mov_immediate (dest, imm, true,
4166 as_a <scalar_int_mode> (mode));
4167 }
4168
4169 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4170 that is known to contain PTRUE. */
4171
4172 void
4173 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4174 {
4175 expand_operand ops[3];
4176 machine_mode mode = GET_MODE (dest);
4177 create_output_operand (&ops[0], dest, mode);
4178 create_input_operand (&ops[1], pred, GET_MODE (pred));
4179 create_input_operand (&ops[2], src, mode);
4180 temporary_volatile_ok v (true);
4181 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4182 }
4183
4184 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4185 operand is in memory. In this case we need to use the predicated LD1
4186 and ST1 instead of LDR and STR, both for correctness on big-endian
4187 targets and because LD1 and ST1 support a wider range of addressing modes.
4188 PRED_MODE is the mode of the predicate.
4189
4190 See the comment at the head of aarch64-sve.md for details about the
4191 big-endian handling. */
4192
4193 void
4194 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4195 {
4196 machine_mode mode = GET_MODE (dest);
4197 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4198 if (!register_operand (src, mode)
4199 && !register_operand (dest, mode))
4200 {
4201 rtx tmp = gen_reg_rtx (mode);
4202 if (MEM_P (src))
4203 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4204 else
4205 emit_move_insn (tmp, src);
4206 src = tmp;
4207 }
4208 aarch64_emit_sve_pred_move (dest, ptrue, src);
4209 }
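/* For illustration (register numbers are arbitrary): a memory-to-memory
   SVE copy of .B data expands under the function above to roughly

       ptrue   p0.b
       ld1b    z0.b, p0/z, [x0]
       st1b    z0.b, p0, [x1]

   rather than an LDR/STR pair.  */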
4210
4211 /* Called only on big-endian targets. See whether an SVE vector move
4212 from SRC to DEST is effectively a REV[BHW] instruction, because at
4213 least one operand is a subreg of an SVE vector that has wider or
4214 narrower elements. Return true and emit the instruction if so.
4215
4216 For example:
4217
4218 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4219
4220 represents a VIEW_CONVERT between the following vectors, viewed
4221 in memory order:
4222
4223 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4224 R1: { [0], [1], [2], [3], ... }
4225
4226 The high part of lane X in R2 should therefore correspond to lane X*2
4227 of R1, but the register representations are:
4228
4229 msb lsb
4230 R2: ...... [1].high [1].low [0].high [0].low
4231 R1: ...... [3] [2] [1] [0]
4232
4233 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4234 We therefore need a reverse operation to swap the high and low values
4235 around.
4236
4237 This is purely an optimization. Without it we would spill the
4238 subreg operand to the stack in one mode and reload it in the
4239 other mode, which has the same effect as the REV. */
4240
4241 bool
4242 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4243 {
4244 gcc_assert (BYTES_BIG_ENDIAN);
4245 if (GET_CODE (dest) == SUBREG)
4246 dest = SUBREG_REG (dest);
4247 if (GET_CODE (src) == SUBREG)
4248 src = SUBREG_REG (src);
4249
4250 /* The optimization handles two single SVE REGs with different element
4251 sizes. */
4252 if (!REG_P (dest)
4253 || !REG_P (src)
4254 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4255 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4256 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4257 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4258 return false;
4259
4260 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4261 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4262 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4263 UNSPEC_REV_SUBREG);
4264 emit_insn (gen_rtx_SET (dest, unspec));
4265 return true;
4266 }
4267
4268 /* Return a copy of X with mode MODE, without changing its other
4269 attributes. Unlike gen_lowpart, this doesn't care whether the
4270 mode change is valid. */
4271
4272 static rtx
4273 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4274 {
4275 if (GET_MODE (x) == mode)
4276 return x;
4277
4278 x = shallow_copy_rtx (x);
4279 set_mode_and_regno (x, mode, REGNO (x));
4280 return x;
4281 }
4282
4283 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4284 operands. */
4285
4286 void
4287 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4288 {
4289 /* Decide which REV operation we need. The mode with narrower elements
4290 determines the mode of the operands and the mode with the wider
4291 elements determines the reverse width. */
4292 machine_mode mode_with_wider_elts = GET_MODE (dest);
4293 machine_mode mode_with_narrower_elts = GET_MODE (src);
4294 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4295 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4296 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4297
4298 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4299 unsigned int unspec;
4300 if (wider_bytes == 8)
4301 unspec = UNSPEC_REV64;
4302 else if (wider_bytes == 4)
4303 unspec = UNSPEC_REV32;
4304 else if (wider_bytes == 2)
4305 unspec = UNSPEC_REV16;
4306 else
4307 gcc_unreachable ();
4308 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4309
4310 /* Emit:
4311
4312 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] UNSPEC_PRED_X))
4313
4314 with the appropriate modes. */
4315 ptrue = gen_lowpart (pred_mode, ptrue);
4316 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4317 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4318 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4319 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4320 UNSPEC_PRED_X);
4321 emit_insn (gen_rtx_SET (dest, src));
4322 }
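/* For illustration (register numbers are arbitrary): for a subreg between
   VNx8HI and VNx16QI the wider element size is 2 bytes, so the split above
   emits a byte reversal within each halfword, e.g.

       revb    z0.h, p0/m, z1.h  */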
4323
4324 static bool
4325 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4326 tree exp ATTRIBUTE_UNUSED)
4327 {
4328 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4329 return false;
4330
4331 return true;
4332 }
4333
4334 /* Implement TARGET_PASS_BY_REFERENCE. */
4335
4336 static bool
4337 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4338 machine_mode mode,
4339 const_tree type,
4340 bool named ATTRIBUTE_UNUSED)
4341 {
4342 HOST_WIDE_INT size;
4343 machine_mode dummymode;
4344 int nregs;
4345
4346 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4347 if (mode == BLKmode && type)
4348 size = int_size_in_bytes (type);
4349 else
4350 /* No frontends can create types with variable-sized modes, so we
4351 shouldn't be asked to pass or return them. */
4352 size = GET_MODE_SIZE (mode).to_constant ();
4353
4354 /* Aggregates are passed by reference based on their size. */
4355 if (type && AGGREGATE_TYPE_P (type))
4356 {
4357 size = int_size_in_bytes (type);
4358 }
4359
4360 /* Variable sized arguments are always returned by reference. */
4361 if (size < 0)
4362 return true;
4363
4364 /* Can this be a candidate to be passed in fp/simd register(s)? */
4365 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4366 &dummymode, &nregs,
4367 NULL))
4368 return false;
4369
4370 /* Arguments which are variable sized or larger than 2 registers are
4371 passed by reference unless they are homogeneous floating-point
4372 aggregates. */
4373 return size > 2 * UNITS_PER_WORD;
4374 }
4375
4376 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4377 static bool
4378 aarch64_return_in_msb (const_tree valtype)
4379 {
4380 machine_mode dummy_mode;
4381 int dummy_int;
4382
4383 /* Never happens in little-endian mode. */
4384 if (!BYTES_BIG_ENDIAN)
4385 return false;
4386
4387 /* Only composite types smaller than or equal to 16 bytes can
4388 be potentially returned in registers. */
4389 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4390 || int_size_in_bytes (valtype) <= 0
4391 || int_size_in_bytes (valtype) > 16)
4392 return false;
4393
4394 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4395 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4396 is always passed/returned in the least significant bits of fp/simd
4397 register(s). */
4398 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4399 &dummy_mode, &dummy_int, NULL))
4400 return false;
4401
4402 return true;
4403 }
4404
4405 /* Implement TARGET_FUNCTION_VALUE.
4406 Define how to find the value returned by a function. */
4407
4408 static rtx
4409 aarch64_function_value (const_tree type, const_tree func,
4410 bool outgoing ATTRIBUTE_UNUSED)
4411 {
4412 machine_mode mode;
4413 int unsignedp;
4414 int count;
4415 machine_mode ag_mode;
4416
4417 mode = TYPE_MODE (type);
4418 if (INTEGRAL_TYPE_P (type))
4419 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4420
4421 if (aarch64_return_in_msb (type))
4422 {
4423 HOST_WIDE_INT size = int_size_in_bytes (type);
4424
4425 if (size % UNITS_PER_WORD != 0)
4426 {
4427 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4428 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4429 }
4430 }
4431
4432 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4433 &ag_mode, &count, NULL))
4434 {
4435 if (!aarch64_composite_type_p (type, mode))
4436 {
4437 gcc_assert (count == 1 && mode == ag_mode);
4438 return gen_rtx_REG (mode, V0_REGNUM);
4439 }
4440 else
4441 {
4442 int i;
4443 rtx par;
4444
4445 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4446 for (i = 0; i < count; i++)
4447 {
4448 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4449 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4450 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4451 XVECEXP (par, 0, i) = tmp;
4452 }
4453 return par;
4454 }
4455 }
4456 else
4457 return gen_rtx_REG (mode, R0_REGNUM);
4458 }
4459
4460 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4461 Return true if REGNO is the number of a hard register in which the values
4462 of called function may come back. */
4463
4464 static bool
4465 aarch64_function_value_regno_p (const unsigned int regno)
4466 {
4467 /* A maximum of 16 bytes can be returned in the general registers. Examples
4468 of 16-byte return values are: 128-bit integers and 16-byte small
4469 structures (excluding homogeneous floating-point aggregates). */
4470 if (regno == R0_REGNUM || regno == R1_REGNUM)
4471 return true;
4472
4473 /* Up to four fp/simd registers can return a function value, e.g. a
4474 homogeneous floating-point aggregate having four members. */
4475 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4476 return TARGET_FLOAT;
4477
4478 return false;
4479 }
4480
4481 /* Implement TARGET_RETURN_IN_MEMORY.
4482
4483 If the type T of the result of a function is such that
4484 void func (T arg)
4485 would require that arg be passed as a value in a register (or set of
4486 registers) according to the parameter passing rules, then the result
4487 is returned in the same registers as would be used for such an
4488 argument. */
4489
4490 static bool
4491 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4492 {
4493 HOST_WIDE_INT size;
4494 machine_mode ag_mode;
4495 int count;
4496
4497 if (!AGGREGATE_TYPE_P (type)
4498 && TREE_CODE (type) != COMPLEX_TYPE
4499 && TREE_CODE (type) != VECTOR_TYPE)
4500 /* Simple scalar types are always returned in registers. */
4501 return false;
4502
4503 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4504 type,
4505 &ag_mode,
4506 &count,
4507 NULL))
4508 return false;
4509
4510 /* Types larger than 2 registers are returned in memory. */
4511 size = int_size_in_bytes (type);
4512 return (size < 0 || size > 2 * UNITS_PER_WORD);
4513 }
4514
4515 static bool
4516 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4517 const_tree type, int *nregs)
4518 {
4519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4520 return aarch64_vfp_is_call_or_return_candidate (mode,
4521 type,
4522 &pcum->aapcs_vfp_rmode,
4523 nregs,
4524 NULL);
4525 }
4526
4527 /* Given MODE and TYPE of a function argument, return the alignment in
4528 bits. The idea is to suppress any stronger alignment requested by
4529 the user and opt for the natural alignment (specified in AAPCS64 \S
4530 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4531 calculated in versions of GCC prior to GCC-9. This is a helper
4532 function for local use only. */
4533
4534 static unsigned int
4535 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4536 bool *abi_break)
4537 {
4538 *abi_break = false;
4539 if (!type)
4540 return GET_MODE_ALIGNMENT (mode);
4541
4542 if (integer_zerop (TYPE_SIZE (type)))
4543 return 0;
4544
4545 gcc_assert (TYPE_MODE (type) == mode);
4546
4547 if (!AGGREGATE_TYPE_P (type))
4548 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4549
4550 if (TREE_CODE (type) == ARRAY_TYPE)
4551 return TYPE_ALIGN (TREE_TYPE (type));
4552
4553 unsigned int alignment = 0;
4554 unsigned int bitfield_alignment = 0;
4555 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4556 if (TREE_CODE (field) == FIELD_DECL)
4557 {
4558 alignment = std::max (alignment, DECL_ALIGN (field));
4559 if (DECL_BIT_FIELD_TYPE (field))
4560 bitfield_alignment
4561 = std::max (bitfield_alignment,
4562 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4563 }
4564
4565 if (bitfield_alignment > alignment)
4566 {
4567 *abi_break = true;
4568 return bitfield_alignment;
4569 }
4570
4571 return alignment;
4572 }
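/* For illustration: the ABI_BREAK output above flags the case where the
   returned alignment comes only from the declared type of a bit-field
   (for example a 16-byte-aligned bit-field type inside an otherwise
   byte-aligned struct).  GCC releases before 9.1 ignored such bit-field
   types here, so callers use the flag to emit a -Wpsabi note whenever the
   larger value would change how the argument is passed.  */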
4573
4574 /* Layout a function argument according to the AAPCS64 rules. The rule
4575 numbers refer to the rule numbers in the AAPCS64. */
4576
4577 static void
4578 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4579 const_tree type,
4580 bool named ATTRIBUTE_UNUSED)
4581 {
4582 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4583 int ncrn, nvrn, nregs;
4584 bool allocate_ncrn, allocate_nvrn;
4585 HOST_WIDE_INT size;
4586 bool abi_break;
4587
4588 /* We need to do this once per argument. */
4589 if (pcum->aapcs_arg_processed)
4590 return;
4591
4592 pcum->aapcs_arg_processed = true;
4593
4594 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4595 if (type)
4596 size = int_size_in_bytes (type);
4597 else
4598 /* No frontends can create types with variable-sized modes, so we
4599 shouldn't be asked to pass or return them. */
4600 size = GET_MODE_SIZE (mode).to_constant ();
4601 size = ROUND_UP (size, UNITS_PER_WORD);
4602
4603 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4604 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4605 mode,
4606 type,
4607 &nregs);
4608
4609 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4610 The following code thus handles passing by SIMD/FP registers first. */
4611
4612 nvrn = pcum->aapcs_nvrn;
4613
4614 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4615 and homogeneous short-vector aggregates (HVA). */
4616 if (allocate_nvrn)
4617 {
4618 if (!TARGET_FLOAT)
4619 aarch64_err_no_fpadvsimd (mode);
4620
4621 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4622 {
4623 pcum->aapcs_nextnvrn = nvrn + nregs;
4624 if (!aarch64_composite_type_p (type, mode))
4625 {
4626 gcc_assert (nregs == 1);
4627 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4628 }
4629 else
4630 {
4631 rtx par;
4632 int i;
4633 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4634 for (i = 0; i < nregs; i++)
4635 {
4636 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4637 V0_REGNUM + nvrn + i);
4638 rtx offset = gen_int_mode
4639 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4640 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4641 XVECEXP (par, 0, i) = tmp;
4642 }
4643 pcum->aapcs_reg = par;
4644 }
4645 return;
4646 }
4647 else
4648 {
4649 /* C.3 NSRN is set to 8. */
4650 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4651 goto on_stack;
4652 }
4653 }
4654
4655 ncrn = pcum->aapcs_ncrn;
4656 nregs = size / UNITS_PER_WORD;
4657
4658 /* C6 - C9, though the sign and zero extension semantics are
4659 handled elsewhere. This is the case where the argument fits
4660 entirely in general registers. */
4661 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4662 {
4663 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4664
4665 /* C.8 if the argument has an alignment of 16 then the NGRN is
4666 rounded up to the next even number. */
4667 if (nregs == 2
4668 && ncrn % 2
4669 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4670 comparison is there because for > 16 * BITS_PER_UNIT
4671 alignment nregs should be > 2 and therefore it should be
4672 passed by reference rather than value. */
4673 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4674 == 16 * BITS_PER_UNIT))
4675 {
4676 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4677 inform (input_location, "parameter passing for argument of type "
4678 "%qT changed in GCC 9.1", type);
4679 ++ncrn;
4680 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4681 }
4682
4683 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4684 A reg is still generated for it, but the caller should be smart
4685 enough not to use it. */
4686 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4687 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4688 else
4689 {
4690 rtx par;
4691 int i;
4692
4693 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4694 for (i = 0; i < nregs; i++)
4695 {
4696 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4697 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4698 GEN_INT (i * UNITS_PER_WORD));
4699 XVECEXP (par, 0, i) = tmp;
4700 }
4701 pcum->aapcs_reg = par;
4702 }
4703
4704 pcum->aapcs_nextncrn = ncrn + nregs;
4705 return;
4706 }
4707
4708 /* C.11 */
4709 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4710
4711 /* The argument is passed on stack; record the needed number of words for
4712 this argument and align the total size if necessary. */
4713 on_stack:
4714 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4715
4716 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4717 == 16 * BITS_PER_UNIT)
4718 {
4719 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4720 if (pcum->aapcs_stack_size != new_size)
4721 {
4722 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4723 inform (input_location, "parameter passing for argument of type "
4724 "%qT changed in GCC 9.1", type);
4725 pcum->aapcs_stack_size = new_size;
4726 }
4727 }
4728 return;
4729 }
4730
4731 /* Implement TARGET_FUNCTION_ARG. */
4732
4733 static rtx
4734 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4735 const_tree type, bool named)
4736 {
4737 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4738 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4739
4740 if (mode == VOIDmode)
4741 return NULL_RTX;
4742
4743 aarch64_layout_arg (pcum_v, mode, type, named);
4744 return pcum->aapcs_reg;
4745 }
4746
4747 void
4748 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4749 const_tree fntype ATTRIBUTE_UNUSED,
4750 rtx libname ATTRIBUTE_UNUSED,
4751 const_tree fndecl ATTRIBUTE_UNUSED,
4752 unsigned n_named ATTRIBUTE_UNUSED)
4753 {
4754 pcum->aapcs_ncrn = 0;
4755 pcum->aapcs_nvrn = 0;
4756 pcum->aapcs_nextncrn = 0;
4757 pcum->aapcs_nextnvrn = 0;
4758 pcum->pcs_variant = ARM_PCS_AAPCS64;
4759 pcum->aapcs_reg = NULL_RTX;
4760 pcum->aapcs_arg_processed = false;
4761 pcum->aapcs_stack_words = 0;
4762 pcum->aapcs_stack_size = 0;
4763
4764 if (!TARGET_FLOAT
4765 && fndecl && TREE_PUBLIC (fndecl)
4766 && fntype && fntype != error_mark_node)
4767 {
4768 const_tree type = TREE_TYPE (fntype);
4769 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4770 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4771 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4772 &mode, &nregs, NULL))
4773 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4774 }
4775 return;
4776 }
4777
4778 static void
4779 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4780 machine_mode mode,
4781 const_tree type,
4782 bool named)
4783 {
4784 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4785 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4786 {
4787 aarch64_layout_arg (pcum_v, mode, type, named);
4788 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4789 != (pcum->aapcs_stack_words != 0));
4790 pcum->aapcs_arg_processed = false;
4791 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4792 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4793 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4794 pcum->aapcs_stack_words = 0;
4795 pcum->aapcs_reg = NULL_RTX;
4796 }
4797 }
4798
4799 bool
4800 aarch64_function_arg_regno_p (unsigned regno)
4801 {
4802 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4803 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4804 }
4805
4806 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4807 PARM_BOUNDARY bits of alignment, but will be given anything up
4808 to STACK_BOUNDARY bits if the type requires it. This makes sure
4809 that both before and after the layout of each argument, the Next
4810 Stacked Argument Address (NSAA) will have a minimum alignment of
4811 8 bytes. */
4812
4813 static unsigned int
4814 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4815 {
4816 bool abi_break;
4817 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4818 &abi_break);
4819 if (abi_break && warn_psabi)
4820 inform (input_location, "parameter passing for argument of type "
4821 "%qT changed in GCC 9.1", type);
4822
4823 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4824 }
4825
4826 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4827
4828 static fixed_size_mode
4829 aarch64_get_reg_raw_mode (int regno)
4830 {
4831 if (TARGET_SVE && FP_REGNUM_P (regno))
4832 /* Don't use the SVE part of the register for __builtin_apply and
4833 __builtin_return. The SVE registers aren't used by the normal PCS,
4834 so using them there would be a waste of time. The PCS extensions
4835 for SVE types are fundamentally incompatible with the
4836 __builtin_return/__builtin_apply interface. */
4837 return as_a <fixed_size_mode> (V16QImode);
4838 return default_get_reg_raw_mode (regno);
4839 }
4840
4841 /* Implement TARGET_FUNCTION_ARG_PADDING.
4842
4843 Small aggregate types are placed in the lowest memory address.
4844
4845 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4846
4847 static pad_direction
4848 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4849 {
4850 /* On little-endian targets, the least significant byte of every stack
4851 argument is passed at the lowest byte address of the stack slot. */
4852 if (!BYTES_BIG_ENDIAN)
4853 return PAD_UPWARD;
4854
4855 /* Otherwise, integral, floating-point and pointer types are padded downward:
4856 the least significant byte of a stack argument is passed at the highest
4857 byte address of the stack slot. */
4858 if (type
4859 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4860 || POINTER_TYPE_P (type))
4861 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4862 return PAD_DOWNWARD;
4863
4864 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4865 return PAD_UPWARD;
4866 }
4867
4868 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4869
4870 It specifies padding for the last (possibly the only)
4871 element of a block move between registers and memory. Assuming
4872 the block is in memory, padding upward means that the last
4873 element is padded after its most significant byte, while with
4874 downward padding the last element is padded on its least
4875 significant byte side.
4876
4877 Small aggregates and small complex types are always padded
4878 upwards.
4879
4880 We don't need to worry about homogeneous floating-point or
4881 short-vector aggregates; their move is not affected by the
4882 padding direction determined here. Regardless of endianness,
4883 each element of such an aggregate is put in the least
4884 significant bits of a fp/simd register.
4885
4886 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4887 register has useful data, and return the opposite if the most
4888 significant byte does. */
4889
4890 bool
4891 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4892 bool first ATTRIBUTE_UNUSED)
4893 {
4894
4895 /* Small composite types are always padded upward. */
4896 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4897 {
4898 HOST_WIDE_INT size;
4899 if (type)
4900 size = int_size_in_bytes (type);
4901 else
4902 /* No frontends can create types with variable-sized modes, so we
4903 shouldn't be asked to pass or return them. */
4904 size = GET_MODE_SIZE (mode).to_constant ();
4905 if (size < 2 * UNITS_PER_WORD)
4906 return true;
4907 }
4908
4909 /* Otherwise, use the default padding. */
4910 return !BYTES_BIG_ENDIAN;
4911 }
4912
4913 static scalar_int_mode
4914 aarch64_libgcc_cmp_return_mode (void)
4915 {
4916 return SImode;
4917 }
4918
4919 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4920
4921 /* We use the 12-bit shifted immediate arithmetic instructions so values
4922 must be multiple of (1 << 12), i.e. 4096. */
4923 #define ARITH_FACTOR 4096
4924
4925 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4926 #error Cannot use simple address calculation for stack probing
4927 #endif
4928
4929 /* The pair of scratch registers used for stack probing. */
4930 #define PROBE_STACK_FIRST_REG R9_REGNUM
4931 #define PROBE_STACK_SECOND_REG R10_REGNUM
4932
4933 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4934 inclusive. These are offsets from the current stack pointer. */
4935
4936 static void
4937 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4938 {
4939 HOST_WIDE_INT size;
4940 if (!poly_size.is_constant (&size))
4941 {
4942 sorry ("stack probes for SVE frames");
4943 return;
4944 }
4945
4946 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4947
4948 /* See the same assertion on PROBE_INTERVAL above. */
4949 gcc_assert ((first % ARITH_FACTOR) == 0);
4950
4951 /* See if we have a constant small number of probes to generate. If so,
4952 that's the easy case. */
4953 if (size <= PROBE_INTERVAL)
4954 {
4955 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4956
4957 emit_set_insn (reg1,
4958 plus_constant (Pmode,
4959 stack_pointer_rtx, -(first + base)));
4960 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4961 }
4962
4963 /* The run-time loop is made up of 8 insns in the generic case while the
4964 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4965 else if (size <= 4 * PROBE_INTERVAL)
4966 {
4967 HOST_WIDE_INT i, rem;
4968
4969 emit_set_insn (reg1,
4970 plus_constant (Pmode,
4971 stack_pointer_rtx,
4972 -(first + PROBE_INTERVAL)));
4973 emit_stack_probe (reg1);
4974
4975 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4976 it exceeds SIZE. If only two probes are needed, this will not
4977 generate any code. Then probe at FIRST + SIZE. */
4978 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4979 {
4980 emit_set_insn (reg1,
4981 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4982 emit_stack_probe (reg1);
4983 }
4984
4985 rem = size - (i - PROBE_INTERVAL);
4986 if (rem > 256)
4987 {
4988 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4989
4990 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4991 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4992 }
4993 else
4994 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4995 }
4996
4997 /* Otherwise, do the same as above, but in a loop. Note that we must be
4998 extra careful with variables wrapping around because we might be at
4999 the very top (or the very bottom) of the address space and we have
5000 to be able to handle this case properly; in particular, we use an
5001 equality test for the loop condition. */
5002 else
5003 {
5004 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5005
5006 /* Step 1: round SIZE to the previous multiple of the interval. */
5007
5008 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5009
5010
5011 /* Step 2: compute initial and final value of the loop counter. */
5012
5013 /* TEST_ADDR = SP + FIRST. */
5014 emit_set_insn (reg1,
5015 plus_constant (Pmode, stack_pointer_rtx, -first));
5016
5017 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5018 HOST_WIDE_INT adjustment = - (first + rounded_size);
5019 if (! aarch64_uimm12_shift (adjustment))
5020 {
5021 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5022 true, Pmode);
5023 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5024 }
5025 else
5026 emit_set_insn (reg2,
5027 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5028
5029 /* Step 3: the loop
5030
5031 do
5032 {
5033 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5034 probe at TEST_ADDR
5035 }
5036 while (TEST_ADDR != LAST_ADDR)
5037
5038 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5039 until it is equal to ROUNDED_SIZE. */
5040
5041 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5042
5043
5044 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5045 that SIZE is equal to ROUNDED_SIZE. */
5046
5047 if (size != rounded_size)
5048 {
5049 HOST_WIDE_INT rem = size - rounded_size;
5050
5051 if (rem > 256)
5052 {
5053 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5054
5055 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5056 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5057 }
5058 else
5059 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5060 }
5061 }
5062
5063 /* Make sure nothing is scheduled before we are done. */
5064 emit_insn (gen_blockage ());
5065 }
5066
5067 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5068 absolute addresses. */
5069
5070 const char *
5071 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5072 {
5073 static int labelno = 0;
5074 char loop_lab[32];
5075 rtx xops[2];
5076
5077 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5078
5079 /* Loop. */
5080 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5081
5082 HOST_WIDE_INT stack_clash_probe_interval
5083 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5084
5085 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5086 xops[0] = reg1;
5087 HOST_WIDE_INT interval;
5088 if (flag_stack_clash_protection)
5089 interval = stack_clash_probe_interval;
5090 else
5091 interval = PROBE_INTERVAL;
5092
5093 gcc_assert (aarch64_uimm12_shift (interval));
5094 xops[1] = GEN_INT (interval);
5095
5096 output_asm_insn ("sub\t%0, %0, %1", xops);
5097
5098 /* If doing stack clash protection then we probe up by the ABI specified
5099 amount. We do this because we're dropping full pages at a time in the
5100 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5101 if (flag_stack_clash_protection)
5102 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5103 else
5104 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5105
5106 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5107 by this amount for each iteration. */
5108 output_asm_insn ("str\txzr, [%0, %1]", xops);
5109
5110 /* Test if TEST_ADDR == LAST_ADDR. */
5111 xops[1] = reg2;
5112 output_asm_insn ("cmp\t%0, %1", xops);
5113
5114 /* Branch. */
5115 fputs ("\tb.ne\t", asm_out_file);
5116 assemble_name_raw (asm_out_file, loop_lab);
5117 fputc ('\n', asm_out_file);
5118
5119 return "";
5120 }
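/* For illustration, the loop printed above looks roughly like this for the
   default 4 KiB probe interval (registers and label are arbitrary):

   .LPSRL0:
       sub     x9, x9, #4096
       str     xzr, [x9, 0]      // offset is STACK_CLASH_CALLER_GUARD
                                 // when stack clash protection is enabled
       cmp     x9, x10
       b.ne    .LPSRL0  */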
5121
5122 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5123 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5124 of GUARD_SIZE. When a probe is emitted it is done at most
5125 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5126 at most MIN_PROBE_THRESHOLD. By the end of this function
5127 BASE = BASE - ADJUSTMENT. */
5128
5129 const char *
5130 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5131 rtx min_probe_threshold, rtx guard_size)
5132 {
5133 /* This function is not allowed to use any instruction generation function
5134 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5135 so instead emit the code you want using output_asm_insn. */
5136 gcc_assert (flag_stack_clash_protection);
5137 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5138 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5139
5140 /* The minimum required allocation before the residual requires probing. */
5141 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5142
5143 /* Clamp the value down to the nearest value that can be used with a cmp. */
5144 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5145 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5146
5147 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5148 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5149
5150 static int labelno = 0;
5151 char loop_start_lab[32];
5152 char loop_end_lab[32];
5153 rtx xops[2];
5154
5155 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5156 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5157
5158 /* Emit loop start label. */
5159 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5160
5161 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5162 xops[0] = adjustment;
5163 xops[1] = probe_offset_value_rtx;
5164 output_asm_insn ("cmp\t%0, %1", xops);
5165
5166 /* Branch to end if not enough adjustment to probe. */
5167 fputs ("\tb.lt\t", asm_out_file);
5168 assemble_name_raw (asm_out_file, loop_end_lab);
5169 fputc ('\n', asm_out_file);
5170
5171 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5172 xops[0] = base;
5173 xops[1] = probe_offset_value_rtx;
5174 output_asm_insn ("sub\t%0, %0, %1", xops);
5175
5176 /* Probe at BASE. */
5177 xops[1] = const0_rtx;
5178 output_asm_insn ("str\txzr, [%0, %1]", xops);
5179
5180 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5181 xops[0] = adjustment;
5182 xops[1] = probe_offset_value_rtx;
5183 output_asm_insn ("sub\t%0, %0, %1", xops);
5184
5185 /* Branch to start if still more bytes to allocate. */
5186 fputs ("\tb\t", asm_out_file);
5187 assemble_name_raw (asm_out_file, loop_start_lab);
5188 fputc ('\n', asm_out_file);
5189
5190 /* Remaining adjustment is below the probe threshold; leave the loop. */
5191 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5192
5193 /* BASE = BASE - ADJUSTMENT. */
5194 xops[0] = base;
5195 xops[1] = adjustment;
5196 output_asm_insn ("sub\t%0, %0, %1", xops);
5197 return "";
5198 }
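/* For illustration, the output above is roughly as follows, where #<guard>
   stands for the clamped RESIDUAL_PROBE_GUARD and the registers and labels
   are arbitrary:

   .SVLPSPL0:
       cmp     x11, #<guard>
       b.lt    .SVLPEND0
       sub     x10, x10, #<guard>
       str     xzr, [x10, 0]
       sub     x11, x11, #<guard>
       b       .SVLPSPL0
   .SVLPEND0:
       sub     x10, x10, x11

   i.e. drop and probe one guard-sized block at a time, then make the final
   sub-guard adjustment without a probe.  */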
5199
5200 /* Determine whether a frame chain needs to be generated. */
5201 static bool
5202 aarch64_needs_frame_chain (void)
5203 {
5204 /* Force a frame chain for EH returns so the return address is at FP+8. */
5205 if (frame_pointer_needed || crtl->calls_eh_return)
5206 return true;
5207
5208 /* A leaf function cannot have calls or write LR. */
5209 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5210
5211 /* Don't use a frame chain in leaf functions if leaf frame pointers
5212 are disabled. */
5213 if (flag_omit_leaf_frame_pointer && is_leaf)
5214 return false;
5215
5216 return aarch64_use_frame_pointer;
5217 }
5218
5219 /* Mark the registers that need to be saved by the callee and calculate
5220 the size of the callee-saved registers area and frame record (both FP
5221 and LR may be omitted). */
5222 static void
5223 aarch64_layout_frame (void)
5224 {
5225 HOST_WIDE_INT offset = 0;
5226 int regno, last_fp_reg = INVALID_REGNUM;
5227 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5228
5229 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5230
5231 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5232 the mid-end is doing. */
5233 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5234
5235 #define SLOT_NOT_REQUIRED (-2)
5236 #define SLOT_REQUIRED (-1)
5237
5238 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5239 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5240
5241 /* If this is a non-leaf simd function with calls we assume that
5242 at least one of those calls is to a non-simd function and thus
5243 we must save V8 to V23 in the prologue. */
5244
5245 if (simd_function && !crtl->is_leaf)
5246 {
5247 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5248 if (FP_SIMD_SAVED_REGNUM_P (regno))
5249 df_set_regs_ever_live (regno, true);
5250 }
5251
5252 /* First mark all the registers that really need to be saved... */
5253 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5254 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5255
5256 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5257 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5258
5259 /* ... that includes the eh data registers (if needed)... */
5260 if (crtl->calls_eh_return)
5261 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5262 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5263 = SLOT_REQUIRED;
5264
5265 /* ... and any callee saved register that dataflow says is live. */
5266 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5267 if (df_regs_ever_live_p (regno)
5268 && (regno == R30_REGNUM
5269 || !call_used_regs[regno]))
5270 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5271
5272 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5273 if (df_regs_ever_live_p (regno)
5274 && (!call_used_regs[regno]
5275 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5276 {
5277 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5278 last_fp_reg = regno;
5279 }
5280
5281 if (cfun->machine->frame.emit_frame_chain)
5282 {
5283 /* FP and LR are placed in the linkage record. */
5284 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5285 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5286 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5287 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5288 offset = 2 * UNITS_PER_WORD;
5289 }
5290
5291 /* With stack-clash, LR must be saved in non-leaf functions. */
5292 gcc_assert (crtl->is_leaf
5293 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5294 != SLOT_NOT_REQUIRED));
5295
5296 /* Now assign stack slots for them. */
5297 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5298 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5299 {
5300 cfun->machine->frame.reg_offset[regno] = offset;
5301 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5302 cfun->machine->frame.wb_candidate1 = regno;
5303 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5304 cfun->machine->frame.wb_candidate2 = regno;
5305 offset += UNITS_PER_WORD;
5306 }
5307
5308 HOST_WIDE_INT max_int_offset = offset;
5309 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5310 bool has_align_gap = offset != max_int_offset;
5311
5312 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5313 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5314 {
5315 /* If there is an alignment gap between integer and fp callee-saves,
5316 allocate the last fp register to it if possible. */
5317 if (regno == last_fp_reg
5318 && has_align_gap
5319 && !simd_function
5320 && (offset & 8) == 0)
5321 {
5322 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5323 break;
5324 }
5325
5326 cfun->machine->frame.reg_offset[regno] = offset;
5327 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5328 cfun->machine->frame.wb_candidate1 = regno;
5329 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5330 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5331 cfun->machine->frame.wb_candidate2 = regno;
5332 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5333 }
5334
5335 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5336
5337 cfun->machine->frame.saved_regs_size = offset;
5338
5339 HOST_WIDE_INT varargs_and_saved_regs_size
5340 = offset + cfun->machine->frame.saved_varargs_size;
5341
5342 cfun->machine->frame.hard_fp_offset
5343 = aligned_upper_bound (varargs_and_saved_regs_size
5344 + get_frame_size (),
5345 STACK_BOUNDARY / BITS_PER_UNIT);
5346
5347 /* Both these values are already aligned. */
5348 gcc_assert (multiple_p (crtl->outgoing_args_size,
5349 STACK_BOUNDARY / BITS_PER_UNIT));
5350 cfun->machine->frame.frame_size
5351 = (cfun->machine->frame.hard_fp_offset
5352 + crtl->outgoing_args_size);
5353
5354 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5355
5356 cfun->machine->frame.initial_adjust = 0;
5357 cfun->machine->frame.final_adjust = 0;
5358 cfun->machine->frame.callee_adjust = 0;
5359 cfun->machine->frame.callee_offset = 0;
5360
5361 HOST_WIDE_INT max_push_offset = 0;
5362 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5363 max_push_offset = 512;
5364 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5365 max_push_offset = 256;
5366
5367 HOST_WIDE_INT const_size, const_fp_offset;
5368 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5369 && const_size < max_push_offset
5370 && known_eq (crtl->outgoing_args_size, 0))
5371 {
5372 /* Simple, small frame with no outgoing arguments:
5373 stp reg1, reg2, [sp, -frame_size]!
5374 stp reg3, reg4, [sp, 16] */
5375 cfun->machine->frame.callee_adjust = const_size;
5376 }
5377 else if (known_lt (crtl->outgoing_args_size
5378 + cfun->machine->frame.saved_regs_size, 512)
5379 && !(cfun->calls_alloca
5380 && known_lt (cfun->machine->frame.hard_fp_offset,
5381 max_push_offset)))
5382 {
5383 /* Frame with small outgoing arguments:
5384 sub sp, sp, frame_size
5385 stp reg1, reg2, [sp, outgoing_args_size]
5386 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5387 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5388 cfun->machine->frame.callee_offset
5389 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5390 }
5391 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5392 && const_fp_offset < max_push_offset)
5393 {
5394 /* Frame with large outgoing arguments but a small local area:
5395 stp reg1, reg2, [sp, -hard_fp_offset]!
5396 stp reg3, reg4, [sp, 16]
5397 sub sp, sp, outgoing_args_size */
5398 cfun->machine->frame.callee_adjust = const_fp_offset;
5399 cfun->machine->frame.final_adjust
5400 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5401 }
5402 else
5403 {
5404 /* Frame with large local area and outgoing arguments using frame pointer:
5405 sub sp, sp, hard_fp_offset
5406 stp x29, x30, [sp, 0]
5407 add x29, sp, 0
5408 stp reg3, reg4, [sp, 16]
5409 sub sp, sp, outgoing_args_size */
5410 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5411 cfun->machine->frame.final_adjust
5412 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5413 }
5414
5415 cfun->machine->frame.laid_out = true;
5416 }
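/* Worked example of the layout above (illustrative sizes, not taken from any
   particular function): a function that needs a frame chain, saves x19 and
   x20, has 16 bytes of locals and no outgoing arguments gets

     reg_offset[x29] = 0, reg_offset[x30] = 8,
     reg_offset[x19] = 16, reg_offset[x20] = 24,
     saved_regs_size = 32, hard_fp_offset = 48, frame_size = 48.

   frame_size (48) is constant, below max_push_offset (512), and there are no
   outgoing arguments, so the first case applies: callee_adjust = 48 and the
   prologue becomes

     stp x29, x30, [sp, -48]!
     mov x29, sp
     stp x19, x20, [sp, 16]  */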
5417
5418 /* Return true if the register REGNO is saved on entry to
5419 the current function. */
5420
5421 static bool
5422 aarch64_register_saved_on_entry (int regno)
5423 {
5424 return cfun->machine->frame.reg_offset[regno] >= 0;
5425 }
5426
5427 /* Return the next register number, at or above REGNO and up to LIMIT,
5428 that the callee needs to save. */
5429
5430 static unsigned
5431 aarch64_next_callee_save (unsigned regno, unsigned limit)
5432 {
5433 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5434 regno++;
5435 return regno;
5436 }
5437
5438 /* Push the register number REGNO of mode MODE to the stack with write-back
5439 adjusting the stack by ADJUSTMENT. */
5440
5441 static void
5442 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5443 HOST_WIDE_INT adjustment)
5444 {
5445 rtx base_rtx = stack_pointer_rtx;
5446 rtx insn, reg, mem;
5447
5448 reg = gen_rtx_REG (mode, regno);
5449 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5450 plus_constant (Pmode, base_rtx, -adjustment));
5451 mem = gen_frame_mem (mode, mem);
5452
5453 insn = emit_move_insn (mem, reg);
5454 RTX_FRAME_RELATED_P (insn) = 1;
5455 }
5456
5457 /* Generate and return an instruction to store the pair of registers
5458 REG and REG2 of mode MODE to location BASE with write-back adjusting
5459 the stack location BASE by ADJUSTMENT. */
5460
5461 static rtx
5462 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5463 HOST_WIDE_INT adjustment)
5464 {
5465 switch (mode)
5466 {
5467 case E_DImode:
5468 return gen_storewb_pairdi_di (base, base, reg, reg2,
5469 GEN_INT (-adjustment),
5470 GEN_INT (UNITS_PER_WORD - adjustment));
5471 case E_DFmode:
5472 return gen_storewb_pairdf_di (base, base, reg, reg2,
5473 GEN_INT (-adjustment),
5474 GEN_INT (UNITS_PER_WORD - adjustment));
5475 case E_TFmode:
5476 return gen_storewb_pairtf_di (base, base, reg, reg2,
5477 GEN_INT (-adjustment),
5478 GEN_INT (UNITS_PER_VREG - adjustment));
5479 default:
5480 gcc_unreachable ();
5481 }
5482 }
5483
5484 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5485 stack pointer by ADJUSTMENT. */
5486
5487 static void
5488 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5489 {
5490 rtx_insn *insn;
5491 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5492
5493 if (regno2 == INVALID_REGNUM)
5494 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5495
5496 rtx reg1 = gen_rtx_REG (mode, regno1);
5497 rtx reg2 = gen_rtx_REG (mode, regno2);
5498
5499 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5500 reg2, adjustment));
5501 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5502 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5503 RTX_FRAME_RELATED_P (insn) = 1;
5504 }
5505
5506 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5507 adjusting it by ADJUSTMENT afterwards. */
5508
5509 static rtx
5510 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5511 HOST_WIDE_INT adjustment)
5512 {
5513 switch (mode)
5514 {
5515 case E_DImode:
5516 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5517 GEN_INT (UNITS_PER_WORD));
5518 case E_DFmode:
5519 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5520 GEN_INT (UNITS_PER_WORD));
5521 case E_TFmode:
5522 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5523 GEN_INT (UNITS_PER_VREG));
5524 default:
5525 gcc_unreachable ();
5526 }
5527 }
5528
5529 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5530 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5531 into CFI_OPS. */
5532
5533 static void
5534 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5535 rtx *cfi_ops)
5536 {
5537 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5538 rtx reg1 = gen_rtx_REG (mode, regno1);
5539
5540 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5541
5542 if (regno2 == INVALID_REGNUM)
5543 {
5544 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5545 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5546 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5547 }
5548 else
5549 {
5550 rtx reg2 = gen_rtx_REG (mode, regno2);
5551 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5552 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5553 reg2, adjustment));
5554 }
5555 }
5556
5557 /* Generate and return a store pair instruction of mode MODE to store
5558 register REG1 to MEM1 and register REG2 to MEM2. */
5559
5560 static rtx
5561 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5562 rtx reg2)
5563 {
5564 switch (mode)
5565 {
5566 case E_DImode:
5567 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5568
5569 case E_DFmode:
5570 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5571
5572 case E_TFmode:
5573 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5574
5575 default:
5576 gcc_unreachable ();
5577 }
5578 }
5579
5580 /* Generate and return a load pair instruction of mode MODE to load register
5581 REG1 from MEM1 and register REG2 from MEM2. */
5582
5583 static rtx
5584 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5585 rtx mem2)
5586 {
5587 switch (mode)
5588 {
5589 case E_DImode:
5590 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5591
5592 case E_DFmode:
5593 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5594
5595 case E_TFmode:
5596 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5597
5598 default:
5599 gcc_unreachable ();
5600 }
5601 }
5602
5603 /* Return TRUE if return address signing should be enabled for the current
5604 function, otherwise return FALSE. */
5605
5606 bool
5607 aarch64_return_address_signing_enabled (void)
5608 {
5609 /* This function should only be called after the frame has been laid out. */
5610 gcc_assert (cfun->machine->frame.laid_out);
5611
5612 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
5613 if its LR is pushed onto the stack. */
5614 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5615 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5616 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5617 }
5618
5619 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5620 bool
5621 aarch64_bti_enabled (void)
5622 {
5623 return (aarch64_enable_bti == 1);
5624 }
5625
5626 /* Emit code to save the callee-saved registers of mode MODE from register
5627 number START to LIMIT to the stack at the location starting at offset
5628 START_OFFSET, skipping any write-back candidates if SKIP_WB is true. */
5629
5630 static void
5631 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5632 unsigned start, unsigned limit, bool skip_wb)
5633 {
5634 rtx_insn *insn;
5635 unsigned regno;
5636 unsigned regno2;
5637
5638 for (regno = aarch64_next_callee_save (start, limit);
5639 regno <= limit;
5640 regno = aarch64_next_callee_save (regno + 1, limit))
5641 {
5642 rtx reg, mem;
5643 poly_int64 offset;
5644 int offset_diff;
5645
5646 if (skip_wb
5647 && (regno == cfun->machine->frame.wb_candidate1
5648 || regno == cfun->machine->frame.wb_candidate2))
5649 continue;
5650
5651 if (cfun->machine->reg_is_wrapped_separately[regno])
5652 continue;
5653
5654 reg = gen_rtx_REG (mode, regno);
5655 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5656 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5657 offset));
5658
5659 regno2 = aarch64_next_callee_save (regno + 1, limit);
5660 offset_diff = cfun->machine->frame.reg_offset[regno2]
5661 - cfun->machine->frame.reg_offset[regno];
5662
5663 if (regno2 <= limit
5664 && !cfun->machine->reg_is_wrapped_separately[regno2]
5665 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5666 {
5667 rtx reg2 = gen_rtx_REG (mode, regno2);
5668 rtx mem2;
5669
5670 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5671 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5672 offset));
5673 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5674 reg2));
5675
5676 /* The first part of a frame-related parallel insn is
5677 always assumed to be relevant to the frame
5678 calculations; subsequent parts are only
5679 frame-related if explicitly marked. */
5680 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5681 regno = regno2;
5682 }
5683 else
5684 insn = emit_move_insn (mem, reg);
5685
5686 RTX_FRAME_RELATED_P (insn) = 1;
5687 }
5688 }
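/* For instance (hypothetical offsets): with x19 at reg_offset 16 and x20 at
   reg_offset 24, DImode gives offset_diff == GET_MODE_SIZE == 8, so the two
   saves above are merged into a single "stp x19, x20, [sp, start_offset + 16]"
   instead of two separate str instructions.  */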
5689
5690 /* Emit code to restore the callee registers of mode MODE from register
5691 number START up to and including LIMIT. Restore from the stack offset
5692 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5693 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5694
5695 static void
5696 aarch64_restore_callee_saves (machine_mode mode,
5697 poly_int64 start_offset, unsigned start,
5698 unsigned limit, bool skip_wb, rtx *cfi_ops)
5699 {
5700 rtx base_rtx = stack_pointer_rtx;
5701 unsigned regno;
5702 unsigned regno2;
5703 poly_int64 offset;
5704
5705 for (regno = aarch64_next_callee_save (start, limit);
5706 regno <= limit;
5707 regno = aarch64_next_callee_save (regno + 1, limit))
5708 {
5709 if (cfun->machine->reg_is_wrapped_separately[regno])
5710 continue;
5711
5712 rtx reg, mem;
5713 int offset_diff;
5714
5715 if (skip_wb
5716 && (regno == cfun->machine->frame.wb_candidate1
5717 || regno == cfun->machine->frame.wb_candidate2))
5718 continue;
5719
5720 reg = gen_rtx_REG (mode, regno);
5721 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5722 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5723
5724 regno2 = aarch64_next_callee_save (regno + 1, limit);
5725 offset_diff = cfun->machine->frame.reg_offset[regno2]
5726 - cfun->machine->frame.reg_offset[regno];
5727
5728 if (regno2 <= limit
5729 && !cfun->machine->reg_is_wrapped_separately[regno2]
5730 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5731 {
5732 rtx reg2 = gen_rtx_REG (mode, regno2);
5733 rtx mem2;
5734
5735 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5736 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5737 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5738
5739 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5740 regno = regno2;
5741 }
5742 else
5743 emit_move_insn (reg, mem);
5744 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5745 }
5746 }
5747
5748 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5749 of MODE. */
5750
5751 static inline bool
5752 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5753 {
5754 HOST_WIDE_INT multiple;
5755 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5756 && IN_RANGE (multiple, -8, 7));
5757 }
5758
5759 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5760 of MODE. */
5761
5762 static inline bool
5763 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5764 {
5765 HOST_WIDE_INT multiple;
5766 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5767 && IN_RANGE (multiple, 0, 63));
5768 }
5769
5770 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5771 of MODE. */
5772
5773 bool
5774 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5775 {
5776 HOST_WIDE_INT multiple;
5777 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5778 && IN_RANGE (multiple, -64, 63));
5779 }
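/* For DImode, for example, this accepts byte offsets that are multiples of 8
   in the range [-512, 504], which is the immediate range of an LDP/STP of two
   X registers.  */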
5780
5781 /* Return true if OFFSET is a signed 9-bit value. */
5782
5783 bool
5784 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5785 poly_int64 offset)
5786 {
5787 HOST_WIDE_INT const_offset;
5788 return (offset.is_constant (&const_offset)
5789 && IN_RANGE (const_offset, -256, 255));
5790 }
5791
5792 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5793 of MODE. */
5794
5795 static inline bool
5796 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5797 {
5798 HOST_WIDE_INT multiple;
5799 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5800 && IN_RANGE (multiple, -256, 255));
5801 }
5802
5803 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5804 of MODE. */
5805
5806 static inline bool
5807 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5808 {
5809 HOST_WIDE_INT multiple;
5810 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5811 && IN_RANGE (multiple, 0, 4095));
5812 }
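/* For DImode, for example, this accepts byte offsets that are multiples of 8
   in the range [0, 32760], i.e. the unsigned scaled immediate range of a
   single LDR/STR of an X register.  */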
5813
5814 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5815
5816 static sbitmap
5817 aarch64_get_separate_components (void)
5818 {
5819 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5820 bitmap_clear (components);
5821
5822 /* The registers we need saved to the frame. */
5823 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5824 if (aarch64_register_saved_on_entry (regno))
5825 {
5826 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5827 if (!frame_pointer_needed)
5828 offset += cfun->machine->frame.frame_size
5829 - cfun->machine->frame.hard_fp_offset;
5830 /* Check that we can access the stack slot of the register with one
5831 direct load with no adjustments needed. */
5832 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5833 bitmap_set_bit (components, regno);
5834 }
5835
5836 /* Don't mess with the hard frame pointer. */
5837 if (frame_pointer_needed)
5838 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5839
5840 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5841 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5842 /* If registers have been chosen to be stored/restored with
5843 writeback don't interfere with them to avoid having to output explicit
5844 stack adjustment instructions. */
5845 if (reg2 != INVALID_REGNUM)
5846 bitmap_clear_bit (components, reg2);
5847 if (reg1 != INVALID_REGNUM)
5848 bitmap_clear_bit (components, reg1);
5849
5850 bitmap_clear_bit (components, LR_REGNUM);
5851 bitmap_clear_bit (components, SP_REGNUM);
5852
5853 return components;
5854 }
5855
5856 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5857
5858 static sbitmap
5859 aarch64_components_for_bb (basic_block bb)
5860 {
5861 bitmap in = DF_LIVE_IN (bb);
5862 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5863 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5864 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5865
5866 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5867 bitmap_clear (components);
5868
5869 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5870 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5871 if ((!call_used_regs[regno]
5872 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5873 && (bitmap_bit_p (in, regno)
5874 || bitmap_bit_p (gen, regno)
5875 || bitmap_bit_p (kill, regno)))
5876 {
5877 unsigned regno2, offset, offset2;
5878 bitmap_set_bit (components, regno);
5879
5880 /* If there is a callee-save at an adjacent offset, add it too
5881 to increase the use of LDP/STP. */
5882 offset = cfun->machine->frame.reg_offset[regno];
5883 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5884
5885 if (regno2 <= LAST_SAVED_REGNUM)
5886 {
5887 offset2 = cfun->machine->frame.reg_offset[regno2];
5888 if ((offset & ~8) == (offset2 & ~8))
5889 bitmap_set_bit (components, regno2);
5890 }
5891 }
5892
5893 return components;
5894 }
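/* Sketch of the pairing heuristic above with hypothetical offsets: if x19
   lives at reg_offset 16 and x20 at reg_offset 24, then for x19 we have
   (offset & 8) == 0, so regno2 is x20, and (16 & ~8) == (24 & ~8) == 16,
   so x20 is added to the component set as well, making it more likely that
   the two are saved and restored with a single STP/LDP.  */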
5895
5896 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5897 Nothing to do for aarch64. */
5898
5899 static void
5900 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5901 {
5902 }
5903
5904 /* Return the next set bit in BMP from START onwards. Return the total number
5905 of bits in BMP if no set bit is found at or after START. */
5906
5907 static unsigned int
5908 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5909 {
5910 unsigned int nbits = SBITMAP_SIZE (bmp);
5911 if (start == nbits)
5912 return start;
5913
5914 gcc_assert (start < nbits);
5915 for (unsigned int i = start; i < nbits; i++)
5916 if (bitmap_bit_p (bmp, i))
5917 return i;
5918
5919 return nbits;
5920 }
5921
5922 /* Do the work for aarch64_emit_prologue_components and
5923 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5924 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5925 for these components or the epilogue sequence. That is, it determines
5926 whether we should emit stores or loads and what kind of CFA notes to attach
5927 to the insns. Otherwise the logic for the two sequences is very
5928 similar. */
5929
5930 static void
5931 aarch64_process_components (sbitmap components, bool prologue_p)
5932 {
5933 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5934 ? HARD_FRAME_POINTER_REGNUM
5935 : STACK_POINTER_REGNUM);
5936
5937 unsigned last_regno = SBITMAP_SIZE (components);
5938 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5939 rtx_insn *insn = NULL;
5940
5941 while (regno != last_regno)
5942 {
5943 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5944 so DFmode for the vector registers is enough. For simd functions
5945 we want to save the low 128 bits. */
5946 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5947
5948 rtx reg = gen_rtx_REG (mode, regno);
5949 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5950 if (!frame_pointer_needed)
5951 offset += cfun->machine->frame.frame_size
5952 - cfun->machine->frame.hard_fp_offset;
5953 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5954 rtx mem = gen_frame_mem (mode, addr);
5955
5956 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5957 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5958 /* No more registers to handle after REGNO.
5959 Emit a single save/restore and exit. */
5960 if (regno2 == last_regno)
5961 {
5962 insn = emit_insn (set);
5963 RTX_FRAME_RELATED_P (insn) = 1;
5964 if (prologue_p)
5965 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5966 else
5967 add_reg_note (insn, REG_CFA_RESTORE, reg);
5968 break;
5969 }
5970
5971 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5972 /* The next register is not of the same class or its offset is not
5973 mergeable with the current one into a pair. */
5974 if (!satisfies_constraint_Ump (mem)
5975 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5976 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5977 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5978 GET_MODE_SIZE (mode)))
5979 {
5980 insn = emit_insn (set);
5981 RTX_FRAME_RELATED_P (insn) = 1;
5982 if (prologue_p)
5983 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5984 else
5985 add_reg_note (insn, REG_CFA_RESTORE, reg);
5986
5987 regno = regno2;
5988 continue;
5989 }
5990
5991 /* REGNO2 can be saved/restored in a pair with REGNO. */
5992 rtx reg2 = gen_rtx_REG (mode, regno2);
5993 if (!frame_pointer_needed)
5994 offset2 += cfun->machine->frame.frame_size
5995 - cfun->machine->frame.hard_fp_offset;
5996 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5997 rtx mem2 = gen_frame_mem (mode, addr2);
5998 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5999 : gen_rtx_SET (reg2, mem2);
6000
6001 if (prologue_p)
6002 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6003 else
6004 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6005
6006 RTX_FRAME_RELATED_P (insn) = 1;
6007 if (prologue_p)
6008 {
6009 add_reg_note (insn, REG_CFA_OFFSET, set);
6010 add_reg_note (insn, REG_CFA_OFFSET, set2);
6011 }
6012 else
6013 {
6014 add_reg_note (insn, REG_CFA_RESTORE, reg);
6015 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6016 }
6017
6018 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6019 }
6020 }
6021
6022 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6023
6024 static void
6025 aarch64_emit_prologue_components (sbitmap components)
6026 {
6027 aarch64_process_components (components, true);
6028 }
6029
6030 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6031
6032 static void
6033 aarch64_emit_epilogue_components (sbitmap components)
6034 {
6035 aarch64_process_components (components, false);
6036 }
6037
6038 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6039
6040 static void
6041 aarch64_set_handled_components (sbitmap components)
6042 {
6043 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6044 if (bitmap_bit_p (components, regno))
6045 cfun->machine->reg_is_wrapped_separately[regno] = true;
6046 }
6047
6048 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6049 determine the probe offset for alloca. */
6050
6051 static HOST_WIDE_INT
6052 aarch64_stack_clash_protection_alloca_probe_range (void)
6053 {
6054 return STACK_CLASH_CALLER_GUARD;
6055 }
6056
6057
6058 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6059 registers. If POLY_SIZE is not large enough to require a probe this function
6060 will only adjust the stack. When allocating the stack space
6061 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6062 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6063 arguments. If we are, then we ensure that any allocation larger than the
6064 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer is
6065 maintained.
6066
6067 We emit barriers after each stack adjustment to prevent optimizations from
6068 breaking the invariant that we never drop the stack more than a page. This
6069 invariant is needed to make it easier to correctly handle asynchronous
6070 events, e.g. if we were to allow the stack to be dropped by more than a page
6071 and only probe it with several probes afterwards, and a signal arrived
6072 somewhere in between, then the signal handler would not know the state of the
6073 stack and could make no assumptions about which pages have been probed. */
6074
6075 static void
6076 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6077 poly_int64 poly_size,
6078 bool frame_related_p,
6079 bool final_adjustment_p)
6080 {
6081 HOST_WIDE_INT guard_size
6082 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6083 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6084 /* When doing the final adjustment for the outgoing argument size we can't
6085 assume that LR was saved at position 0. So subtract its offset from the
6086 ABI safe buffer so that we don't accidentally allow an adjustment that
6087 would result in an allocation larger than the ABI buffer without
6088 probing. */
6089 HOST_WIDE_INT min_probe_threshold
6090 = final_adjustment_p
6091 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6092 : guard_size - guard_used_by_caller;
6093
6094 poly_int64 frame_size = cfun->machine->frame.frame_size;
6095
6096 /* We should always have a positive probe threshold. */
6097 gcc_assert (min_probe_threshold > 0);
6098
6099 if (flag_stack_clash_protection && !final_adjustment_p)
6100 {
6101 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6102 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6103
6104 if (known_eq (frame_size, 0))
6105 {
6106 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6107 }
6108 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6109 && known_lt (final_adjust, guard_used_by_caller))
6110 {
6111 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6112 }
6113 }
6114
6115 /* If SIZE is not large enough to require probing, just adjust the stack and
6116 exit. */
6117 if (known_lt (poly_size, min_probe_threshold)
6118 || !flag_stack_clash_protection)
6119 {
6120 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6121 return;
6122 }
6123
6124 HOST_WIDE_INT size;
6125 /* Handle the SVE non-constant case first. */
6126 if (!poly_size.is_constant (&size))
6127 {
6128 if (dump_file)
6129 {
6130 fprintf (dump_file, "Stack clash SVE prologue: ");
6131 print_dec (poly_size, dump_file);
6132 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6133 }
6134
6135 /* First calculate the amount of bytes we're actually spilling. */
6136 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6137 poly_size, temp1, temp2, false, true);
6138
6139 rtx_insn *insn = get_last_insn ();
6140
6141 if (frame_related_p)
6142 {
6143 /* This is done to provide unwinding information for the stack
6144 adjustments we're about to do; however, to prevent the optimizers
6145 from removing the R11 move and leaving the CFA note (which would be
6146 very wrong) we tie the old and new stack pointers together.
6147 The tie will expand to nothing but the optimizers will not touch
6148 the instruction. */
6149 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6150 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6151 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6152
6153 /* We want the CFA independent of the stack pointer for the
6154 duration of the loop. */
6155 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6156 RTX_FRAME_RELATED_P (insn) = 1;
6157 }
6158
6159 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6160 rtx guard_const = gen_int_mode (guard_size, Pmode);
6161
6162 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6163 stack_pointer_rtx, temp1,
6164 probe_const, guard_const));
6165
6166 /* Now reset the CFA register if needed. */
6167 if (frame_related_p)
6168 {
6169 add_reg_note (insn, REG_CFA_DEF_CFA,
6170 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6171 gen_int_mode (poly_size, Pmode)));
6172 RTX_FRAME_RELATED_P (insn) = 1;
6173 }
6174
6175 return;
6176 }
6177
6178 if (dump_file)
6179 fprintf (dump_file,
6180 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6181 " bytes, probing will be required.\n", size);
6182
6183 /* Round size to the nearest multiple of guard_size, and calculate the
6184 residual as the difference between the original size and the rounded
6185 size. */
6186 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6187 HOST_WIDE_INT residual = size - rounded_size;
6188
6189 /* We can handle a small number of allocations/probes inline. Otherwise
6190 punt to a loop. */
6191 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6192 {
6193 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6194 {
6195 aarch64_sub_sp (NULL, temp2, guard_size, true);
6196 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6197 guard_used_by_caller));
6198 emit_insn (gen_blockage ());
6199 }
6200 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6201 }
6202 else
6203 {
6204 /* Compute the ending address. */
6205 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6206 temp1, NULL, false, true);
6207 rtx_insn *insn = get_last_insn ();
6208
6209 /* For the initial allocation, we don't have a frame pointer
6210 set up, so we always need CFI notes. If we're doing the
6211 final allocation, then we may have a frame pointer, in which
6212 case it is the CFA, otherwise we need CFI notes.
6213
6214 We can determine which allocation we are doing by looking at
6215 the value of FRAME_RELATED_P since the final allocations are not
6216 frame related. */
6217 if (frame_related_p)
6218 {
6219 /* We want the CFA independent of the stack pointer for the
6220 duration of the loop. */
6221 add_reg_note (insn, REG_CFA_DEF_CFA,
6222 plus_constant (Pmode, temp1, rounded_size));
6223 RTX_FRAME_RELATED_P (insn) = 1;
6224 }
6225
6226 /* This allocates and probes the stack. Note that this re-uses some of
6227 the existing Ada stack protection code. However we are guaranteed not
6228 to enter the non-loop or residual branches of that code.
6229
6230 The non-loop part won't be entered because if our allocation amount
6231 doesn't require a loop, the case above would handle it.
6232
6233 The residual amount won't be entered because TEMP1 is a multiple of
6234 the allocation size. The residual will always be 0. As such, the only
6235 part we are actually using from that code is the loop setup. The
6236 actual probing is done in aarch64_output_probe_stack_range. */
6237 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6238 stack_pointer_rtx, temp1));
6239
6240 /* Now reset the CFA register if needed. */
6241 if (frame_related_p)
6242 {
6243 add_reg_note (insn, REG_CFA_DEF_CFA,
6244 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6245 RTX_FRAME_RELATED_P (insn) = 1;
6246 }
6247
6248 emit_insn (gen_blockage ());
6249 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6250 }
6251
6252 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6253 be probed. This maintains the requirement that each page is probed at
6254 least once. For initial probing we probe only if the allocation is
6255 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6256 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6257 GUARD_SIZE. This ensures that for any allocation that is large enough to
6258 trigger a probe here, we'll have at least one, and if they're not large
6259 enough for this code to emit anything for them, the page would have been
6260 probed by the saving of FP/LR either by this function or any callees. If
6261 we don't have any callees then we won't have more stack adjustments and so
6262 are still safe. */
6263 if (residual)
6264 {
6265 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6266 /* If we're doing final adjustments, and we've done any full page
6267 allocations then any residual needs to be probed. */
6268 if (final_adjustment_p && rounded_size != 0)
6269 min_probe_threshold = 0;
6270 /* If doing a small final adjustment, we always probe at offset 0.
6271 This is done to avoid issues when LR is not at position 0 or when
6272 the final adjustment is smaller than the probing offset. */
6273 else if (final_adjustment_p && rounded_size == 0)
6274 residual_probe_offset = 0;
6275
6276 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6277 if (residual >= min_probe_threshold)
6278 {
6279 if (dump_file)
6280 fprintf (dump_file,
6281 "Stack clash AArch64 prologue residuals: "
6282 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6283 "\n", residual);
6284
6285 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6286 residual_probe_offset));
6287 emit_insn (gen_blockage ());
6288 }
6289 }
6290 }
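/* Rough numbers for orientation, assuming the default 64kB guard (so
   guard_used_by_caller == 1kB and min_probe_threshold == 63kB for the
   initial adjustment): an initial allocation below 63kB is emitted as a
   plain "sub sp, sp, #size" with no probe.  A 200kB allocation is rounded
   down to three 64kB pages; if that is within the unroll limit, each page is
   allocated and probed inline ("sub sp, sp, 65536" followed by
   "str xzr, [sp, 1024]"), and the remaining 8kB residual is allocated
   without a further probe because it is below the threshold.  */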
6291
6292 /* Return 1 if the register is used by the epilogue. We need to say the
6293 return register is used, but only after epilogue generation is complete.
6294 Note that in the case of sibcalls, the values "used by the epilogue" are
6295 considered live at the start of the called function.
6296
6297 For SIMD functions we need to return 1 for FP registers that are saved and
6298 restored by a function but are not zero in call_used_regs. If we do not do
6299 this, optimizations may remove the restore of the register. */
6300
6301 int
6302 aarch64_epilogue_uses (int regno)
6303 {
6304 if (epilogue_completed)
6305 {
6306 if (regno == LR_REGNUM)
6307 return 1;
6308 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6309 return 1;
6310 }
6311 return 0;
6312 }
6313
6314 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6315 is saved at BASE + OFFSET. */
6316
6317 static void
6318 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6319 rtx base, poly_int64 offset)
6320 {
6321 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6322 add_reg_note (insn, REG_CFA_EXPRESSION,
6323 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6324 }
6325
6326 /* AArch64 stack frames generated by this compiler look like:
6327
6328 +-------------------------------+
6329 | |
6330 | incoming stack arguments |
6331 | |
6332 +-------------------------------+
6333 | | <-- incoming stack pointer (aligned)
6334 | callee-allocated save area |
6335 | for register varargs |
6336 | |
6337 +-------------------------------+
6338 | local variables | <-- frame_pointer_rtx
6339 | |
6340 +-------------------------------+
6341 | padding | \
6342 +-------------------------------+ |
6343 | callee-saved registers | | frame.saved_regs_size
6344 +-------------------------------+ |
6345 | LR' | |
6346 +-------------------------------+ |
6347 | FP' | / <- hard_frame_pointer_rtx (aligned)
6348 +-------------------------------+
6349 | dynamic allocation |
6350 +-------------------------------+
6351 | padding |
6352 +-------------------------------+
6353 | outgoing stack arguments | <-- arg_pointer
6354 | |
6355 +-------------------------------+
6356 | | <-- stack_pointer_rtx (aligned)
6357
6358 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6359 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6360 unchanged.
6361
6362 By default for stack-clash we assume the guard is at least 64KB, but this
6363 value is configurable to either 4KB or 64KB. We also force the guard size to
6364 be the same as the probing interval and both values are kept in sync.
6365
6366 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6367 on the guard size) of stack space without probing.
6368
6369 When probing is needed, we emit a probe at the start of the prologue
6370 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6371
6372 We have to track how much space has been allocated and the only stores
6373 to the stack we track as implicit probes are the FP/LR stores.
6374
6375 For outgoing arguments we probe if the size is larger than 1KB, such that
6376 the ABI specified buffer is maintained for the next callee.
6377
6378 The following registers are reserved during frame layout and should not be
6379 used for any other purpose:
6380
6381 - r11: Used by stack clash protection when SVE is enabled.
6382 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6383 - r14 and r15: Used for speculation tracking.
6384 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6385 - r30(LR), r29(FP): Used by standard frame layout.
6386
6387 These registers must be avoided in frame layout related code unless the
6388 explicit intention is to interact with one of the features listed above. */
6389
6390 /* Generate the prologue instructions for entry into a function.
6391 Establish the stack frame by decreasing the stack pointer with a
6392 properly calculated size and, if necessary, create a frame record
6393 filled with the values of LR and previous frame pointer. The
6394 current FP is also set up if it is in use. */
6395
6396 void
6397 aarch64_expand_prologue (void)
6398 {
6399 poly_int64 frame_size = cfun->machine->frame.frame_size;
6400 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6401 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6402 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6403 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6404 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6405 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6406 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6407 rtx_insn *insn;
6408
6409 /* Sign return address for functions. */
6410 if (aarch64_return_address_signing_enabled ())
6411 {
6412 switch (aarch64_ra_sign_key)
6413 {
6414 case AARCH64_KEY_A:
6415 insn = emit_insn (gen_paciasp ());
6416 break;
6417 case AARCH64_KEY_B:
6418 insn = emit_insn (gen_pacibsp ());
6419 break;
6420 default:
6421 gcc_unreachable ();
6422 }
6423 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6424 RTX_FRAME_RELATED_P (insn) = 1;
6425 }
6426
6427 if (flag_stack_usage_info)
6428 current_function_static_stack_size = constant_lower_bound (frame_size);
6429
6430 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6431 {
6432 if (crtl->is_leaf && !cfun->calls_alloca)
6433 {
6434 if (maybe_gt (frame_size, PROBE_INTERVAL)
6435 && maybe_gt (frame_size, get_stack_check_protect ()))
6436 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6437 (frame_size
6438 - get_stack_check_protect ()));
6439 }
6440 else if (maybe_gt (frame_size, 0))
6441 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6442 }
6443
6444 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6445 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6446
6447 /* In theory we should never have both an initial adjustment
6448 and a callee save adjustment. Verify that is the case since the
6449 code below does not handle it for -fstack-clash-protection. */
6450 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6451
6452 /* Will only probe if the initial adjustment is larger than the guard
6453 less the amount of the guard reserved for use by the caller's
6454 outgoing args. */
6455 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6456 true, false);
6457
6458 if (callee_adjust != 0)
6459 aarch64_push_regs (reg1, reg2, callee_adjust);
6460
6461 if (emit_frame_chain)
6462 {
6463 poly_int64 reg_offset = callee_adjust;
6464 if (callee_adjust == 0)
6465 {
6466 reg1 = R29_REGNUM;
6467 reg2 = R30_REGNUM;
6468 reg_offset = callee_offset;
6469 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6470 }
6471 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6472 stack_pointer_rtx, callee_offset,
6473 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6474 if (frame_pointer_needed && !frame_size.is_constant ())
6475 {
6476 /* Variable-sized frames need to describe the save slot
6477 address using DW_CFA_expression rather than DW_CFA_offset.
6478 This means that, without taking further action, the
6479 locations of the registers that we've already saved would
6480 remain based on the stack pointer even after we redefine
6481 the CFA based on the frame pointer. We therefore need new
6482 DW_CFA_expressions to re-express the save slots with addresses
6483 based on the frame pointer. */
6484 rtx_insn *insn = get_last_insn ();
6485 gcc_assert (RTX_FRAME_RELATED_P (insn));
6486
6487 /* Add an explicit CFA definition if this was previously
6488 implicit. */
6489 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6490 {
6491 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6492 callee_offset);
6493 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6494 gen_rtx_SET (hard_frame_pointer_rtx, src));
6495 }
6496
6497 /* Change the save slot expressions for the registers that
6498 we've already saved. */
6499 reg_offset -= callee_offset;
6500 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6501 reg_offset + UNITS_PER_WORD);
6502 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6503 reg_offset);
6504 }
6505 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6506 }
6507
6508 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6509 callee_adjust != 0 || emit_frame_chain);
6510 if (aarch64_simd_decl_p (cfun->decl))
6511 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6512 callee_adjust != 0 || emit_frame_chain);
6513 else
6514 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6515 callee_adjust != 0 || emit_frame_chain);
6516
6517 /* We may need to probe the final adjustment if it is larger than the guard
6518 that is assumed by the callee. */
6519 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6520 !frame_pointer_needed, true);
6521 }
6522
6523 /* Return TRUE if we can use a simple_return insn.
6524
6525 This function checks whether the callee saved stack is empty, which
6526 means no restore actions are needed. The pro_and_epilogue pass will use
6527 this to check whether the shrink-wrapping optimization is feasible. */
6528
6529 bool
6530 aarch64_use_return_insn_p (void)
6531 {
6532 if (!reload_completed)
6533 return false;
6534
6535 if (crtl->profile)
6536 return false;
6537
6538 return known_eq (cfun->machine->frame.frame_size, 0);
6539 }
6540
6541 /* Return false for non-leaf SIMD functions in order to avoid
6542 shrink-wrapping them. Doing this will lose the necessary
6543 save/restore of FP registers. */
6544
6545 bool
6546 aarch64_use_simple_return_insn_p (void)
6547 {
6548 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6549 return false;
6550
6551 return true;
6552 }
6553
6554 /* Generate the epilogue instructions for returning from a function.
6555 This is almost exactly the reverse of the prolog sequence, except
6556 that we need to insert barriers to avoid scheduling loads that read
6557 from a deallocated stack, and we optimize the unwind records by
6558 emitting them all together if possible. */
6559 void
6560 aarch64_expand_epilogue (bool for_sibcall)
6561 {
6562 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6563 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6564 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6565 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6566 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6567 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6568 rtx cfi_ops = NULL;
6569 rtx_insn *insn;
6570 /* A stack clash protection prologue may not have left EP0_REGNUM or
6571 EP1_REGNUM in a usable state. The same is true for allocations
6572 with an SVE component, since we then need both temporary registers
6573 for each allocation. For stack clash we are in a usable state if
6574 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6575 HOST_WIDE_INT guard_size
6576 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6577 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6578
6579 /* We can re-use the registers when the allocation amount is smaller than
6580 guard_size - guard_used_by_caller because we won't be doing any probes
6581 then. In such situations the register should remain live with the correct
6582 value. */
6583 bool can_inherit_p = (initial_adjust.is_constant ()
6584 && final_adjust.is_constant ())
6585 && (!flag_stack_clash_protection
6586 || known_lt (initial_adjust,
6587 guard_size - guard_used_by_caller));
6588
6589 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6590 bool need_barrier_p
6591 = maybe_ne (get_frame_size ()
6592 + cfun->machine->frame.saved_varargs_size, 0);
6593
6594 /* Emit a barrier to prevent loads from a deallocated stack. */
6595 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6596 || cfun->calls_alloca
6597 || crtl->calls_eh_return)
6598 {
6599 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6600 need_barrier_p = false;
6601 }
6602
6603 /* Restore the stack pointer from the frame pointer if it may not
6604 be the same as the stack pointer. */
6605 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6606 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6607 if (frame_pointer_needed
6608 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6609 /* If writeback is used when restoring callee-saves, the CFA
6610 is restored on the instruction doing the writeback. */
6611 aarch64_add_offset (Pmode, stack_pointer_rtx,
6612 hard_frame_pointer_rtx, -callee_offset,
6613 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6614 else
6615 /* The case where we need to re-use the register here is very rare, so
6616 avoid the complicated condition and just always emit a move if the
6617 immediate doesn't fit. */
6618 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6619
6620 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6621 callee_adjust != 0, &cfi_ops);
6622 if (aarch64_simd_decl_p (cfun->decl))
6623 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6624 callee_adjust != 0, &cfi_ops);
6625 else
6626 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6627 callee_adjust != 0, &cfi_ops);
6628
6629 if (need_barrier_p)
6630 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6631
6632 if (callee_adjust != 0)
6633 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6634
6635 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6636 {
6637 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6638 insn = get_last_insn ();
6639 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6640 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6641 RTX_FRAME_RELATED_P (insn) = 1;
6642 cfi_ops = NULL;
6643 }
6644
6645 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6646 restrict the emit_move optimization to leaf functions. */
6647 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6648 (!can_inherit_p || !crtl->is_leaf
6649 || df_regs_ever_live_p (EP0_REGNUM)));
6650
6651 if (cfi_ops)
6652 {
6653 /* Emit delayed restores and reset the CFA to be SP. */
6654 insn = get_last_insn ();
6655 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6656 REG_NOTES (insn) = cfi_ops;
6657 RTX_FRAME_RELATED_P (insn) = 1;
6658 }
6659
6660 /* We prefer to emit the combined return/authenticate instruction RETAA,
6661 however there are three cases in which we must instead emit an explicit
6662 authentication instruction.
6663
6664 1) Sibcalls don't return in a normal way, so if we're about to call one
6665 we must authenticate.
6666
6667 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6668 generating code for !TARGET_ARMV8_3 we can't use it and must
6669 explicitly authenticate.
6670
6671 3) On an eh_return path we make extra stack adjustments to update the
6672 canonical frame address to be the exception handler's CFA. We want
6673 to authenticate using the CFA of the function which calls eh_return.
6674 */
6675 if (aarch64_return_address_signing_enabled ()
6676 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6677 {
6678 switch (aarch64_ra_sign_key)
6679 {
6680 case AARCH64_KEY_A:
6681 insn = emit_insn (gen_autiasp ());
6682 break;
6683 case AARCH64_KEY_B:
6684 insn = emit_insn (gen_autibsp ());
6685 break;
6686 default:
6687 gcc_unreachable ();
6688 }
6689 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6690 RTX_FRAME_RELATED_P (insn) = 1;
6691 }
6692
6693 /* Stack adjustment for exception handler. */
6694 if (crtl->calls_eh_return && !for_sibcall)
6695 {
6696 /* We need to unwind the stack by the offset computed by
6697 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6698 to be SP; letting the CFA move during this adjustment
6699 is just as correct as retaining the CFA from the body
6700 of the function. Therefore, do nothing special. */
6701 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6702 }
6703
6704 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6705 if (!for_sibcall)
6706 emit_jump_insn (ret_rtx);
6707 }
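/* Illustrative epilogue for the small-frame layout example given after
   aarch64_layout_frame (frame chain, x19/x20 saved, callee_adjust == 48,
   no return-address signing):

     ldp x19, x20, [sp, 16]
     ldp x29, x30, [sp], 48
     ret  */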
6708
6709 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6710 normally or return to a previous frame after unwinding.
6711
6712 An EH return uses a single shared return sequence. The epilogue is
6713 exactly like a normal epilogue except that it has an extra input
6714 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6715 that must be applied after the frame has been destroyed. An extra label
6716 is inserted before the epilogue which initializes this register to zero,
6717 and this is the entry point for a normal return.
6718
6719 An actual EH return updates the return address, initializes the stack
6720 adjustment and jumps directly into the epilogue (bypassing the zeroing
6721 of the adjustment). Since the return address is typically saved on the
6722 stack when a function makes a call, the saved LR must be updated outside
6723 the epilogue.
6724
6725 This poses problems as the store is generated well before the epilogue,
6726 so the offset of LR is not known yet. Also optimizations will remove the
6727 store as it appears dead, even after the epilogue is generated (as the
6728 base or offset for loading LR is different in many cases).
6729
6730 To avoid these problems this implementation forces the frame pointer
6731 in eh_return functions so that the location of LR is fixed and known early.
6732 It also marks the store volatile, so no optimization is permitted to
6733 remove the store. */
6734 rtx
6735 aarch64_eh_return_handler_rtx (void)
6736 {
6737 rtx tmp = gen_frame_mem (Pmode,
6738 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6739
6740 /* Mark the store volatile, so no optimization is permitted to remove it. */
6741 MEM_VOLATILE_P (tmp) = true;
6742 return tmp;
6743 }
6744
6745 /* Output code to add DELTA to the first argument, and then jump
6746 to FUNCTION. Used for C++ multiple inheritance. */
6747 static void
6748 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6749 HOST_WIDE_INT delta,
6750 HOST_WIDE_INT vcall_offset,
6751 tree function)
6752 {
6753 /* The this pointer is always in x0. Note that this differs from
6754 Arm where the this pointer may be bumped to r1 if r0 is required
6755 to return a pointer to an aggregate. On AArch64 a result value
6756 pointer will be in x8. */
6757 int this_regno = R0_REGNUM;
6758 rtx this_rtx, temp0, temp1, addr, funexp;
6759 rtx_insn *insn;
6760 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6761
6762 if (aarch64_bti_enabled ())
6763 emit_insn (gen_bti_c ());
6764
6765 reload_completed = 1;
6766 emit_note (NOTE_INSN_PROLOGUE_END);
6767
6768 this_rtx = gen_rtx_REG (Pmode, this_regno);
6769 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6770 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6771
6772 if (vcall_offset == 0)
6773 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6774 else
6775 {
6776 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6777
6778 addr = this_rtx;
6779 if (delta != 0)
6780 {
6781 if (delta >= -256 && delta < 256)
6782 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6783 plus_constant (Pmode, this_rtx, delta));
6784 else
6785 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6786 temp1, temp0, false);
6787 }
6788
6789 if (Pmode == ptr_mode)
6790 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6791 else
6792 aarch64_emit_move (temp0,
6793 gen_rtx_ZERO_EXTEND (Pmode,
6794 gen_rtx_MEM (ptr_mode, addr)));
6795
6796 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6797 addr = plus_constant (Pmode, temp0, vcall_offset);
6798 else
6799 {
6800 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6801 Pmode);
6802 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6803 }
6804
6805 if (Pmode == ptr_mode)
6806 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6807 else
6808 aarch64_emit_move (temp1,
6809 gen_rtx_SIGN_EXTEND (Pmode,
6810 gen_rtx_MEM (ptr_mode, addr)));
6811
6812 emit_insn (gen_add2_insn (this_rtx, temp1));
6813 }
6814
6815 /* Generate a tail call to the target function. */
6816 if (!TREE_USED (function))
6817 {
6818 assemble_external (function);
6819 TREE_USED (function) = 1;
6820 }
6821 funexp = XEXP (DECL_RTL (function), 0);
6822 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6823 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6824 SIBLING_CALL_P (insn) = 1;
6825
6826 insn = get_insns ();
6827 shorten_branches (insn);
6828
6829 assemble_start_function (thunk, fnname);
6830 final_start_function (insn, file, 1);
6831 final (insn, file, 1);
6832 final_end_function ();
6833 assemble_end_function (thunk, fnname);
6834
6835 /* Stop pretending to be a post-reload pass. */
6836 reload_completed = 0;
6837 }
6838
6839 static bool
6840 aarch64_tls_referenced_p (rtx x)
6841 {
6842 if (!TARGET_HAVE_TLS)
6843 return false;
6844 subrtx_iterator::array_type array;
6845 FOR_EACH_SUBRTX (iter, array, x, ALL)
6846 {
6847 const_rtx x = *iter;
6848 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6849 return true;
6850 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6851 TLS offsets, not real symbol references. */
6852 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6853 iter.skip_subrtxes ();
6854 }
6855 return false;
6856 }
6857
6858
6859 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6860 a left shift of 0 or 12 bits. */
6861 bool
6862 aarch64_uimm12_shift (HOST_WIDE_INT val)
6863 {
6864 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6865 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6866 );
6867 }
6868
6869 /* Return the largest value no greater than VAL that can be expressed as a
6870 12-bit unsigned immediate with a left shift of 0 or 12. */
6871 static HOST_WIDE_INT
6872 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6873 {
6874 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6875 handle correctly. */
6876 gcc_assert ((val & 0xffffff) == val);
6877
6878 if (((val & 0xfff) << 0) == val)
6879 return val;
6880
6881 return val & (0xfff << 12);
6882 }
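
/* Illustrative sketch, not part of GCC: how a (clamped) 24-bit constant
   decomposes into the two chunks that aarch64_uimm12_shift accepts, i.e.
   one "add #imm" piece and one "add #imm, lsl #12" piece.  The helper
   name and the use of unsigned long long instead of HOST_WIDE_INT are
   illustrative assumptions.  */
static void
example_split_uimm24 (unsigned long long val,
                      unsigned long long *lo12, unsigned long long *hi12)
{
  /* Assumes VAL already fits in 24 bits, as asserted above.  */
  *lo12 = val & 0xfffULL;                /* encodable with LSL #0   */
  *hi12 = val & (0xfffULL << 12);        /* encodable with LSL #12  */
  /* e.g. val == 0x123456 gives *lo12 == 0x456 and *hi12 == 0x123000,
     and *lo12 + *hi12 == val.  */
}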
6883
6884 /* Return true if val is an immediate that can be loaded into a
6885 register by a MOVZ instruction. */
6886 static bool
6887 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6888 {
6889 if (GET_MODE_SIZE (mode) > 4)
6890 {
6891 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6892 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6893 return 1;
6894 }
6895 else
6896 {
6897 /* Ignore sign extension. */
6898 val &= (HOST_WIDE_INT) 0xffffffff;
6899 }
6900 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6901 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6902 }
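
/* Illustrative sketch, not part of GCC: the MOVZ test above succeeds
   exactly when every set bit of a 64-bit value falls inside a single
   16-bit field at bit position 0, 16, 32 or 48.  A loop over the four
   halfwords gives the same answer; the helper name is an assumption.  */
static int
example_movz_encodable (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffULL << shift)) == val)
      return 1;
  return 0;
  /* e.g. 0xbeef0000 is encodable (movz w0, #0xbeef, lsl #16),
     while 0x12345 spans two halfwords and is not.  */
}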
6903
6904 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6905 64-bit (DImode) integer. */
6906
6907 static unsigned HOST_WIDE_INT
6908 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6909 {
6910 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6911 while (size < 64)
6912 {
6913 val &= (HOST_WIDE_INT_1U << size) - 1;
6914 val |= val << size;
6915 size *= 2;
6916 }
6917 return val;
6918 }
6919
6920 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6921
6922 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6923 {
6924 0x0000000100000001ull,
6925 0x0001000100010001ull,
6926 0x0101010101010101ull,
6927 0x1111111111111111ull,
6928 0x5555555555555555ull,
6929 };
6930
6931
6932 /* Return true if val is a valid bitmask immediate. */
6933
6934 bool
6935 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6936 {
6937 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6938 int bits;
6939
6940 /* Check for a single sequence of one bits and return quickly if so.
6941 The special cases of all ones and all zeroes return false. */
6942 val = aarch64_replicate_bitmask_imm (val_in, mode);
6943 tmp = val + (val & -val);
6944
6945 if (tmp == (tmp & -tmp))
6946 return (val + 1) > 1;
6947
6948 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6949 if (mode == SImode)
6950 val = (val << 32) | (val & 0xffffffff);
6951
6952 /* Invert if the immediate doesn't start with a zero bit - this means we
6953 only need to search for sequences of one bits. */
6954 if (val & 1)
6955 val = ~val;
6956
6957 /* Find the first set bit and set tmp to val with the first sequence of one
6958 bits removed. Return success if there is a single sequence of ones. */
6959 first_one = val & -val;
6960 tmp = val & (val + first_one);
6961
6962 if (tmp == 0)
6963 return true;
6964
6965 /* Find the next set bit and compute the difference in bit position. */
6966 next_one = tmp & -tmp;
6967 bits = clz_hwi (first_one) - clz_hwi (next_one);
6968 mask = val ^ tmp;
6969
6970 /* Check the bit position difference is a power of 2, and that the first
6971 sequence of one bits fits within 'bits' bits. */
6972 if ((mask >> bits) != 0 || bits != (bits & -bits))
6973 return false;
6974
6975 /* Check the sequence of one bits is repeated 64/bits times. */
6976 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6977 }
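
/* Illustrative sketch, not part of GCC: a slower reference test for
   AArch64 logical ("bitmask") immediates.  A 64-bit value is encodable
   when it is a repetition of a 2/4/8/16/32/64-bit element whose set
   bits form a single, possibly wrapping, run of ones (excluding
   all-zeros and all-ones).  aarch64_bitmask_imm above computes the
   same answer without looping; the helper names are assumptions.  */
static int
example_contiguous_ones (unsigned long long x)
{
  /* Nonzero X is one run of ones iff adding its lowest set bit clears
     the run without leaving any stray higher bits behind.  */
  return x != 0 && (((x & -x) + x) & x) == 0;
}

static int
example_is_bitmask_imm64 (unsigned long long val)
{
  if (val == 0 || val == ~0ULL)
    return 0;
  for (unsigned int esize = 2; esize <= 64; esize *= 2)
    {
      unsigned long long emask
        = esize == 64 ? ~0ULL : (1ULL << esize) - 1;
      unsigned long long elt = val & emask;

      /* The element must replicate across all 64 bits...  */
      unsigned long long rep = elt;
      for (unsigned int i = esize; i < 64; i += esize)
        rep |= elt << i;
      if (rep != val)
        continue;

      /* ...and its ones must form one run, either directly or after
         wrapping around the element boundary.  */
      return (example_contiguous_ones (elt)
              || example_contiguous_ones (~elt & emask));
    }
  return 0;
  /* e.g. 0x00ff00ff00ff00ff and 0x0000000ffffffff0 are encodable,
     while 0x1234 is not.  */
}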
6978
6979 /* Create a mask of ones covering the lowest through the highest set bit of VAL_IN.
6980 Assumed precondition: VAL_IN is not zero. */
6981
6982 unsigned HOST_WIDE_INT
6983 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6984 {
6985 int lowest_bit_set = ctz_hwi (val_in);
6986 int highest_bit_set = floor_log2 (val_in);
6987 gcc_assert (val_in != 0);
6988
6989 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6990 (HOST_WIDE_INT_1U << lowest_bit_set));
6991 }
6992
6993 /* Create a constant in which every bit outside the span from the lowest
6994 to the highest set bit of VAL_IN is set to 1. */
6995
6996 unsigned HOST_WIDE_INT
6997 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6998 {
6999 return val_in | ~aarch64_and_split_imm1 (val_in);
7000 }
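
/* Illustrative sketch, not part of GCC: the two-instruction AND split.
   IMM1 is a solid block of ones from the lowest to the highest set bit
   of VAL, IMM2 is VAL with everything outside that block forced to one,
   so IMM1 & IMM2 == VAL.  aarch64_and_bitmask_imm below then only
   accepts the split when IMM2 is itself a bitmask immediate.  The
   helper name and unsigned long long types are assumptions.  */
static void
example_and_split (unsigned long long val,
                   unsigned long long *imm1, unsigned long long *imm2)
{
  /* Assumes VAL is nonzero, like aarch64_and_split_imm1.  */
  int low = __builtin_ctzll (val);
  int high = 63 - __builtin_clzll (val);

  /* Ones from bit LOW up to bit HIGH inclusive.  */
  *imm1 = (2ULL << high) - (1ULL << low);
  /* VAL, with every bit outside that span set.  */
  *imm2 = val | ~*imm1;
  /* e.g. val == 0x0000ff000000000f gives *imm1 == 0x0000ffffffffffff
     and *imm2 == 0xffffff000000000f; both are bitmask immediates and
     *imm1 & *imm2 == val.  */
}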
7001
7002 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7003
7004 bool
7005 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7006 {
7007 scalar_int_mode int_mode;
7008 if (!is_a <scalar_int_mode> (mode, &int_mode))
7009 return false;
7010
7011 if (aarch64_bitmask_imm (val_in, int_mode))
7012 return false;
7013
7014 if (aarch64_move_imm (val_in, int_mode))
7015 return false;
7016
7017 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7018
7019 return aarch64_bitmask_imm (imm2, int_mode);
7020 }
7021
7022 /* Return true if val is an immediate that can be loaded into a
7023 register in a single instruction. */
7024 bool
7025 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7026 {
7027 scalar_int_mode int_mode;
7028 if (!is_a <scalar_int_mode> (mode, &int_mode))
7029 return false;
7030
7031 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7032 return 1;
7033 return aarch64_bitmask_imm (val, int_mode);
7034 }
7035
7036 static bool
7037 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7038 {
7039 rtx base, offset;
7040
7041 if (GET_CODE (x) == HIGH)
7042 return true;
7043
7044 /* There's no way to calculate VL-based values using relocations. */
7045 subrtx_iterator::array_type array;
7046 FOR_EACH_SUBRTX (iter, array, x, ALL)
7047 if (GET_CODE (*iter) == CONST_POLY_INT)
7048 return true;
7049
7050 split_const (x, &base, &offset);
7051 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7052 {
7053 if (aarch64_classify_symbol (base, INTVAL (offset))
7054 != SYMBOL_FORCE_TO_MEM)
7055 return true;
7056 else
7057 /* Avoid generating a 64-bit relocation in ILP32; leave
7058 to aarch64_expand_mov_immediate to handle it properly. */
7059 return mode != ptr_mode;
7060 }
7061
7062 return aarch64_tls_referenced_p (x);
7063 }
7064
7065 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7066 The expansion for a table switch is quite expensive due to the number
7067 of instructions, the table lookup and the hard-to-predict indirect jump.
7068 When optimizing for speed at -O3 and above, use the per-core tuning if
7069 set, otherwise use tables for > 16 cases as a tradeoff between size and
7070 performance. When optimizing for size, use the default setting. */
7071
7072 static unsigned int
7073 aarch64_case_values_threshold (void)
7074 {
7075 /* Use the specified limit for the number of cases before using jump
7076 tables at higher optimization levels. */
7077 if (optimize > 2
7078 && selected_cpu->tune->max_case_values != 0)
7079 return selected_cpu->tune->max_case_values;
7080 else
7081 return optimize_size ? default_case_values_threshold () : 17;
7082 }
7083
7084 /* Return true if register REGNO is a valid index register.
7085 STRICT_P is true if REG_OK_STRICT is in effect. */
7086
7087 bool
7088 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7089 {
7090 if (!HARD_REGISTER_NUM_P (regno))
7091 {
7092 if (!strict_p)
7093 return true;
7094
7095 if (!reg_renumber)
7096 return false;
7097
7098 regno = reg_renumber[regno];
7099 }
7100 return GP_REGNUM_P (regno);
7101 }
7102
7103 /* Return true if register REGNO is a valid base register.
7104 STRICT_P is true if REG_OK_STRICT is in effect. */
7105
7106 bool
7107 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7108 {
7109 if (!HARD_REGISTER_NUM_P (regno))
7110 {
7111 if (!strict_p)
7112 return true;
7113
7114 if (!reg_renumber)
7115 return false;
7116
7117 regno = reg_renumber[regno];
7118 }
7119
7120 /* The fake registers will be eliminated to either the stack or
7121 hard frame pointer, both of which are usually valid base registers.
7122 Reload deals with the cases where the eliminated form isn't valid. */
7123 return (GP_REGNUM_P (regno)
7124 || regno == SP_REGNUM
7125 || regno == FRAME_POINTER_REGNUM
7126 || regno == ARG_POINTER_REGNUM);
7127 }
7128
7129 /* Return true if X is a valid base register.
7130 STRICT_P is true if REG_OK_STRICT is in effect. */
7131
7132 static bool
7133 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7134 {
7135 if (!strict_p
7136 && GET_CODE (x) == SUBREG
7137 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7138 x = SUBREG_REG (x);
7139
7140 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7141 }
7142
7143 /* Return true if address offset is a valid index. If it is, fill in INFO
7144 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
7145
7146 static bool
7147 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7148 machine_mode mode, bool strict_p)
7149 {
7150 enum aarch64_address_type type;
7151 rtx index;
7152 int shift;
7153
7154 /* (reg:P) */
7155 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7156 && GET_MODE (x) == Pmode)
7157 {
7158 type = ADDRESS_REG_REG;
7159 index = x;
7160 shift = 0;
7161 }
7162 /* (sign_extend:DI (reg:SI)) */
7163 else if ((GET_CODE (x) == SIGN_EXTEND
7164 || GET_CODE (x) == ZERO_EXTEND)
7165 && GET_MODE (x) == DImode
7166 && GET_MODE (XEXP (x, 0)) == SImode)
7167 {
7168 type = (GET_CODE (x) == SIGN_EXTEND)
7169 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7170 index = XEXP (x, 0);
7171 shift = 0;
7172 }
7173 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7174 else if (GET_CODE (x) == MULT
7175 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7176 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7177 && GET_MODE (XEXP (x, 0)) == DImode
7178 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7179 && CONST_INT_P (XEXP (x, 1)))
7180 {
7181 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7182 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7183 index = XEXP (XEXP (x, 0), 0);
7184 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7185 }
7186 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7187 else if (GET_CODE (x) == ASHIFT
7188 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7189 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7190 && GET_MODE (XEXP (x, 0)) == DImode
7191 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7192 && CONST_INT_P (XEXP (x, 1)))
7193 {
7194 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7195 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7196 index = XEXP (XEXP (x, 0), 0);
7197 shift = INTVAL (XEXP (x, 1));
7198 }
7199 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7200 else if ((GET_CODE (x) == SIGN_EXTRACT
7201 || GET_CODE (x) == ZERO_EXTRACT)
7202 && GET_MODE (x) == DImode
7203 && GET_CODE (XEXP (x, 0)) == MULT
7204 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7205 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7206 {
7207 type = (GET_CODE (x) == SIGN_EXTRACT)
7208 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7209 index = XEXP (XEXP (x, 0), 0);
7210 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7211 if (INTVAL (XEXP (x, 1)) != 32 + shift
7212 || INTVAL (XEXP (x, 2)) != 0)
7213 shift = -1;
7214 }
7215 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7216 (const_int 0xffffffff<<shift)) */
7217 else if (GET_CODE (x) == AND
7218 && GET_MODE (x) == DImode
7219 && GET_CODE (XEXP (x, 0)) == MULT
7220 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7221 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7222 && CONST_INT_P (XEXP (x, 1)))
7223 {
7224 type = ADDRESS_REG_UXTW;
7225 index = XEXP (XEXP (x, 0), 0);
7226 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7227 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7228 shift = -1;
7229 }
7230 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7231 else if ((GET_CODE (x) == SIGN_EXTRACT
7232 || GET_CODE (x) == ZERO_EXTRACT)
7233 && GET_MODE (x) == DImode
7234 && GET_CODE (XEXP (x, 0)) == ASHIFT
7235 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7236 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7237 {
7238 type = (GET_CODE (x) == SIGN_EXTRACT)
7239 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7240 index = XEXP (XEXP (x, 0), 0);
7241 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7242 if (INTVAL (XEXP (x, 1)) != 32 + shift
7243 || INTVAL (XEXP (x, 2)) != 0)
7244 shift = -1;
7245 }
7246 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7247 (const_int 0xffffffff<<shift)) */
7248 else if (GET_CODE (x) == AND
7249 && GET_MODE (x) == DImode
7250 && GET_CODE (XEXP (x, 0)) == ASHIFT
7251 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7252 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7253 && CONST_INT_P (XEXP (x, 1)))
7254 {
7255 type = ADDRESS_REG_UXTW;
7256 index = XEXP (XEXP (x, 0), 0);
7257 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7258 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7259 shift = -1;
7260 }
7261 /* (mult:P (reg:P) (const_int scale)) */
7262 else if (GET_CODE (x) == MULT
7263 && GET_MODE (x) == Pmode
7264 && GET_MODE (XEXP (x, 0)) == Pmode
7265 && CONST_INT_P (XEXP (x, 1)))
7266 {
7267 type = ADDRESS_REG_REG;
7268 index = XEXP (x, 0);
7269 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7270 }
7271 /* (ashift:P (reg:P) (const_int shift)) */
7272 else if (GET_CODE (x) == ASHIFT
7273 && GET_MODE (x) == Pmode
7274 && GET_MODE (XEXP (x, 0)) == Pmode
7275 && CONST_INT_P (XEXP (x, 1)))
7276 {
7277 type = ADDRESS_REG_REG;
7278 index = XEXP (x, 0);
7279 shift = INTVAL (XEXP (x, 1));
7280 }
7281 else
7282 return false;
7283
7284 if (!strict_p
7285 && GET_CODE (index) == SUBREG
7286 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7287 index = SUBREG_REG (index);
7288
7289 if (aarch64_sve_data_mode_p (mode))
7290 {
7291 if (type != ADDRESS_REG_REG
7292 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7293 return false;
7294 }
7295 else
7296 {
7297 if (shift != 0
7298 && !(IN_RANGE (shift, 1, 3)
7299 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7300 return false;
7301 }
7302
7303 if (REG_P (index)
7304 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7305 {
7306 info->type = type;
7307 info->offset = index;
7308 info->shift = shift;
7309 return true;
7310 }
7311
7312 return false;
7313 }
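
/* Illustrative sketch, not part of GCC: the scale rule enforced above
   for non-SVE modes.  A shifted index register is accepted either with
   no shift at all, or with an LSL of 1..3 that matches the access size,
   so e.g. "ldr w0, [x1, x2, lsl #2]" is valid for a 4-byte load.  The
   helper name and plain int types are assumptions.  */
static int
example_valid_index_shift (int shift, int access_size_bytes)
{
  return shift == 0
         || (shift >= 1 && shift <= 3 && (1 << shift) == access_size_bytes);
}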
7314
7315 /* Return true if MODE is one of the modes for which we
7316 support LDP/STP operations. */
7317
7318 static bool
7319 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7320 {
7321 return mode == SImode || mode == DImode
7322 || mode == SFmode || mode == DFmode
7323 || (aarch64_vector_mode_supported_p (mode)
7324 && (known_eq (GET_MODE_SIZE (mode), 8)
7325 || (known_eq (GET_MODE_SIZE (mode), 16)
7326 && (aarch64_tune_params.extra_tuning_flags
7327 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7328 }
7329
7330 /* Return true if REGNO is a virtual pointer register, or an eliminable
7331 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7332 include stack_pointer or hard_frame_pointer. */
7333 static bool
7334 virt_or_elim_regno_p (unsigned regno)
7335 {
7336 return ((regno >= FIRST_VIRTUAL_REGISTER
7337 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7338 || regno == FRAME_POINTER_REGNUM
7339 || regno == ARG_POINTER_REGNUM);
7340 }
7341
7342 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7343 If it is, fill in INFO appropriately. STRICT_P is true if
7344 REG_OK_STRICT is in effect. */
7345
7346 bool
7347 aarch64_classify_address (struct aarch64_address_info *info,
7348 rtx x, machine_mode mode, bool strict_p,
7349 aarch64_addr_query_type type)
7350 {
7351 enum rtx_code code = GET_CODE (x);
7352 rtx op0, op1;
7353 poly_int64 offset;
7354
7355 HOST_WIDE_INT const_size;
7356
7357 /* On BE, we use load/store pair for all large int mode load/stores.
7358 TI/TFmode may also use a load/store pair. */
7359 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7360 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7361 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7362 || type == ADDR_QUERY_LDP_STP_N
7363 || mode == TImode
7364 || mode == TFmode
7365 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7366
7367 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode corresponds
7368 to the actual size of the memory being loaded/stored and the mode used to
7369 check the addressing is half of that.
7370 if (type == ADDR_QUERY_LDP_STP_N
7371 && known_eq (GET_MODE_SIZE (mode), 16))
7372 mode = DFmode;
7373
7374 bool allow_reg_index_p = (!load_store_pair_p
7375 && (known_lt (GET_MODE_SIZE (mode), 16)
7376 || vec_flags == VEC_ADVSIMD
7377 || vec_flags & VEC_SVE_DATA));
7378
7379 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7380 [Rn, #offset, MUL VL]. */
7381 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7382 && (code != REG && code != PLUS))
7383 return false;
7384
7385 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7386 REG addressing. */
7387 if (advsimd_struct_p
7388 && !BYTES_BIG_ENDIAN
7389 && (code != POST_INC && code != REG))
7390 return false;
7391
7392 gcc_checking_assert (GET_MODE (x) == VOIDmode
7393 || SCALAR_INT_MODE_P (GET_MODE (x)));
7394
7395 switch (code)
7396 {
7397 case REG:
7398 case SUBREG:
7399 info->type = ADDRESS_REG_IMM;
7400 info->base = x;
7401 info->offset = const0_rtx;
7402 info->const_offset = 0;
7403 return aarch64_base_register_rtx_p (x, strict_p);
7404
7405 case PLUS:
7406 op0 = XEXP (x, 0);
7407 op1 = XEXP (x, 1);
7408
7409 if (! strict_p
7410 && REG_P (op0)
7411 && virt_or_elim_regno_p (REGNO (op0))
7412 && poly_int_rtx_p (op1, &offset))
7413 {
7414 info->type = ADDRESS_REG_IMM;
7415 info->base = op0;
7416 info->offset = op1;
7417 info->const_offset = offset;
7418
7419 return true;
7420 }
7421
7422 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7423 && aarch64_base_register_rtx_p (op0, strict_p)
7424 && poly_int_rtx_p (op1, &offset))
7425 {
7426 info->type = ADDRESS_REG_IMM;
7427 info->base = op0;
7428 info->offset = op1;
7429 info->const_offset = offset;
7430
7431 /* TImode and TFmode values are allowed in both pairs of X
7432 registers and individual Q registers. The available
7433 address modes are:
7434 X,X: 7-bit signed scaled offset
7435 Q: 9-bit signed offset
7436 We conservatively require an offset representable in either mode.
7437 When performing the check for pairs of X registers i.e. LDP/STP
7438 pass down DImode since that is the natural size of the LDP/STP
7439 instruction memory accesses. */
7440 if (mode == TImode || mode == TFmode)
7441 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7442 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7443 || offset_12bit_unsigned_scaled_p (mode, offset)));
7444
7445 /* A 7-bit offset check because OImode will emit an ldp/stp
7446 instruction (only big endian will get here).
7447 For ldp/stp instructions, the offset is scaled for the size of a
7448 single element of the pair. */
7449 if (mode == OImode)
7450 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7451
7452 /* Three 9/12-bit offset checks because CImode will emit three
7453 ldr/str instructions (only big endian will get here). */
7454 if (mode == CImode)
7455 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7456 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7457 offset + 32)
7458 || offset_12bit_unsigned_scaled_p (V16QImode,
7459 offset + 32)));
7460
7461 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7462 instructions (only big endian will get here). */
7463 if (mode == XImode)
7464 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7465 && aarch64_offset_7bit_signed_scaled_p (TImode,
7466 offset + 32));
7467
7468 /* Make "m" use the LD1 offset range for SVE data modes, so
7469 that pre-RTL optimizers like ivopts will work to that
7470 instead of the wider LDR/STR range. */
7471 if (vec_flags == VEC_SVE_DATA)
7472 return (type == ADDR_QUERY_M
7473 ? offset_4bit_signed_scaled_p (mode, offset)
7474 : offset_9bit_signed_scaled_p (mode, offset));
7475
7476 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7477 {
7478 poly_int64 end_offset = (offset
7479 + GET_MODE_SIZE (mode)
7480 - BYTES_PER_SVE_VECTOR);
7481 return (type == ADDR_QUERY_M
7482 ? offset_4bit_signed_scaled_p (mode, offset)
7483 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7484 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7485 end_offset)));
7486 }
7487
7488 if (vec_flags == VEC_SVE_PRED)
7489 return offset_9bit_signed_scaled_p (mode, offset);
7490
7491 if (load_store_pair_p)
7492 return ((known_eq (GET_MODE_SIZE (mode), 4)
7493 || known_eq (GET_MODE_SIZE (mode), 8)
7494 || known_eq (GET_MODE_SIZE (mode), 16))
7495 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7496 else
7497 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7498 || offset_12bit_unsigned_scaled_p (mode, offset));
7499 }
7500
7501 if (allow_reg_index_p)
7502 {
7503 /* Look for base + (scaled/extended) index register. */
7504 if (aarch64_base_register_rtx_p (op0, strict_p)
7505 && aarch64_classify_index (info, op1, mode, strict_p))
7506 {
7507 info->base = op0;
7508 return true;
7509 }
7510 if (aarch64_base_register_rtx_p (op1, strict_p)
7511 && aarch64_classify_index (info, op0, mode, strict_p))
7512 {
7513 info->base = op1;
7514 return true;
7515 }
7516 }
7517
7518 return false;
7519
7520 case POST_INC:
7521 case POST_DEC:
7522 case PRE_INC:
7523 case PRE_DEC:
7524 info->type = ADDRESS_REG_WB;
7525 info->base = XEXP (x, 0);
7526 info->offset = NULL_RTX;
7527 return aarch64_base_register_rtx_p (info->base, strict_p);
7528
7529 case POST_MODIFY:
7530 case PRE_MODIFY:
7531 info->type = ADDRESS_REG_WB;
7532 info->base = XEXP (x, 0);
7533 if (GET_CODE (XEXP (x, 1)) == PLUS
7534 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7535 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7536 && aarch64_base_register_rtx_p (info->base, strict_p))
7537 {
7538 info->offset = XEXP (XEXP (x, 1), 1);
7539 info->const_offset = offset;
7540
7541 /* TImode and TFmode values are allowed in both pairs of X
7542 registers and individual Q registers. The available
7543 address modes are:
7544 X,X: 7-bit signed scaled offset
7545 Q: 9-bit signed offset
7546 We conservatively require an offset representable in either mode.
7547 */
7548 if (mode == TImode || mode == TFmode)
7549 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7550 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7551
7552 if (load_store_pair_p)
7553 return ((known_eq (GET_MODE_SIZE (mode), 4)
7554 || known_eq (GET_MODE_SIZE (mode), 8)
7555 || known_eq (GET_MODE_SIZE (mode), 16))
7556 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7557 else
7558 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7559 }
7560 return false;
7561
7562 case CONST:
7563 case SYMBOL_REF:
7564 case LABEL_REF:
7565 /* load literal: pc-relative constant pool entry. Only supported
7566 for SI mode or larger. */
7567 info->type = ADDRESS_SYMBOLIC;
7568
7569 if (!load_store_pair_p
7570 && GET_MODE_SIZE (mode).is_constant (&const_size)
7571 && const_size >= 4)
7572 {
7573 rtx sym, addend;
7574
7575 split_const (x, &sym, &addend);
7576 return ((GET_CODE (sym) == LABEL_REF
7577 || (GET_CODE (sym) == SYMBOL_REF
7578 && CONSTANT_POOL_ADDRESS_P (sym)
7579 && aarch64_pcrelative_literal_loads)));
7580 }
7581 return false;
7582
7583 case LO_SUM:
7584 info->type = ADDRESS_LO_SUM;
7585 info->base = XEXP (x, 0);
7586 info->offset = XEXP (x, 1);
7587 if (allow_reg_index_p
7588 && aarch64_base_register_rtx_p (info->base, strict_p))
7589 {
7590 rtx sym, offs;
7591 split_const (info->offset, &sym, &offs);
7592 if (GET_CODE (sym) == SYMBOL_REF
7593 && (aarch64_classify_symbol (sym, INTVAL (offs))
7594 == SYMBOL_SMALL_ABSOLUTE))
7595 {
7596 /* The symbol and offset must be aligned to the access size. */
7597 unsigned int align;
7598
7599 if (CONSTANT_POOL_ADDRESS_P (sym))
7600 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7601 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7602 {
7603 tree exp = SYMBOL_REF_DECL (sym);
7604 align = TYPE_ALIGN (TREE_TYPE (exp));
7605 align = aarch64_constant_alignment (exp, align);
7606 }
7607 else if (SYMBOL_REF_DECL (sym))
7608 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7609 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7610 && SYMBOL_REF_BLOCK (sym) != NULL)
7611 align = SYMBOL_REF_BLOCK (sym)->alignment;
7612 else
7613 align = BITS_PER_UNIT;
7614
7615 poly_int64 ref_size = GET_MODE_SIZE (mode);
7616 if (known_eq (ref_size, 0))
7617 ref_size = GET_MODE_SIZE (DImode);
7618
7619 return (multiple_p (INTVAL (offs), ref_size)
7620 && multiple_p (align / BITS_PER_UNIT, ref_size));
7621 }
7622 }
7623 return false;
7624
7625 default:
7626 return false;
7627 }
7628 }
7629
7630 /* Return true if the address X is valid for a PRFM instruction.
7631 STRICT_P is true if we should do strict checking with
7632 aarch64_classify_address. */
7633
7634 bool
7635 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7636 {
7637 struct aarch64_address_info addr;
7638
7639 /* PRFM accepts the same addresses as DImode... */
7640 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7641 if (!res)
7642 return false;
7643
7644 /* ... except writeback forms. */
7645 return addr.type != ADDRESS_REG_WB;
7646 }
7647
7648 bool
7649 aarch64_symbolic_address_p (rtx x)
7650 {
7651 rtx offset;
7652
7653 split_const (x, &x, &offset);
7654 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7655 }
7656
7657 /* Classify the base of symbolic expression X. */
7658
7659 enum aarch64_symbol_type
7660 aarch64_classify_symbolic_expression (rtx x)
7661 {
7662 rtx offset;
7663
7664 split_const (x, &x, &offset);
7665 return aarch64_classify_symbol (x, INTVAL (offset));
7666 }
7667
7668
7669 /* Return TRUE if X is a legitimate address for accessing memory in
7670 mode MODE. */
7671 static bool
7672 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7673 {
7674 struct aarch64_address_info addr;
7675
7676 return aarch64_classify_address (&addr, x, mode, strict_p);
7677 }
7678
7679 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7680 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7681 bool
7682 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7683 aarch64_addr_query_type type)
7684 {
7685 struct aarch64_address_info addr;
7686
7687 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7688 }
7689
7690 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7691
7692 static bool
7693 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7694 poly_int64 orig_offset,
7695 machine_mode mode)
7696 {
7697 HOST_WIDE_INT size;
7698 if (GET_MODE_SIZE (mode).is_constant (&size))
7699 {
7700 HOST_WIDE_INT const_offset, second_offset;
7701
7702 /* A general SVE offset is A * VQ + B. Remove the A component from
7703 coefficient 0 in order to get the constant B. */
7704 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7705
7706 /* Split an out-of-range address displacement into a base and
7707 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7708 range otherwise to increase opportunities for sharing the base
7709 address of different sizes. Unaligned accesses use the signed
7710 9-bit range, TImode/TFmode use the intersection of signed
7711 scaled 7-bit and signed 9-bit offset. */
7712 if (mode == TImode || mode == TFmode)
7713 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7714 else if ((const_offset & (size - 1)) != 0)
7715 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7716 else
7717 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7718
7719 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7720 return false;
7721
7722 /* Split the offset into second_offset and the rest. */
7723 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7724 *offset2 = gen_int_mode (second_offset, Pmode);
7725 return true;
7726 }
7727 else
7728 {
7729 /* Get the mode we should use as the basis of the range. For structure
7730 modes this is the mode of one vector. */
7731 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7732 machine_mode step_mode
7733 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7734
7735 /* Get the "mul vl" multiplier we'd like to use. */
7736 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7737 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7738 if (vec_flags & VEC_SVE_DATA)
7739 /* LDR supports a 9-bit range, but the move patterns for
7740 structure modes require all vectors to be in range of the
7741 same base. The simplest way of accommodating that while still
7742 promoting reuse of anchor points between different modes is
7743 to use an 8-bit range unconditionally. */
7744 vnum = ((vnum + 128) & 255) - 128;
7745 else
7746 /* Predicates are only handled singly, so we might as well use
7747 the full range. */
7748 vnum = ((vnum + 256) & 511) - 256;
7749 if (vnum == 0)
7750 return false;
7751
7752 /* Convert the "mul vl" multiplier into a byte offset. */
7753 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7754 if (known_eq (second_offset, orig_offset))
7755 return false;
7756
7757 /* Split the offset into second_offset and the rest. */
7758 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7759 *offset2 = gen_int_mode (second_offset, Pmode);
7760 return true;
7761 }
7762 }
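
/* Illustrative sketch, not part of GCC: the constant-size splitting
   rules above, reduced to plain integers.  SIZE is the access size in
   bytes and SIZE == 16 stands in for the TImode/TFmode case; the
   helper name and the simplified interface are assumptions.  */
static void
example_split_offset (long long offset, int size,
                      long long *anchor, long long *residual)
{
  long long second;
  if (size == 16)
    /* Intersection of the scaled 7-bit and signed 9-bit ranges.  */
    second = ((offset + 0x100) & 0x1f8) - 0x100;
  else if ((offset & (size - 1)) != 0)
    /* Unaligned accesses use the signed 9-bit unscaled range.  */
    second = ((offset + 0x100) & 0x1ff) - 0x100;
  else
    /* Aligned accesses use a 4KB range for 1- and 2-byte accesses
       and a 16KB range otherwise.  */
    second = offset & (size < 4 ? 0xfff : 0x3ffc);
  *residual = second;
  *anchor = offset - second;
  /* e.g. offset == 0x12344 with size == 4 gives *residual == 0x2344
     and *anchor == 0x10000.  */
}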
7763
7764 /* Return the binary representation of floating point constant VALUE in INTVAL.
7765 If the value cannot be converted, return false without setting INTVAL.
7766 The conversion is done in the mode of VALUE. */
7767 bool
7768 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7769 {
7770
7771 /* We make a general exception for 0. */
7772 if (aarch64_float_const_zero_rtx_p (value))
7773 {
7774 *intval = 0;
7775 return true;
7776 }
7777
7778 scalar_float_mode mode;
7779 if (GET_CODE (value) != CONST_DOUBLE
7780 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7781 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7782 /* Only support up to DF mode. */
7783 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7784 return false;
7785
7786 unsigned HOST_WIDE_INT ival = 0;
7787
7788 long res[2];
7789 real_to_target (res,
7790 CONST_DOUBLE_REAL_VALUE (value),
7791 REAL_MODE_FORMAT (mode));
7792
7793 if (mode == DFmode)
7794 {
7795 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7796 ival = zext_hwi (res[order], 32);
7797 ival |= (zext_hwi (res[1 - order], 32) << 32);
7798 }
7799 else
7800 ival = zext_hwi (res[0], 32);
7801
7802 *intval = ival;
7803 return true;
7804 }
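
/* Illustrative sketch, not part of GCC: what the function above
   produces for a DFmode constant, written as a plain host-side type
   pun.  The real code goes through real_to_target so that it also
   handles HFmode/SFmode and hosts whose native double need not match
   the target format; the helper name is an assumption.  */
static unsigned long long
example_double_bits (double d)
{
  union { double f; unsigned long long u; } pun;
  pun.f = d;           /* reinterpret the bits, not the value */
  return pun.u;
  /* e.g. example_double_bits (1.0) == 0x3ff0000000000000 on an
     IEEE-754 host.  */
}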
7805
7806 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7807 single MOV(+MOVK) followed by an FMOV. */
7808 bool
7809 aarch64_float_const_rtx_p (rtx x)
7810 {
7811 machine_mode mode = GET_MODE (x);
7812 if (mode == VOIDmode)
7813 return false;
7814
7815 /* Determine whether it's cheaper to write float constants as
7816 mov/movk pairs over ldr/adrp pairs. */
7817 unsigned HOST_WIDE_INT ival;
7818
7819 if (GET_CODE (x) == CONST_DOUBLE
7820 && SCALAR_FLOAT_MODE_P (mode)
7821 && aarch64_reinterpret_float_as_int (x, &ival))
7822 {
7823 scalar_int_mode imode = (mode == HFmode
7824 ? SImode
7825 : int_mode_for_mode (mode).require ());
7826 int num_instr = aarch64_internal_mov_immediate
7827 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7828 return num_instr < 3;
7829 }
7830
7831 return false;
7832 }
7833
7834 /* Return TRUE if rtx X is the immediate constant 0.0. */
7835 bool
7836 aarch64_float_const_zero_rtx_p (rtx x)
7837 {
7838 if (GET_MODE (x) == VOIDmode)
7839 return false;
7840
7841 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7842 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7843 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7844 }
7845
7846 /* Return TRUE if rtx X is an immediate constant that fits in a single
7847 MOVI immediate operation. */
7848 bool
7849 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7850 {
7851 if (!TARGET_SIMD)
7852 return false;
7853
7854 machine_mode vmode;
7855 scalar_int_mode imode;
7856 unsigned HOST_WIDE_INT ival;
7857
7858 if (GET_CODE (x) == CONST_DOUBLE
7859 && SCALAR_FLOAT_MODE_P (mode))
7860 {
7861 if (!aarch64_reinterpret_float_as_int (x, &ival))
7862 return false;
7863
7864 /* We make a general exception for 0. */
7865 if (aarch64_float_const_zero_rtx_p (x))
7866 return true;
7867
7868 imode = int_mode_for_mode (mode).require ();
7869 }
7870 else if (GET_CODE (x) == CONST_INT
7871 && is_a <scalar_int_mode> (mode, &imode))
7872 ival = INTVAL (x);
7873 else
7874 return false;
7875
7876 /* Use a 64-bit container mode for everything except DImode/DFmode, where we
7877 use a 128-bit vector mode. */
7878 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7879
7880 vmode = aarch64_simd_container_mode (imode, width);
7881 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7882
7883 return aarch64_simd_valid_immediate (v_op, NULL);
7884 }
7885
7886
7887 /* Return the fixed registers used for condition codes. */
7888
7889 static bool
7890 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7891 {
7892 *p1 = CC_REGNUM;
7893 *p2 = INVALID_REGNUM;
7894 return true;
7895 }
7896
7897 /* This function is used by the call expanders of the machine description.
7898 RESULT is the register in which the result is returned. It's NULL for
7899 "call" and "sibcall".
7900 MEM is the location of the function call.
7901 SIBCALL indicates whether this function call is a normal call or a sibling call.
7902 It will generate a different pattern accordingly. */
7903
7904 void
7905 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7906 {
7907 rtx call, callee, tmp;
7908 rtvec vec;
7909 machine_mode mode;
7910
7911 gcc_assert (MEM_P (mem));
7912 callee = XEXP (mem, 0);
7913 mode = GET_MODE (callee);
7914 gcc_assert (mode == Pmode);
7915
7916 /* Decide if we should generate indirect calls by loading the
7917 address of the callee into a register before performing
7918 the branch-and-link. */
7919 if (SYMBOL_REF_P (callee)
7920 ? (aarch64_is_long_call_p (callee)
7921 || aarch64_is_noplt_call_p (callee))
7922 : !REG_P (callee))
7923 XEXP (mem, 0) = force_reg (mode, callee);
7924
7925 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7926
7927 if (result != NULL_RTX)
7928 call = gen_rtx_SET (result, call);
7929
7930 if (sibcall)
7931 tmp = ret_rtx;
7932 else
7933 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7934
7935 vec = gen_rtvec (2, call, tmp);
7936 call = gen_rtx_PARALLEL (VOIDmode, vec);
7937
7938 aarch64_emit_call_insn (call);
7939 }
7940
7941 /* Emit call insn with PAT and do aarch64-specific handling. */
7942
7943 void
7944 aarch64_emit_call_insn (rtx pat)
7945 {
7946 rtx insn = emit_call_insn (pat);
7947
7948 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7949 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7950 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7951 }
7952
7953 machine_mode
7954 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7955 {
7956 machine_mode mode_x = GET_MODE (x);
7957 rtx_code code_x = GET_CODE (x);
7958
7959 /* All floating point compares return CCFP if it is an equality
7960 comparison, and CCFPE otherwise. */
7961 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7962 {
7963 switch (code)
7964 {
7965 case EQ:
7966 case NE:
7967 case UNORDERED:
7968 case ORDERED:
7969 case UNLT:
7970 case UNLE:
7971 case UNGT:
7972 case UNGE:
7973 case UNEQ:
7974 return CCFPmode;
7975
7976 case LT:
7977 case LE:
7978 case GT:
7979 case GE:
7980 case LTGT:
7981 return CCFPEmode;
7982
7983 default:
7984 gcc_unreachable ();
7985 }
7986 }
7987
7988 /* Equality comparisons of short modes against zero can be performed
7989 using the TST instruction with the appropriate bitmask. */
7990 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7991 && (code == EQ || code == NE)
7992 && (mode_x == HImode || mode_x == QImode))
7993 return CC_NZmode;
7994
7995 /* Similarly, comparisons of zero_extends from shorter modes can
7996 be performed using an ANDS with an immediate mask. */
7997 if (y == const0_rtx && code_x == ZERO_EXTEND
7998 && (mode_x == SImode || mode_x == DImode)
7999 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8000 && (code == EQ || code == NE))
8001 return CC_NZmode;
8002
8003 if ((mode_x == SImode || mode_x == DImode)
8004 && y == const0_rtx
8005 && (code == EQ || code == NE || code == LT || code == GE)
8006 && (code_x == PLUS || code_x == MINUS || code_x == AND
8007 || code_x == NEG
8008 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8009 && CONST_INT_P (XEXP (x, 2)))))
8010 return CC_NZmode;
8011
8012 /* A compare with a shifted operand. Because of canonicalization,
8013 the comparison will have to be swapped when we emit the assembly
8014 code. */
8015 if ((mode_x == SImode || mode_x == DImode)
8016 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8017 && (code_x == ASHIFT || code_x == ASHIFTRT
8018 || code_x == LSHIFTRT
8019 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8020 return CC_SWPmode;
8021
8022 /* Similarly for a negated operand, but we can only do this for
8023 equalities. */
8024 if ((mode_x == SImode || mode_x == DImode)
8025 && (REG_P (y) || GET_CODE (y) == SUBREG)
8026 && (code == EQ || code == NE)
8027 && code_x == NEG)
8028 return CC_Zmode;
8029
8030 /* A test for unsigned overflow from an addition. */
8031 if ((mode_x == DImode || mode_x == TImode)
8032 && (code == LTU || code == GEU)
8033 && code_x == PLUS
8034 && rtx_equal_p (XEXP (x, 0), y))
8035 return CC_Cmode;
8036
8037 /* A test for unsigned overflow from an add with carry. */
8038 if ((mode_x == DImode || mode_x == TImode)
8039 && (code == LTU || code == GEU)
8040 && code_x == PLUS
8041 && CONST_SCALAR_INT_P (y)
8042 && (rtx_mode_t (y, mode_x)
8043 == (wi::shwi (1, mode_x)
8044 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8045 return CC_ADCmode;
8046
8047 /* A test for signed overflow. */
8048 if ((mode_x == DImode || mode_x == TImode)
8049 && code == NE
8050 && code_x == PLUS
8051 && GET_CODE (y) == SIGN_EXTEND)
8052 return CC_Vmode;
8053
8054 /* For everything else, return CCmode. */
8055 return CCmode;
8056 }
8057
8058 static int
8059 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8060
8061 int
8062 aarch64_get_condition_code (rtx x)
8063 {
8064 machine_mode mode = GET_MODE (XEXP (x, 0));
8065 enum rtx_code comp_code = GET_CODE (x);
8066
8067 if (GET_MODE_CLASS (mode) != MODE_CC)
8068 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8069 return aarch64_get_condition_code_1 (mode, comp_code);
8070 }
8071
8072 static int
8073 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8074 {
8075 switch (mode)
8076 {
8077 case E_CCFPmode:
8078 case E_CCFPEmode:
8079 switch (comp_code)
8080 {
8081 case GE: return AARCH64_GE;
8082 case GT: return AARCH64_GT;
8083 case LE: return AARCH64_LS;
8084 case LT: return AARCH64_MI;
8085 case NE: return AARCH64_NE;
8086 case EQ: return AARCH64_EQ;
8087 case ORDERED: return AARCH64_VC;
8088 case UNORDERED: return AARCH64_VS;
8089 case UNLT: return AARCH64_LT;
8090 case UNLE: return AARCH64_LE;
8091 case UNGT: return AARCH64_HI;
8092 case UNGE: return AARCH64_PL;
8093 default: return -1;
8094 }
8095 break;
8096
8097 case E_CCmode:
8098 switch (comp_code)
8099 {
8100 case NE: return AARCH64_NE;
8101 case EQ: return AARCH64_EQ;
8102 case GE: return AARCH64_GE;
8103 case GT: return AARCH64_GT;
8104 case LE: return AARCH64_LE;
8105 case LT: return AARCH64_LT;
8106 case GEU: return AARCH64_CS;
8107 case GTU: return AARCH64_HI;
8108 case LEU: return AARCH64_LS;
8109 case LTU: return AARCH64_CC;
8110 default: return -1;
8111 }
8112 break;
8113
8114 case E_CC_SWPmode:
8115 switch (comp_code)
8116 {
8117 case NE: return AARCH64_NE;
8118 case EQ: return AARCH64_EQ;
8119 case GE: return AARCH64_LE;
8120 case GT: return AARCH64_LT;
8121 case LE: return AARCH64_GE;
8122 case LT: return AARCH64_GT;
8123 case GEU: return AARCH64_LS;
8124 case GTU: return AARCH64_CC;
8125 case LEU: return AARCH64_CS;
8126 case LTU: return AARCH64_HI;
8127 default: return -1;
8128 }
8129 break;
8130
8131 case E_CC_NZCmode:
8132 switch (comp_code)
8133 {
8134 case NE: return AARCH64_NE; /* = any */
8135 case EQ: return AARCH64_EQ; /* = none */
8136 case GE: return AARCH64_PL; /* = nfrst */
8137 case LT: return AARCH64_MI; /* = first */
8138 case GEU: return AARCH64_CS; /* = nlast */
8139 case GTU: return AARCH64_HI; /* = pmore */
8140 case LEU: return AARCH64_LS; /* = plast */
8141 case LTU: return AARCH64_CC; /* = last */
8142 default: return -1;
8143 }
8144 break;
8145
8146 case E_CC_NZmode:
8147 switch (comp_code)
8148 {
8149 case NE: return AARCH64_NE;
8150 case EQ: return AARCH64_EQ;
8151 case GE: return AARCH64_PL;
8152 case LT: return AARCH64_MI;
8153 default: return -1;
8154 }
8155 break;
8156
8157 case E_CC_Zmode:
8158 switch (comp_code)
8159 {
8160 case NE: return AARCH64_NE;
8161 case EQ: return AARCH64_EQ;
8162 default: return -1;
8163 }
8164 break;
8165
8166 case E_CC_Cmode:
8167 switch (comp_code)
8168 {
8169 case LTU: return AARCH64_CS;
8170 case GEU: return AARCH64_CC;
8171 default: return -1;
8172 }
8173 break;
8174
8175 case E_CC_ADCmode:
8176 switch (comp_code)
8177 {
8178 case GEU: return AARCH64_CS;
8179 case LTU: return AARCH64_CC;
8180 default: return -1;
8181 }
8182 break;
8183
8184 case E_CC_Vmode:
8185 switch (comp_code)
8186 {
8187 case NE: return AARCH64_VS;
8188 case EQ: return AARCH64_VC;
8189 default: return -1;
8190 }
8191 break;
8192
8193 default:
8194 return -1;
8195 }
8196
8197 return -1;
8198 }
8199
8200 bool
8201 aarch64_const_vec_all_same_in_range_p (rtx x,
8202 HOST_WIDE_INT minval,
8203 HOST_WIDE_INT maxval)
8204 {
8205 rtx elt;
8206 return (const_vec_duplicate_p (x, &elt)
8207 && CONST_INT_P (elt)
8208 && IN_RANGE (INTVAL (elt), minval, maxval));
8209 }
8210
8211 bool
8212 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8213 {
8214 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8215 }
8216
8217 /* Return true if VEC is a constant in which every element is in the range
8218 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8219
8220 static bool
8221 aarch64_const_vec_all_in_range_p (rtx vec,
8222 HOST_WIDE_INT minval,
8223 HOST_WIDE_INT maxval)
8224 {
8225 if (GET_CODE (vec) != CONST_VECTOR
8226 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8227 return false;
8228
8229 int nunits;
8230 if (!CONST_VECTOR_STEPPED_P (vec))
8231 nunits = const_vector_encoded_nelts (vec);
8232 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8233 return false;
8234
8235 for (int i = 0; i < nunits; i++)
8236 {
8237 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8238 if (!CONST_INT_P (vec_elem)
8239 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8240 return false;
8241 }
8242 return true;
8243 }
8244
8245 /* N Z C V. */
8246 #define AARCH64_CC_V 1
8247 #define AARCH64_CC_C (1 << 1)
8248 #define AARCH64_CC_Z (1 << 2)
8249 #define AARCH64_CC_N (1 << 3)
8250
8251 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8252 static const int aarch64_nzcv_codes[] =
8253 {
8254 0, /* EQ, Z == 1. */
8255 AARCH64_CC_Z, /* NE, Z == 0. */
8256 0, /* CS, C == 1. */
8257 AARCH64_CC_C, /* CC, C == 0. */
8258 0, /* MI, N == 1. */
8259 AARCH64_CC_N, /* PL, N == 0. */
8260 0, /* VS, V == 1. */
8261 AARCH64_CC_V, /* VC, V == 0. */
8262 0, /* HI, C == 1 && Z == 0. */
8263 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8264 AARCH64_CC_V, /* GE, N == V. */
8265 0, /* LT, N != V. */
8266 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8267 0, /* LE, !(Z == 0 && N == V). */
8268 0, /* AL, Any. */
8269 0 /* NV, Any. */
8270 };
8271
8272 /* Print floating-point vector immediate operand X to F, negating it
8273 first if NEGATE is true. Return true on success, false if it isn't
8274 a constant we can handle. */
8275
8276 static bool
8277 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8278 {
8279 rtx elt;
8280
8281 if (!const_vec_duplicate_p (x, &elt))
8282 return false;
8283
8284 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8285 if (negate)
8286 r = real_value_negate (&r);
8287
8288 /* Handle the SVE single-bit immediates specially, since they have a
8289 fixed form in the assembly syntax. */
8290 if (real_equal (&r, &dconst0))
8291 asm_fprintf (f, "0.0");
8292 else if (real_equal (&r, &dconst2))
8293 asm_fprintf (f, "2.0");
8294 else if (real_equal (&r, &dconst1))
8295 asm_fprintf (f, "1.0");
8296 else if (real_equal (&r, &dconsthalf))
8297 asm_fprintf (f, "0.5");
8298 else
8299 {
8300 const int buf_size = 20;
8301 char float_buf[buf_size] = {'\0'};
8302 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8303 1, GET_MODE (elt));
8304 asm_fprintf (f, "%s", float_buf);
8305 }
8306
8307 return true;
8308 }
8309
8310 /* Return the equivalent letter for size. */
8311 static char
8312 sizetochar (int size)
8313 {
8314 switch (size)
8315 {
8316 case 64: return 'd';
8317 case 32: return 's';
8318 case 16: return 'h';
8319 case 8 : return 'b';
8320 default: gcc_unreachable ();
8321 }
8322 }
8323
8324 /* Print operand X to file F in a target specific manner according to CODE.
8325 The acceptable formatting commands given by CODE are:
8326 'c': An integer or symbol address without a preceding #
8327 sign.
8328 'C': Take the duplicated element in a vector constant
8329 and print it in hex.
8330 'D': Take the duplicated element in a vector constant
8331 and print it as an unsigned integer, in decimal.
8332 'e': Print the sign/zero-extend size as a character 8->b,
8333 16->h, 32->w. Can also be used for masks:
8334 0xff->b, 0xffff->h, 0xffffffff->w.
8335 'I': If the operand is a duplicated vector constant,
8336 replace it with the duplicated scalar. If the
8337 operand is then a floating-point constant, replace
8338 it with the integer bit representation. Print the
8339 transformed constant as a signed decimal number.
8340 'p': Prints N such that 2^N == X (X must be a power of 2 and
8341 a const int).
8342 'P': Print the number of non-zero bits in X (a const_int).
8343 'H': Print the higher numbered register of a pair (TImode)
8344 of regs.
8345 'm': Print a condition (eq, ne, etc).
8346 'M': Same as 'm', but invert condition.
8347 'N': Take the duplicated element in a vector constant
8348 and print the negative of it in decimal.
8349 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8350 'S/T/U/V': Print a FP/SIMD register name for a register list.
8351 The register printed is the FP/SIMD register name
8352 of X + 0/1/2/3 for S/T/U/V.
8353 'R': Print a scalar FP/SIMD register name + 1.
8354 'X': Print bottom 16 bits of integer constant in hex.
8355 'w/x': Print a general register name or the zero register
8356 (32-bit or 64-bit).
8357 '0': Print a normal operand; if it's a general register,
8358 we assume DImode.
8359 'k': Print NZCV for conditional compare instructions.
8360 'A': Output address constant representing the first
8361 argument of X, specifying a relocation offset
8362 if appropriate.
8363 'L': Output constant address specified by X
8364 with a relocation offset if appropriate.
8365 'G': Prints address of X, specifying a PC relative
8366 relocation mode if appropriate.
8367 'y': Output address of LDP or STP - this is used for
8368 some LDP/STPs which don't use a PARALLEL in their
8369 pattern (so the mode needs to be adjusted).
8370 'z': Output address of a typical LDP or STP. */
8371
8372 static void
8373 aarch64_print_operand (FILE *f, rtx x, int code)
8374 {
8375 rtx elt;
8376 switch (code)
8377 {
8378 case 'c':
8379 switch (GET_CODE (x))
8380 {
8381 case CONST_INT:
8382 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8383 break;
8384
8385 case SYMBOL_REF:
8386 output_addr_const (f, x);
8387 break;
8388
8389 case CONST:
8390 if (GET_CODE (XEXP (x, 0)) == PLUS
8391 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8392 {
8393 output_addr_const (f, x);
8394 break;
8395 }
8396 /* Fall through. */
8397
8398 default:
8399 output_operand_lossage ("unsupported operand for code '%c'", code);
8400 }
8401 break;
8402
8403 case 'e':
8404 {
8405 x = unwrap_const_vec_duplicate (x);
8406 if (!CONST_INT_P (x))
8407 {
8408 output_operand_lossage ("invalid operand for '%%%c'", code);
8409 return;
8410 }
8411
8412 HOST_WIDE_INT val = INTVAL (x);
8413 if ((val & ~7) == 8 || val == 0xff)
8414 fputc ('b', f);
8415 else if ((val & ~7) == 16 || val == 0xffff)
8416 fputc ('h', f);
8417 else if ((val & ~7) == 32 || val == 0xffffffff)
8418 fputc ('w', f);
8419 else
8420 {
8421 output_operand_lossage ("invalid operand for '%%%c'", code);
8422 return;
8423 }
8424 }
8425 break;
8426
8427 case 'p':
8428 {
8429 int n;
8430
8431 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8432 {
8433 output_operand_lossage ("invalid operand for '%%%c'", code);
8434 return;
8435 }
8436
8437 asm_fprintf (f, "%d", n);
8438 }
8439 break;
8440
8441 case 'P':
8442 if (!CONST_INT_P (x))
8443 {
8444 output_operand_lossage ("invalid operand for '%%%c'", code);
8445 return;
8446 }
8447
8448 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8449 break;
8450
8451 case 'H':
8452 if (x == const0_rtx)
8453 {
8454 asm_fprintf (f, "xzr");
8455 break;
8456 }
8457
8458 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8459 {
8460 output_operand_lossage ("invalid operand for '%%%c'", code);
8461 return;
8462 }
8463
8464 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8465 break;
8466
8467 case 'I':
8468 {
8469 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8470 if (CONST_INT_P (x))
8471 asm_fprintf (f, "%wd", INTVAL (x));
8472 else
8473 {
8474 output_operand_lossage ("invalid operand for '%%%c'", code);
8475 return;
8476 }
8477 break;
8478 }
8479
8480 case 'M':
8481 case 'm':
8482 {
8483 int cond_code;
8484 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8485 if (x == const_true_rtx)
8486 {
8487 if (code == 'M')
8488 fputs ("nv", f);
8489 return;
8490 }
8491
8492 if (!COMPARISON_P (x))
8493 {
8494 output_operand_lossage ("invalid operand for '%%%c'", code);
8495 return;
8496 }
8497
8498 cond_code = aarch64_get_condition_code (x);
8499 gcc_assert (cond_code >= 0);
8500 if (code == 'M')
8501 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8502 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8503 fputs (aarch64_sve_condition_codes[cond_code], f);
8504 else
8505 fputs (aarch64_condition_codes[cond_code], f);
8506 }
8507 break;
8508
8509 case 'N':
8510 if (!const_vec_duplicate_p (x, &elt))
8511 {
8512 output_operand_lossage ("invalid vector constant");
8513 return;
8514 }
8515
8516 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8517 asm_fprintf (f, "%wd", -INTVAL (elt));
8518 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8519 && aarch64_print_vector_float_operand (f, x, true))
8520 ;
8521 else
8522 {
8523 output_operand_lossage ("invalid vector constant");
8524 return;
8525 }
8526 break;
8527
8528 case 'b':
8529 case 'h':
8530 case 's':
8531 case 'd':
8532 case 'q':
8533 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8534 {
8535 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8536 return;
8537 }
8538 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8539 break;
8540
8541 case 'S':
8542 case 'T':
8543 case 'U':
8544 case 'V':
8545 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8546 {
8547 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8548 return;
8549 }
8550 asm_fprintf (f, "%c%d",
8551 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8552 REGNO (x) - V0_REGNUM + (code - 'S'));
8553 break;
8554
8555 case 'R':
8556 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8557 {
8558 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8559 return;
8560 }
8561 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8562 break;
8563
8564 case 'X':
8565 if (!CONST_INT_P (x))
8566 {
8567 output_operand_lossage ("invalid operand for '%%%c'", code);
8568 return;
8569 }
8570 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8571 break;
8572
8573 case 'C':
8574 {
8575 /* Print a replicated constant in hex. */
8576 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8577 {
8578 output_operand_lossage ("invalid operand for '%%%c'", code);
8579 return;
8580 }
8581 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8582 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8583 }
8584 break;
8585
8586 case 'D':
8587 {
8588 /* Print a replicated constant in decimal, treating it as
8589 unsigned. */
8590 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8591 {
8592 output_operand_lossage ("invalid operand for '%%%c'", code);
8593 return;
8594 }
8595 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8596 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8597 }
8598 break;
8599
8600 case 'w':
8601 case 'x':
8602 if (x == const0_rtx
8603 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8604 {
8605 asm_fprintf (f, "%czr", code);
8606 break;
8607 }
8608
8609 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8610 {
8611 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8612 break;
8613 }
8614
8615 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8616 {
8617 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8618 break;
8619 }
8620
8621 /* Fall through */
8622
8623 case 0:
8624 if (x == NULL)
8625 {
8626 output_operand_lossage ("missing operand");
8627 return;
8628 }
8629
8630 switch (GET_CODE (x))
8631 {
8632 case REG:
8633 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8634 {
8635 if (REG_NREGS (x) == 1)
8636 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8637 else
8638 {
8639 char suffix
8640 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8641 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8642 REGNO (x) - V0_REGNUM, suffix,
8643 END_REGNO (x) - V0_REGNUM - 1, suffix);
8644 }
8645 }
8646 else
8647 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8648 break;
8649
8650 case MEM:
8651 output_address (GET_MODE (x), XEXP (x, 0));
8652 break;
8653
8654 case LABEL_REF:
8655 case SYMBOL_REF:
8656 output_addr_const (asm_out_file, x);
8657 break;
8658
8659 case CONST_INT:
8660 asm_fprintf (f, "%wd", INTVAL (x));
8661 break;
8662
8663 case CONST:
8664 if (!VECTOR_MODE_P (GET_MODE (x)))
8665 {
8666 output_addr_const (asm_out_file, x);
8667 break;
8668 }
8669 /* fall through */
8670
8671 case CONST_VECTOR:
8672 if (!const_vec_duplicate_p (x, &elt))
8673 {
8674 output_operand_lossage ("invalid vector constant");
8675 return;
8676 }
8677
8678 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8679 asm_fprintf (f, "%wd", INTVAL (elt));
8680 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8681 && aarch64_print_vector_float_operand (f, x, false))
8682 ;
8683 else
8684 {
8685 output_operand_lossage ("invalid vector constant");
8686 return;
8687 }
8688 break;
8689
8690 case CONST_DOUBLE:
8691 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8692 be getting CONST_DOUBLEs holding integers. */
8693 gcc_assert (GET_MODE (x) != VOIDmode);
8694 if (aarch64_float_const_zero_rtx_p (x))
8695 {
8696 fputc ('0', f);
8697 break;
8698 }
8699 else if (aarch64_float_const_representable_p (x))
8700 {
8701 #define buf_size 20
8702 char float_buf[buf_size] = {'\0'};
8703 real_to_decimal_for_mode (float_buf,
8704 CONST_DOUBLE_REAL_VALUE (x),
8705 buf_size, buf_size,
8706 1, GET_MODE (x));
8707 asm_fprintf (asm_out_file, "%s", float_buf);
8708 break;
8709 #undef buf_size
8710 }
8711 output_operand_lossage ("invalid constant");
8712 return;
8713 default:
8714 output_operand_lossage ("invalid operand");
8715 return;
8716 }
8717 break;
8718
8719 case 'A':
8720 if (GET_CODE (x) == HIGH)
8721 x = XEXP (x, 0);
8722
8723 switch (aarch64_classify_symbolic_expression (x))
8724 {
8725 case SYMBOL_SMALL_GOT_4G:
8726 asm_fprintf (asm_out_file, ":got:");
8727 break;
8728
8729 case SYMBOL_SMALL_TLSGD:
8730 asm_fprintf (asm_out_file, ":tlsgd:");
8731 break;
8732
8733 case SYMBOL_SMALL_TLSDESC:
8734 asm_fprintf (asm_out_file, ":tlsdesc:");
8735 break;
8736
8737 case SYMBOL_SMALL_TLSIE:
8738 asm_fprintf (asm_out_file, ":gottprel:");
8739 break;
8740
8741 case SYMBOL_TLSLE24:
8742 asm_fprintf (asm_out_file, ":tprel:");
8743 break;
8744
8745 case SYMBOL_TINY_GOT:
8746 gcc_unreachable ();
8747 break;
8748
8749 default:
8750 break;
8751 }
8752 output_addr_const (asm_out_file, x);
8753 break;
8754
8755 case 'L':
8756 switch (aarch64_classify_symbolic_expression (x))
8757 {
8758 case SYMBOL_SMALL_GOT_4G:
8759 asm_fprintf (asm_out_file, ":lo12:");
8760 break;
8761
8762 case SYMBOL_SMALL_TLSGD:
8763 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8764 break;
8765
8766 case SYMBOL_SMALL_TLSDESC:
8767 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8768 break;
8769
8770 case SYMBOL_SMALL_TLSIE:
8771 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8772 break;
8773
8774 case SYMBOL_TLSLE12:
8775 asm_fprintf (asm_out_file, ":tprel_lo12:");
8776 break;
8777
8778 case SYMBOL_TLSLE24:
8779 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8780 break;
8781
8782 case SYMBOL_TINY_GOT:
8783 asm_fprintf (asm_out_file, ":got:");
8784 break;
8785
8786 case SYMBOL_TINY_TLSIE:
8787 asm_fprintf (asm_out_file, ":gottprel:");
8788 break;
8789
8790 default:
8791 break;
8792 }
8793 output_addr_const (asm_out_file, x);
8794 break;
8795
8796 case 'G':
8797 switch (aarch64_classify_symbolic_expression (x))
8798 {
8799 case SYMBOL_TLSLE24:
8800 asm_fprintf (asm_out_file, ":tprel_hi12:");
8801 break;
8802 default:
8803 break;
8804 }
8805 output_addr_const (asm_out_file, x);
8806 break;
8807
8808 case 'k':
8809 {
8810 HOST_WIDE_INT cond_code;
8811
8812 if (!CONST_INT_P (x))
8813 {
8814 output_operand_lossage ("invalid operand for '%%%c'", code);
8815 return;
8816 }
8817
8818 cond_code = INTVAL (x);
8819 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8820 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8821 }
8822 break;
8823
8824 case 'y':
8825 case 'z':
8826 {
8827 machine_mode mode = GET_MODE (x);
8828
8829 if (GET_CODE (x) != MEM
8830 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8831 {
8832 output_operand_lossage ("invalid operand for '%%%c'", code);
8833 return;
8834 }
8835
8836 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8837 code == 'y'
8838 ? ADDR_QUERY_LDP_STP_N
8839 : ADDR_QUERY_LDP_STP))
8840 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8841 }
8842 break;
8843
8844 default:
8845 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8846 return;
8847 }
8848 }
8849
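/* For illustration (an informal summary of the cases above, not
exhaustive): '%w' and '%x' print a general register as w<n>/x<n>, or
wzr/xzr for zero and wsp/sp for the stack pointer; '%b', '%h', '%s',
'%d' and '%q' print an FP/SIMD register with the given size prefix;
'%C' and '%D' print a replicated vector constant in hex and in
unsigned decimal; '%k' prints the NZCV immediate used by
conditional-compare instructions. */
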
8850 /* Print address 'x' of a memory access with mode 'mode'.
8851 'type' is the aarch64_addr_query_type context required by
8852 aarch64_classify_address, e.g. a normal access or an LDP/STP address. */
8853 static bool
8854 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8855 aarch64_addr_query_type type)
8856 {
8857 struct aarch64_address_info addr;
8858 unsigned int size;
8859
8860 /* Check all addresses are Pmode - including ILP32. */
8861 if (GET_MODE (x) != Pmode
8862 && (!CONST_INT_P (x)
8863 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8864 {
8865 output_operand_lossage ("invalid address mode");
8866 return false;
8867 }
8868
8869 if (aarch64_classify_address (&addr, x, mode, true, type))
8870 switch (addr.type)
8871 {
8872 case ADDRESS_REG_IMM:
8873 if (known_eq (addr.const_offset, 0))
8874 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8875 else if (aarch64_sve_data_mode_p (mode))
8876 {
8877 HOST_WIDE_INT vnum
8878 = exact_div (addr.const_offset,
8879 BYTES_PER_SVE_VECTOR).to_constant ();
8880 asm_fprintf (f, "[%s, #%wd, mul vl]",
8881 reg_names[REGNO (addr.base)], vnum);
8882 }
8883 else if (aarch64_sve_pred_mode_p (mode))
8884 {
8885 HOST_WIDE_INT vnum
8886 = exact_div (addr.const_offset,
8887 BYTES_PER_SVE_PRED).to_constant ();
8888 asm_fprintf (f, "[%s, #%wd, mul vl]",
8889 reg_names[REGNO (addr.base)], vnum);
8890 }
8891 else
8892 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8893 INTVAL (addr.offset));
8894 return true;
8895
8896 case ADDRESS_REG_REG:
8897 if (addr.shift == 0)
8898 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8899 reg_names [REGNO (addr.offset)]);
8900 else
8901 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8902 reg_names [REGNO (addr.offset)], addr.shift);
8903 return true;
8904
8905 case ADDRESS_REG_UXTW:
8906 if (addr.shift == 0)
8907 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8908 REGNO (addr.offset) - R0_REGNUM);
8909 else
8910 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8911 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8912 return true;
8913
8914 case ADDRESS_REG_SXTW:
8915 if (addr.shift == 0)
8916 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8917 REGNO (addr.offset) - R0_REGNUM);
8918 else
8919 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8920 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8921 return true;
8922
8923 case ADDRESS_REG_WB:
8924 /* Writeback is only supported for fixed-width modes. */
8925 size = GET_MODE_SIZE (mode).to_constant ();
8926 switch (GET_CODE (x))
8927 {
8928 case PRE_INC:
8929 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8930 return true;
8931 case POST_INC:
8932 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8933 return true;
8934 case PRE_DEC:
8935 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8936 return true;
8937 case POST_DEC:
8938 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8939 return true;
8940 case PRE_MODIFY:
8941 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8942 INTVAL (addr.offset));
8943 return true;
8944 case POST_MODIFY:
8945 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8946 INTVAL (addr.offset));
8947 return true;
8948 default:
8949 break;
8950 }
8951 break;
8952
8953 case ADDRESS_LO_SUM:
8954 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8955 output_addr_const (f, addr.offset);
8956 asm_fprintf (f, "]");
8957 return true;
8958
8959 case ADDRESS_SYMBOLIC:
8960 output_addr_const (f, x);
8961 return true;
8962 }
8963
8964 return false;
8965 }
8966
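/* For illustration, the cases above emit address operands such as:
[x0] (base only), [x0, 16] (base plus immediate), [x0, #2, mul vl]
(SVE offset in vector-length units), [x0, x1, lsl 3] (scaled index),
[x0, w1, sxtw 2] (sign-extended 32-bit index), [x0, 16]! and [x0], 16
(pre/post writeback) and [x0, #:lo12:sym] (LO_SUM, typically paired
with an ADRP). The register numbers and offsets are only examples. */
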
8967 /* Print address 'x' of a memory access with mode 'mode'. */
8968 static void
8969 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8970 {
8971 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8972 output_addr_const (f, x);
8973 }
8974
8975 bool
8976 aarch64_label_mentioned_p (rtx x)
8977 {
8978 const char *fmt;
8979 int i;
8980
8981 if (GET_CODE (x) == LABEL_REF)
8982 return true;
8983
8984 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8985 referencing instruction, but they are constant offsets, not
8986 symbols. */
8987 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8988 return false;
8989
8990 fmt = GET_RTX_FORMAT (GET_CODE (x));
8991 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8992 {
8993 if (fmt[i] == 'E')
8994 {
8995 int j;
8996
8997 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8998 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8999 return 1;
9000 }
9001 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9002 return 1;
9003 }
9004
9005 return 0;
9006 }
9007
9008 /* Implement REGNO_REG_CLASS. */
9009
9010 enum reg_class
9011 aarch64_regno_regclass (unsigned regno)
9012 {
9013 if (GP_REGNUM_P (regno))
9014 return GENERAL_REGS;
9015
9016 if (regno == SP_REGNUM)
9017 return STACK_REG;
9018
9019 if (regno == FRAME_POINTER_REGNUM
9020 || regno == ARG_POINTER_REGNUM)
9021 return POINTER_REGS;
9022
9023 if (FP_REGNUM_P (regno))
9024 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9025 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9026
9027 if (PR_REGNUM_P (regno))
9028 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9029
9030 return NO_REGS;
9031 }
9032
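/* For illustration, assuming the usual AArch64 register numbering:
x0-x30 return GENERAL_REGS, sp returns STACK_REG, v0-v7 return
FP_LO8_REGS, v8-v15 FP_LO_REGS, v16-v31 FP_REGS, and the SVE
predicate registers p0-p7 and p8-p15 return PR_LO_REGS and
PR_HI_REGS respectively. */
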
9033 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9034 If OFFSET is out of range, return an offset of an anchor point
9035 that is in range. Return 0 otherwise. */
9036
9037 static HOST_WIDE_INT
9038 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9039 machine_mode mode)
9040 {
9041 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9042 if (size > 16)
9043 return (offset + 0x400) & ~0x7f0;
9044
9045 /* For offsets that aren't a multiple of the access size, the limit is
9046 -256...255. */
9047 if (offset & (size - 1))
9048 {
9049 /* BLKmode typically uses LDP of X-registers. */
9050 if (mode == BLKmode)
9051 return (offset + 512) & ~0x3ff;
9052 return (offset + 0x100) & ~0x1ff;
9053 }
9054
9055 /* Small negative offsets are supported. */
9056 if (IN_RANGE (offset, -256, 0))
9057 return 0;
9058
9059 if (mode == TImode || mode == TFmode)
9060 return (offset + 0x100) & ~0x1ff;
9061
9062 /* Use a 12-bit offset scaled by the access size. */
9063 return offset & (~0xfff * size);
9064 }
9065
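/* For illustration (plain hex arithmetic, not GCC code):
aarch64_anchor_offset (0x9010, 8, DImode) returns 0x9010 & ~0x7fff
== 0x8000, leaving an in-range scaled offset of 0x1010 from the
anchor; aarch64_anchor_offset (-128, 4, SImode) returns 0, since
small negative offsets are supported directly; and an unaligned
offset such as 0x123 with a 4-byte access gives
(0x123 + 0x100) & ~0x1ff == 0x200. */
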
9066 static rtx
9067 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9068 {
9069 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9070 where mask is selected by alignment and size of the offset.
9071 We try to pick as large a range for the offset as possible to
9072 maximize the chance of a CSE. However, for aligned addresses
9073 we limit the range to 4k so that structures with different sized
9074 elements are likely to use the same base. We need to be careful
9075 not to split a CONST for some forms of address expression, otherwise
9076 it will generate sub-optimal code. */
9077
9078 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9079 {
9080 rtx base = XEXP (x, 0);
9081 rtx offset_rtx = XEXP (x, 1);
9082 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9083
9084 if (GET_CODE (base) == PLUS)
9085 {
9086 rtx op0 = XEXP (base, 0);
9087 rtx op1 = XEXP (base, 1);
9088
9089 /* Force any scaling into a temp for CSE. */
9090 op0 = force_reg (Pmode, op0);
9091 op1 = force_reg (Pmode, op1);
9092
9093 /* Let the pointer register be in op0. */
9094 if (REG_POINTER (op1))
9095 std::swap (op0, op1);
9096
9097 /* If the pointer is virtual or frame related, then we know that
9098 virtual register instantiation or register elimination is going
9099 to apply a second constant. We want the two constants folded
9100 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9101 if (virt_or_elim_regno_p (REGNO (op0)))
9102 {
9103 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9104 NULL_RTX, true, OPTAB_DIRECT);
9105 return gen_rtx_PLUS (Pmode, base, op1);
9106 }
9107
9108 /* Otherwise, in order to encourage CSE (and thence loop strength
9109 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
9110 base = expand_binop (Pmode, add_optab, op0, op1,
9111 NULL_RTX, true, OPTAB_DIRECT);
9112 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9113 }
9114
9115 HOST_WIDE_INT size;
9116 if (GET_MODE_SIZE (mode).is_constant (&size))
9117 {
9118 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9119 mode);
9120 if (base_offset != 0)
9121 {
9122 base = plus_constant (Pmode, base, base_offset);
9123 base = force_operand (base, NULL_RTX);
9124 return plus_constant (Pmode, base, offset - base_offset);
9125 }
9126 }
9127 }
9128
9129 return x;
9130 }
9131
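/* For illustration, continuing the example above: a DImode access to
X + 0x9010 is rewritten as TMP = X + 0x8000 followed by an address of
TMP + 0x1010, so that nearby accesses sharing the same anchor can CSE
the TMP computation. TMP is only a placeholder name here. */
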
9132 static reg_class_t
9133 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9134 reg_class_t rclass,
9135 machine_mode mode,
9136 secondary_reload_info *sri)
9137 {
9138 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9139 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9140 comment at the head of aarch64-sve.md for more details about the
9141 big-endian handling. */
9142 if (BYTES_BIG_ENDIAN
9143 && reg_class_subset_p (rclass, FP_REGS)
9144 && !((REG_P (x) && HARD_REGISTER_P (x))
9145 || aarch64_simd_valid_immediate (x, NULL))
9146 && aarch64_sve_data_mode_p (mode))
9147 {
9148 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9149 return NO_REGS;
9150 }
9151
9152 /* If we have to disable direct literal pool loads and stores because the
9153 function is too big, then we need a scratch register. */
9154 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9155 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9156 || targetm.vector_mode_supported_p (GET_MODE (x)))
9157 && !aarch64_pcrelative_literal_loads)
9158 {
9159 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9160 return NO_REGS;
9161 }
9162
9163 /* Without the TARGET_SIMD instructions we cannot move a Q register
9164 to a Q register directly. We need a scratch. */
9165 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9166 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9167 && reg_class_subset_p (rclass, FP_REGS))
9168 {
9169 sri->icode = code_for_aarch64_reload_mov (mode);
9170 return NO_REGS;
9171 }
9172
9173 /* A TFmode or TImode memory access should be handled via FP_REGS
9174 because AArch64 has richer addressing modes for LDR/STR instructions
9175 than for LDP/STP instructions. */
9176 if (TARGET_FLOAT && rclass == GENERAL_REGS
9177 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9178 return FP_REGS;
9179
9180 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
9181 return GENERAL_REGS;
9182
9183 return NO_REGS;
9184 }
9185
9186 static bool
9187 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9188 {
9189 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9190
9191 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9192 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9193 if (frame_pointer_needed)
9194 return to == HARD_FRAME_POINTER_REGNUM;
9195 return true;
9196 }
9197
9198 poly_int64
9199 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9200 {
9201 if (to == HARD_FRAME_POINTER_REGNUM)
9202 {
9203 if (from == ARG_POINTER_REGNUM)
9204 return cfun->machine->frame.hard_fp_offset;
9205
9206 if (from == FRAME_POINTER_REGNUM)
9207 return cfun->machine->frame.hard_fp_offset
9208 - cfun->machine->frame.locals_offset;
9209 }
9210
9211 if (to == STACK_POINTER_REGNUM)
9212 {
9213 if (from == FRAME_POINTER_REGNUM)
9214 return cfun->machine->frame.frame_size
9215 - cfun->machine->frame.locals_offset;
9216 }
9217
9218 return cfun->machine->frame.frame_size;
9219 }
9220
9221 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9222 previous frame. */
9223
9224 rtx
9225 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9226 {
9227 if (count != 0)
9228 return const0_rtx;
9229 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9230 }
9231
9232
9233 static void
9234 aarch64_asm_trampoline_template (FILE *f)
9235 {
9236 int offset1 = 16;
9237 int offset2 = 20;
9238
9239 if (aarch64_bti_enabled ())
9240 {
9241 asm_fprintf (f, "\thint\t34 // bti c\n");
9242 offset1 -= 4;
9243 offset2 -= 4;
9244 }
9245
9246 if (TARGET_ILP32)
9247 {
9248 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9249 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9250 offset1);
9251 }
9252 else
9253 {
9254 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9255 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9256 offset2);
9257 }
9258 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9259
9260 /* The trampoline needs an extra padding instruction. If BTI is
9261 enabled, the padding instruction is replaced by the BTI instruction at
9262 the beginning. */
9263 if (!aarch64_bti_enabled ())
9264 assemble_aligned_integer (4, const0_rtx);
9265
9266 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9267 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9268 }
9269
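/* For illustration, in the LP64, non-BTI case the template above
assembles to roughly:

ldr x17, .+16
ldr x18, .+20
br x17
<4-byte padding>
<8-byte function address, filled in by aarch64_trampoline_init>
<8-byte static chain value, filled in by aarch64_trampoline_init>

assuming IP1 is x17 and the static chain register is x18. */
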
9270 static void
9271 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9272 {
9273 rtx fnaddr, mem, a_tramp;
9274 const int tramp_code_sz = 16;
9275
9276 /* Don't need to copy the trailing D-words, we fill those in below. */
9277 emit_block_move (m_tramp, assemble_trampoline_template (),
9278 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9279 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9280 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9281 if (GET_MODE (fnaddr) != ptr_mode)
9282 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9283 emit_move_insn (mem, fnaddr);
9284
9285 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9286 emit_move_insn (mem, chain_value);
9287
9288 /* XXX We should really define a "clear_cache" pattern and use
9289 gen_clear_cache(). */
9290 a_tramp = XEXP (m_tramp, 0);
9291 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9292 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9293 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9294 ptr_mode);
9295 }
9296
9297 static unsigned char
9298 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9299 {
9300 /* ??? Logically we should only need to provide a value when
9301 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9302 can hold MODE, but at the moment we need to handle all modes.
9303 Just ignore any runtime parts for registers that can't store them. */
9304 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9305 unsigned int nregs;
9306 switch (regclass)
9307 {
9308 case TAILCALL_ADDR_REGS:
9309 case POINTER_REGS:
9310 case GENERAL_REGS:
9311 case ALL_REGS:
9312 case POINTER_AND_FP_REGS:
9313 case FP_REGS:
9314 case FP_LO_REGS:
9315 case FP_LO8_REGS:
9316 if (aarch64_sve_data_mode_p (mode)
9317 && constant_multiple_p (GET_MODE_SIZE (mode),
9318 BYTES_PER_SVE_VECTOR, &nregs))
9319 return nregs;
9320 return (aarch64_vector_data_mode_p (mode)
9321 ? CEIL (lowest_size, UNITS_PER_VREG)
9322 : CEIL (lowest_size, UNITS_PER_WORD));
9323 case STACK_REG:
9324 case PR_REGS:
9325 case PR_LO_REGS:
9326 case PR_HI_REGS:
9327 return 1;
9328
9329 case NO_REGS:
9330 return 0;
9331
9332 default:
9333 break;
9334 }
9335 gcc_unreachable ();
9336 }
9337
9338 static reg_class_t
9339 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9340 {
9341 if (regclass == POINTER_REGS)
9342 return GENERAL_REGS;
9343
9344 if (regclass == STACK_REG)
9345 {
9346 if (REG_P (x)
9347 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9348 return regclass;
9349
9350 return NO_REGS;
9351 }
9352
9353 /* Register elimination can result in a request for
9354 SP+constant->FP_REGS. We cannot support such operations, which
9355 use SP as source and an FP_REG as destination, so reject them
9356 outright here. */
9357 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9358 {
9359 rtx lhs = XEXP (x, 0);
9360
9361 /* Look through a possible SUBREG introduced by ILP32. */
9362 if (GET_CODE (lhs) == SUBREG)
9363 lhs = SUBREG_REG (lhs);
9364
9365 gcc_assert (REG_P (lhs));
9366 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9367 POINTER_REGS));
9368 return NO_REGS;
9369 }
9370
9371 return regclass;
9372 }
9373
9374 void
9375 aarch64_asm_output_labelref (FILE* f, const char *name)
9376 {
9377 asm_fprintf (f, "%U%s", name);
9378 }
9379
9380 static void
9381 aarch64_elf_asm_constructor (rtx symbol, int priority)
9382 {
9383 if (priority == DEFAULT_INIT_PRIORITY)
9384 default_ctor_section_asm_out_constructor (symbol, priority);
9385 else
9386 {
9387 section *s;
9388 /* Although priority is known to be in the range [0, 65535], and so
9389 18 bytes would be enough, the compiler might not know that. To avoid
9390 a -Wformat-truncation false positive, use a larger size. */
9391 char buf[23];
9392 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9393 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9394 switch_to_section (s);
9395 assemble_align (POINTER_SIZE);
9396 assemble_aligned_integer (POINTER_BYTES, symbol);
9397 }
9398 }
9399
9400 static void
9401 aarch64_elf_asm_destructor (rtx symbol, int priority)
9402 {
9403 if (priority == DEFAULT_INIT_PRIORITY)
9404 default_dtor_section_asm_out_destructor (symbol, priority);
9405 else
9406 {
9407 section *s;
9408 /* Although priority is known to be in the range [0, 65535], and so
9409 18 bytes would be enough, the compiler might not know that. To avoid
9410 a -Wformat-truncation false positive, use a larger size. */
9411 char buf[23];
9412 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9413 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9414 switch_to_section (s);
9415 assemble_align (POINTER_SIZE);
9416 assemble_aligned_integer (POINTER_BYTES, symbol);
9417 }
9418 }
9419
9420 const char*
9421 aarch64_output_casesi (rtx *operands)
9422 {
9423 char buf[100];
9424 char label[100];
9425 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9426 int index;
9427 static const char *const patterns[4][2] =
9428 {
9429 {
9430 "ldrb\t%w3, [%0,%w1,uxtw]",
9431 "add\t%3, %4, %w3, sxtb #2"
9432 },
9433 {
9434 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9435 "add\t%3, %4, %w3, sxth #2"
9436 },
9437 {
9438 "ldr\t%w3, [%0,%w1,uxtw #2]",
9439 "add\t%3, %4, %w3, sxtw #2"
9440 },
9441 /* We assume that DImode is only generated when not optimizing and
9442 that we don't really need 64-bit address offsets. That would
9443 imply an object file with 8GB of code in a single function! */
9444 {
9445 "ldr\t%w3, [%0,%w1,uxtw #2]",
9446 "add\t%3, %4, %w3, sxtw #2"
9447 }
9448 };
9449
9450 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9451
9452 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9453 index = exact_log2 (GET_MODE_SIZE (mode));
9454
9455 gcc_assert (index >= 0 && index <= 3);
9456
9457 /* Need to implement table size reduction, by changing the code below. */
9458 output_asm_insn (patterns[index][0], operands);
9459 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9460 snprintf (buf, sizeof (buf),
9461 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9462 output_asm_insn (buf, operands);
9463 output_asm_insn (patterns[index][1], operands);
9464 output_asm_insn ("br\t%3", operands);
9465 assemble_label (asm_out_file, label);
9466 return "";
9467 }
9468
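/* For illustration, with a halfword dispatch table (index 1 above) the
emitted sequence looks roughly like this, where the registers stand in
for operands 0, 1, 3 and 4 and <N> is the internal label number:

ldrh w3, [x0, w1, uxtw #1]
adr x4, .Lrtx<N>
add x3, x4, w3, sxth #2
br x3
.Lrtx<N>:

i.e. the table entries are label offsets relative to .Lrtx<N>, divided
by 4. */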
9469
9470 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9471 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9472 operator. */
9473
9474 int
9475 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9476 {
9477 if (shift >= 0 && shift <= 3)
9478 {
9479 int size;
9480 for (size = 8; size <= 32; size *= 2)
9481 {
9482 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9483 if (mask == bits << shift)
9484 return size;
9485 }
9486 }
9487 return 0;
9488 }
9489
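/* For illustration: aarch64_uxt_size (0, 0xff) == 8,
aarch64_uxt_size (2, 0x3fc) == 8 (0xff << 2) and
aarch64_uxt_size (1, 0x1fffe) == 16 (0xffff << 1); any mask that is
not a byte, halfword or word mask shifted left by 0..3 gives 0. */
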
9490 /* Constant pools are per-function only when PC-relative
9491 literal loads are enabled or we are using the large memory
9492 model. */
9493
9494 static inline bool
9495 aarch64_can_use_per_function_literal_pools_p (void)
9496 {
9497 return (aarch64_pcrelative_literal_loads
9498 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9499 }
9500
9501 static bool
9502 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9503 {
9504 /* We can't use blocks for constants when we're using a per-function
9505 constant pool. */
9506 return !aarch64_can_use_per_function_literal_pools_p ();
9507 }
9508
9509 /* Select appropriate section for constants depending
9510 on where we place literal pools. */
9511
9512 static section *
9513 aarch64_select_rtx_section (machine_mode mode,
9514 rtx x,
9515 unsigned HOST_WIDE_INT align)
9516 {
9517 if (aarch64_can_use_per_function_literal_pools_p ())
9518 return function_section (current_function_decl);
9519
9520 return default_elf_select_rtx_section (mode, x, align);
9521 }
9522
9523 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9524 void
9525 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9526 HOST_WIDE_INT offset)
9527 {
9528 /* When using per-function literal pools, we must ensure that any code
9529 section is aligned to the minimal instruction length, lest we get
9530 errors from the assembler re "unaligned instructions". */
9531 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9532 ASM_OUTPUT_ALIGN (f, 2);
9533 }
9534
9535 /* Costs. */
9536
9537 /* Helper function for rtx cost calculation. Strip a shift expression
9538 from X. Returns the inner operand if successful, or the original
9539 expression on failure. */
9540 static rtx
9541 aarch64_strip_shift (rtx x)
9542 {
9543 rtx op = x;
9544
9545 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9546 we can convert both to ROR during final output. */
9547 if ((GET_CODE (op) == ASHIFT
9548 || GET_CODE (op) == ASHIFTRT
9549 || GET_CODE (op) == LSHIFTRT
9550 || GET_CODE (op) == ROTATERT
9551 || GET_CODE (op) == ROTATE)
9552 && CONST_INT_P (XEXP (op, 1)))
9553 return XEXP (op, 0);
9554
9555 if (GET_CODE (op) == MULT
9556 && CONST_INT_P (XEXP (op, 1))
9557 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9558 return XEXP (op, 0);
9559
9560 return x;
9561 }
9562
9563 /* Helper function for rtx cost calculation. Strip an extend
9564 expression from X. Returns the inner operand if successful, or the
9565 original expression on failure. We deal with a number of possible
9566 canonicalization variations here. If STRIP_SHIFT is true, then
9567 we can strip off a shift also. */
9568 static rtx
9569 aarch64_strip_extend (rtx x, bool strip_shift)
9570 {
9571 scalar_int_mode mode;
9572 rtx op = x;
9573
9574 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9575 return op;
9576
9577 /* Zero and sign extraction of a widened value. */
9578 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9579 && XEXP (op, 2) == const0_rtx
9580 && GET_CODE (XEXP (op, 0)) == MULT
9581 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9582 XEXP (op, 1)))
9583 return XEXP (XEXP (op, 0), 0);
9584
9585 /* It can also be represented (for zero-extend) as an AND with an
9586 immediate. */
9587 if (GET_CODE (op) == AND
9588 && GET_CODE (XEXP (op, 0)) == MULT
9589 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9590 && CONST_INT_P (XEXP (op, 1))
9591 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9592 INTVAL (XEXP (op, 1))) != 0)
9593 return XEXP (XEXP (op, 0), 0);
9594
9595 /* Now handle extended register, as this may also have an optional
9596 left shift by 1..4. */
9597 if (strip_shift
9598 && GET_CODE (op) == ASHIFT
9599 && CONST_INT_P (XEXP (op, 1))
9600 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9601 op = XEXP (op, 0);
9602
9603 if (GET_CODE (op) == ZERO_EXTEND
9604 || GET_CODE (op) == SIGN_EXTEND)
9605 op = XEXP (op, 0);
9606
9607 if (op != x)
9608 return op;
9609
9610 return x;
9611 }
9612
9613 /* Return true iff CODE is a shift supported in combination
9614 with arithmetic instructions. */
9615
9616 static bool
9617 aarch64_shift_p (enum rtx_code code)
9618 {
9619 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9620 }
9621
9622
9623 /* Return true iff X is a cheap shift without a sign extend. */
9624
9625 static bool
9626 aarch64_cheap_mult_shift_p (rtx x)
9627 {
9628 rtx op0, op1;
9629
9630 op0 = XEXP (x, 0);
9631 op1 = XEXP (x, 1);
9632
9633 if (!(aarch64_tune_params.extra_tuning_flags
9634 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9635 return false;
9636
9637 if (GET_CODE (op0) == SIGN_EXTEND)
9638 return false;
9639
9640 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9641 && UINTVAL (op1) <= 4)
9642 return true;
9643
9644 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9645 return false;
9646
9647 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9648
9649 if (l2 > 0 && l2 <= 4)
9650 return true;
9651
9652 return false;
9653 }
9654
9655 /* Helper function for rtx cost calculation. Calculate the cost of
9656 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9657 Return the calculated cost of the expression, recursing manually in to
9658 operands where needed. */
9659
9660 static int
9661 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9662 {
9663 rtx op0, op1;
9664 const struct cpu_cost_table *extra_cost
9665 = aarch64_tune_params.insn_extra_cost;
9666 int cost = 0;
9667 bool compound_p = (outer == PLUS || outer == MINUS);
9668 machine_mode mode = GET_MODE (x);
9669
9670 gcc_checking_assert (code == MULT);
9671
9672 op0 = XEXP (x, 0);
9673 op1 = XEXP (x, 1);
9674
9675 if (VECTOR_MODE_P (mode))
9676 mode = GET_MODE_INNER (mode);
9677
9678 /* Integer multiply/fma. */
9679 if (GET_MODE_CLASS (mode) == MODE_INT)
9680 {
9681 /* The multiply will be canonicalized as a shift, cost it as such. */
9682 if (aarch64_shift_p (GET_CODE (x))
9683 || (CONST_INT_P (op1)
9684 && exact_log2 (INTVAL (op1)) > 0))
9685 {
9686 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9687 || GET_CODE (op0) == SIGN_EXTEND;
9688 if (speed)
9689 {
9690 if (compound_p)
9691 {
9692 /* If the shift is considered cheap,
9693 then don't add any cost. */
9694 if (aarch64_cheap_mult_shift_p (x))
9695 ;
9696 else if (REG_P (op1))
9697 /* ARITH + shift-by-register. */
9698 cost += extra_cost->alu.arith_shift_reg;
9699 else if (is_extend)
9700 /* ARITH + extended register. We don't have a cost field
9701 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9702 cost += extra_cost->alu.extend_arith;
9703 else
9704 /* ARITH + shift-by-immediate. */
9705 cost += extra_cost->alu.arith_shift;
9706 }
9707 else
9708 /* LSL (immediate). */
9709 cost += extra_cost->alu.shift;
9710
9711 }
9712 /* Strip extends as we will have costed them in the case above. */
9713 if (is_extend)
9714 op0 = aarch64_strip_extend (op0, true);
9715
9716 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9717
9718 return cost;
9719 }
9720
9721 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9722 compound and let the below cases handle it. After all, MNEG is a
9723 special-case alias of MSUB. */
9724 if (GET_CODE (op0) == NEG)
9725 {
9726 op0 = XEXP (op0, 0);
9727 compound_p = true;
9728 }
9729
9730 /* Integer multiplies or FMAs have zero/sign extending variants. */
9731 if ((GET_CODE (op0) == ZERO_EXTEND
9732 && GET_CODE (op1) == ZERO_EXTEND)
9733 || (GET_CODE (op0) == SIGN_EXTEND
9734 && GET_CODE (op1) == SIGN_EXTEND))
9735 {
9736 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9737 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9738
9739 if (speed)
9740 {
9741 if (compound_p)
9742 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9743 cost += extra_cost->mult[0].extend_add;
9744 else
9745 /* MUL/SMULL/UMULL. */
9746 cost += extra_cost->mult[0].extend;
9747 }
9748
9749 return cost;
9750 }
9751
9752 /* This is either an integer multiply or a MADD. In both cases
9753 we want to recurse and cost the operands. */
9754 cost += rtx_cost (op0, mode, MULT, 0, speed);
9755 cost += rtx_cost (op1, mode, MULT, 1, speed);
9756
9757 if (speed)
9758 {
9759 if (compound_p)
9760 /* MADD/MSUB. */
9761 cost += extra_cost->mult[mode == DImode].add;
9762 else
9763 /* MUL. */
9764 cost += extra_cost->mult[mode == DImode].simple;
9765 }
9766
9767 return cost;
9768 }
9769 else
9770 {
9771 if (speed)
9772 {
9773 /* Floating-point FMA/FMUL can also support negations of the
9774 operands, unless the rounding mode is upward or downward in
9775 which case FNMUL is different from FMUL with operand negation. */
9776 bool neg0 = GET_CODE (op0) == NEG;
9777 bool neg1 = GET_CODE (op1) == NEG;
9778 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9779 {
9780 if (neg0)
9781 op0 = XEXP (op0, 0);
9782 if (neg1)
9783 op1 = XEXP (op1, 0);
9784 }
9785
9786 if (compound_p)
9787 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9788 cost += extra_cost->fp[mode == DFmode].fma;
9789 else
9790 /* FMUL/FNMUL. */
9791 cost += extra_cost->fp[mode == DFmode].mult;
9792 }
9793
9794 cost += rtx_cost (op0, mode, MULT, 0, speed);
9795 cost += rtx_cost (op1, mode, MULT, 1, speed);
9796 return cost;
9797 }
9798 }
9799
9800 static int
9801 aarch64_address_cost (rtx x,
9802 machine_mode mode,
9803 addr_space_t as ATTRIBUTE_UNUSED,
9804 bool speed)
9805 {
9806 enum rtx_code c = GET_CODE (x);
9807 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9808 struct aarch64_address_info info;
9809 int cost = 0;
9810 info.shift = 0;
9811
9812 if (!aarch64_classify_address (&info, x, mode, false))
9813 {
9814 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9815 {
9816 /* This is a CONST or SYMBOL ref which will be split
9817 in a different way depending on the code model in use.
9818 Cost it through the generic infrastructure. */
9819 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9820 /* Divide through by the cost of one instruction to
9821 bring it to the same units as the address costs. */
9822 cost_symbol_ref /= COSTS_N_INSNS (1);
9823 /* The cost is then the cost of preparing the address,
9824 followed by an immediate (possibly 0) offset. */
9825 return cost_symbol_ref + addr_cost->imm_offset;
9826 }
9827 else
9828 {
9829 /* This is most likely a jump table from a case
9830 statement. */
9831 return addr_cost->register_offset;
9832 }
9833 }
9834
9835 switch (info.type)
9836 {
9837 case ADDRESS_LO_SUM:
9838 case ADDRESS_SYMBOLIC:
9839 case ADDRESS_REG_IMM:
9840 cost += addr_cost->imm_offset;
9841 break;
9842
9843 case ADDRESS_REG_WB:
9844 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9845 cost += addr_cost->pre_modify;
9846 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9847 cost += addr_cost->post_modify;
9848 else
9849 gcc_unreachable ();
9850
9851 break;
9852
9853 case ADDRESS_REG_REG:
9854 cost += addr_cost->register_offset;
9855 break;
9856
9857 case ADDRESS_REG_SXTW:
9858 cost += addr_cost->register_sextend;
9859 break;
9860
9861 case ADDRESS_REG_UXTW:
9862 cost += addr_cost->register_zextend;
9863 break;
9864
9865 default:
9866 gcc_unreachable ();
9867 }
9868
9869
9870 if (info.shift > 0)
9871 {
9872 /* For the sake of calculating the cost of the shifted register
9873 component, we can treat same sized modes in the same way. */
9874 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9875 cost += addr_cost->addr_scale_costs.hi;
9876 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9877 cost += addr_cost->addr_scale_costs.si;
9878 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9879 cost += addr_cost->addr_scale_costs.di;
9880 else
9881 /* We can't tell, or this is a 128-bit vector. */
9882 cost += addr_cost->addr_scale_costs.ti;
9883 }
9884
9885 return cost;
9886 }
9887
9888 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9889 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9890 to be taken. */
9891
9892 int
9893 aarch64_branch_cost (bool speed_p, bool predictable_p)
9894 {
9895 /* When optimizing for speed, use the cost of unpredictable branches. */
9896 const struct cpu_branch_cost *branch_costs =
9897 aarch64_tune_params.branch_costs;
9898
9899 if (!speed_p || predictable_p)
9900 return branch_costs->predictable;
9901 else
9902 return branch_costs->unpredictable;
9903 }
9904
9905 /* Return true if the RTX X in mode MODE is a zero or sign extract
9906 usable in an ADD or SUB (extended register) instruction. */
9907 static bool
9908 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9909 {
9910 /* Catch add with a sign extract.
9911 This is add_<optab><mode>_multp2. */
9912 if (GET_CODE (x) == SIGN_EXTRACT
9913 || GET_CODE (x) == ZERO_EXTRACT)
9914 {
9915 rtx op0 = XEXP (x, 0);
9916 rtx op1 = XEXP (x, 1);
9917 rtx op2 = XEXP (x, 2);
9918
9919 if (GET_CODE (op0) == MULT
9920 && CONST_INT_P (op1)
9921 && op2 == const0_rtx
9922 && CONST_INT_P (XEXP (op0, 1))
9923 && aarch64_is_extend_from_extract (mode,
9924 XEXP (op0, 1),
9925 op1))
9926 {
9927 return true;
9928 }
9929 }
9930 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9931 No shift. */
9932 else if (GET_CODE (x) == SIGN_EXTEND
9933 || GET_CODE (x) == ZERO_EXTEND)
9934 return REG_P (XEXP (x, 0));
9935
9936 return false;
9937 }
9938
9939 static bool
9940 aarch64_frint_unspec_p (unsigned int u)
9941 {
9942 switch (u)
9943 {
9944 case UNSPEC_FRINTZ:
9945 case UNSPEC_FRINTP:
9946 case UNSPEC_FRINTM:
9947 case UNSPEC_FRINTA:
9948 case UNSPEC_FRINTN:
9949 case UNSPEC_FRINTX:
9950 case UNSPEC_FRINTI:
9951 return true;
9952
9953 default:
9954 return false;
9955 }
9956 }
9957
9958 /* Return true iff X is an rtx that will match an extr instruction
9959 i.e. as described in the *extr<mode>5_insn family of patterns.
9960 OP0 and OP1 will be set to the operands of the shifts involved
9961 on success and will be NULL_RTX otherwise. */
9962
9963 static bool
9964 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9965 {
9966 rtx op0, op1;
9967 scalar_int_mode mode;
9968 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9969 return false;
9970
9971 *res_op0 = NULL_RTX;
9972 *res_op1 = NULL_RTX;
9973
9974 if (GET_CODE (x) != IOR)
9975 return false;
9976
9977 op0 = XEXP (x, 0);
9978 op1 = XEXP (x, 1);
9979
9980 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9981 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9982 {
9983 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9984 if (GET_CODE (op1) == ASHIFT)
9985 std::swap (op0, op1);
9986
9987 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9988 return false;
9989
9990 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9991 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9992
9993 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9994 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9995 {
9996 *res_op0 = XEXP (op0, 0);
9997 *res_op1 = XEXP (op1, 0);
9998 return true;
9999 }
10000 }
10001
10002 return false;
10003 }
10004
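/* For illustration, in DImode an IOR such as
(x << 48) | (y lshiftrt 16) satisfies the check above because
48 + 16 == 64; *res_op0 is set to x and *res_op1 to y, allowing a
single EXTR to replace the two shifts and the ORR. */
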
10005 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10006 storing it in *COST. Result is true if the total cost of the operation
10007 has now been calculated. */
10008 static bool
10009 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10010 {
10011 rtx inner;
10012 rtx comparator;
10013 enum rtx_code cmpcode;
10014
10015 if (COMPARISON_P (op0))
10016 {
10017 inner = XEXP (op0, 0);
10018 comparator = XEXP (op0, 1);
10019 cmpcode = GET_CODE (op0);
10020 }
10021 else
10022 {
10023 inner = op0;
10024 comparator = const0_rtx;
10025 cmpcode = NE;
10026 }
10027
10028 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10029 {
10030 /* Conditional branch. */
10031 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10032 return true;
10033 else
10034 {
10035 if (cmpcode == NE || cmpcode == EQ)
10036 {
10037 if (comparator == const0_rtx)
10038 {
10039 /* TBZ/TBNZ/CBZ/CBNZ. */
10040 if (GET_CODE (inner) == ZERO_EXTRACT)
10041 /* TBZ/TBNZ. */
10042 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10043 ZERO_EXTRACT, 0, speed);
10044 else
10045 /* CBZ/CBNZ. */
10046 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10047
10048 return true;
10049 }
10050 }
10051 else if (cmpcode == LT || cmpcode == GE)
10052 {
10053 /* TBZ/TBNZ. */
10054 if (comparator == const0_rtx)
10055 return true;
10056 }
10057 }
10058 }
10059 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10060 {
10061 /* CCMP. */
10062 if (GET_CODE (op1) == COMPARE)
10063 {
10064 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10065 if (XEXP (op1, 1) == const0_rtx)
10066 *cost += 1;
10067 if (speed)
10068 {
10069 machine_mode mode = GET_MODE (XEXP (op1, 0));
10070 const struct cpu_cost_table *extra_cost
10071 = aarch64_tune_params.insn_extra_cost;
10072
10073 if (GET_MODE_CLASS (mode) == MODE_INT)
10074 *cost += extra_cost->alu.arith;
10075 else
10076 *cost += extra_cost->fp[mode == DFmode].compare;
10077 }
10078 return true;
10079 }
10080
10081 /* It's a conditional operation based on the status flags,
10082 so it must be some flavor of CSEL. */
10083
10084 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10085 if (GET_CODE (op1) == NEG
10086 || GET_CODE (op1) == NOT
10087 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10088 op1 = XEXP (op1, 0);
10089 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10090 {
10091 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10092 op1 = XEXP (op1, 0);
10093 op2 = XEXP (op2, 0);
10094 }
10095
10096 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10097 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10098 return true;
10099 }
10100
10101 /* We don't know what this is, cost all operands. */
10102 return false;
10103 }
10104
10105 /* Check whether X is a bitfield operation of the form shift + extend that
10106 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10107 operand to which the bitfield operation is applied. Otherwise return
10108 NULL_RTX. */
10109
10110 static rtx
10111 aarch64_extend_bitfield_pattern_p (rtx x)
10112 {
10113 rtx_code outer_code = GET_CODE (x);
10114 machine_mode outer_mode = GET_MODE (x);
10115
10116 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10117 && outer_mode != SImode && outer_mode != DImode)
10118 return NULL_RTX;
10119
10120 rtx inner = XEXP (x, 0);
10121 rtx_code inner_code = GET_CODE (inner);
10122 machine_mode inner_mode = GET_MODE (inner);
10123 rtx op = NULL_RTX;
10124
10125 switch (inner_code)
10126 {
10127 case ASHIFT:
10128 if (CONST_INT_P (XEXP (inner, 1))
10129 && (inner_mode == QImode || inner_mode == HImode))
10130 op = XEXP (inner, 0);
10131 break;
10132 case LSHIFTRT:
10133 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10134 && (inner_mode == QImode || inner_mode == HImode))
10135 op = XEXP (inner, 0);
10136 break;
10137 case ASHIFTRT:
10138 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10139 && (inner_mode == QImode || inner_mode == HImode))
10140 op = XEXP (inner, 0);
10141 break;
10142 default:
10143 break;
10144 }
10145
10146 return op;
10147 }
10148
10149 /* Return true if the mask and a shift amount from an RTX of the form
10150 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10151 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10152
10153 bool
10154 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10155 rtx shft_amnt)
10156 {
10157 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10158 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10159 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10160 && (INTVAL (mask)
10161 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10162 }
10163
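/* For illustration, in SImode a mask of 0x3fc with a shift amount of 2
passes the checks above: (0x3fc >> 2) + 1 == 0x100 is a power of two
and no set bit of the mask lies below the shift amount, so
(x << 2) & 0x3fc can be implemented as a single UBFIZ. */
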
10164 /* Return true if the masks and a shift amount from an RTX of the form
10165 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10166 a BFI instruction of mode MODE. See *arch64_bfi patterns. */
10167
10168 bool
10169 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10170 unsigned HOST_WIDE_INT mask1,
10171 unsigned HOST_WIDE_INT shft_amnt,
10172 unsigned HOST_WIDE_INT mask2)
10173 {
10174 unsigned HOST_WIDE_INT t;
10175
10176 /* Verify that there is no overlap in what bits are set in the two masks. */
10177 if (mask1 != ~mask2)
10178 return false;
10179
10180 /* Verify that mask2 is not all zeros or ones. */
10181 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10182 return false;
10183
10184 /* The shift amount should always be less than the mode size. */
10185 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10186
10187 /* Verify that the mask being shifted is contiguous and would be in the
10188 least significant bits after shifting by shft_amnt. */
10189 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10190 return (t == (t & -t));
10191 }
10192
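/* For illustration, in DImode mask1 == 0xffffffff000000ff,
shft_amnt == 8 and mask2 == 0x00000000ffffff00 pass the checks above:
mask1 == ~mask2, mask2 is neither 0 nor all ones, and
mask2 + (1 << 8) == 0x100000000 is a power of two, i.e. mask2 is a
contiguous bit-field starting at bit 8, as required for BFI. */
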
10193 /* Calculate the cost of calculating X, storing it in *COST. Result
10194 is true if the total cost of the operation has now been calculated. */
10195 static bool
10196 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10197 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10198 {
10199 rtx op0, op1, op2;
10200 const struct cpu_cost_table *extra_cost
10201 = aarch64_tune_params.insn_extra_cost;
10202 int code = GET_CODE (x);
10203 scalar_int_mode int_mode;
10204
10205 /* By default, assume that everything has equivalent cost to the
10206 cheapest instruction. Any additional costs are applied as a delta
10207 above this default. */
10208 *cost = COSTS_N_INSNS (1);
10209
10210 switch (code)
10211 {
10212 case SET:
10213 /* The cost depends entirely on the operands to SET. */
10214 *cost = 0;
10215 op0 = SET_DEST (x);
10216 op1 = SET_SRC (x);
10217
10218 switch (GET_CODE (op0))
10219 {
10220 case MEM:
10221 if (speed)
10222 {
10223 rtx address = XEXP (op0, 0);
10224 if (VECTOR_MODE_P (mode))
10225 *cost += extra_cost->ldst.storev;
10226 else if (GET_MODE_CLASS (mode) == MODE_INT)
10227 *cost += extra_cost->ldst.store;
10228 else if (mode == SFmode)
10229 *cost += extra_cost->ldst.storef;
10230 else if (mode == DFmode)
10231 *cost += extra_cost->ldst.stored;
10232
10233 *cost +=
10234 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10235 0, speed));
10236 }
10237
10238 *cost += rtx_cost (op1, mode, SET, 1, speed);
10239 return true;
10240
10241 case SUBREG:
10242 if (! REG_P (SUBREG_REG (op0)))
10243 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10244
10245 /* Fall through. */
10246 case REG:
10247 /* The cost is one per vector-register copied. */
10248 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10249 {
10250 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10251 *cost = COSTS_N_INSNS (nregs);
10252 }
10253 /* const0_rtx is in general free, but we will use an
10254 instruction to set a register to 0. */
10255 else if (REG_P (op1) || op1 == const0_rtx)
10256 {
10257 /* The cost is 1 per register copied. */
10258 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10259 *cost = COSTS_N_INSNS (nregs);
10260 }
10261 else
10262 /* Cost is just the cost of the RHS of the set. */
10263 *cost += rtx_cost (op1, mode, SET, 1, speed);
10264 return true;
10265
10266 case ZERO_EXTRACT:
10267 case SIGN_EXTRACT:
10268 /* Bit-field insertion. Strip any redundant widening of
10269 the RHS to meet the width of the target. */
10270 if (GET_CODE (op1) == SUBREG)
10271 op1 = SUBREG_REG (op1);
10272 if ((GET_CODE (op1) == ZERO_EXTEND
10273 || GET_CODE (op1) == SIGN_EXTEND)
10274 && CONST_INT_P (XEXP (op0, 1))
10275 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10276 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10277 op1 = XEXP (op1, 0);
10278
10279 if (CONST_INT_P (op1))
10280 {
10281 /* MOV immediate is assumed to always be cheap. */
10282 *cost = COSTS_N_INSNS (1);
10283 }
10284 else
10285 {
10286 /* BFM. */
10287 if (speed)
10288 *cost += extra_cost->alu.bfi;
10289 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10290 }
10291
10292 return true;
10293
10294 default:
10295 /* We can't make sense of this, assume default cost. */
10296 *cost = COSTS_N_INSNS (1);
10297 return false;
10298 }
10299 return false;
10300
10301 case CONST_INT:
10302 /* If an instruction can incorporate a constant within the
10303 instruction, the instruction's expression avoids calling
10304 rtx_cost() on the constant. If rtx_cost() is called on a
10305 constant, then it is usually because the constant must be
10306 moved into a register by one or more instructions.
10307
10308 The exception is constant 0, which can be expressed
10309 as XZR/WZR and is therefore free. The exception to this is
10310 if we have (set (reg) (const0_rtx)) in which case we must cost
10311 the move. However, we can catch that when we cost the SET, so
10312 we don't need to consider that here. */
10313 if (x == const0_rtx)
10314 *cost = 0;
10315 else
10316 {
10317 /* To an approximation, building any other constant is
10318 proportionally expensive to the number of instructions
10319 required to build that constant. This is true whether we
10320 are compiling for SPEED or otherwise. */
10321 if (!is_a <scalar_int_mode> (mode, &int_mode))
10322 int_mode = word_mode;
10323 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10324 (NULL_RTX, x, false, int_mode));
10325 }
10326 return true;
10327
10328 case CONST_DOUBLE:
10329
10330 /* First determine number of instructions to do the move
10331 as an integer constant. */
10332 if (!aarch64_float_const_representable_p (x)
10333 && !aarch64_can_const_movi_rtx_p (x, mode)
10334 && aarch64_float_const_rtx_p (x))
10335 {
10336 unsigned HOST_WIDE_INT ival;
10337 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10338 gcc_assert (succeed);
10339
10340 scalar_int_mode imode = (mode == HFmode
10341 ? SImode
10342 : int_mode_for_mode (mode).require ());
10343 int ncost = aarch64_internal_mov_immediate
10344 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10345 *cost += COSTS_N_INSNS (ncost);
10346 return true;
10347 }
10348
10349 if (speed)
10350 {
10351 /* mov[df,sf]_aarch64. */
10352 if (aarch64_float_const_representable_p (x))
10353 /* FMOV (scalar immediate). */
10354 *cost += extra_cost->fp[mode == DFmode].fpconst;
10355 else if (!aarch64_float_const_zero_rtx_p (x))
10356 {
10357 /* This will be a load from memory. */
10358 if (mode == DFmode)
10359 *cost += extra_cost->ldst.loadd;
10360 else
10361 *cost += extra_cost->ldst.loadf;
10362 }
10363 else
10364 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10365 or MOV v0.s[0], wzr, neither of which is modeled by the
10366 cost tables. Just use the default cost. */
10367 {
10368 }
10369 }
10370
10371 return true;
10372
10373 case MEM:
10374 if (speed)
10375 {
10376 /* For loads we want the base cost of a load, plus an
10377 approximation for the additional cost of the addressing
10378 mode. */
10379 rtx address = XEXP (x, 0);
10380 if (VECTOR_MODE_P (mode))
10381 *cost += extra_cost->ldst.loadv;
10382 else if (GET_MODE_CLASS (mode) == MODE_INT)
10383 *cost += extra_cost->ldst.load;
10384 else if (mode == SFmode)
10385 *cost += extra_cost->ldst.loadf;
10386 else if (mode == DFmode)
10387 *cost += extra_cost->ldst.loadd;
10388
10389 *cost +=
10390 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10391 0, speed));
10392 }
10393
10394 return true;
10395
10396 case NEG:
10397 op0 = XEXP (x, 0);
10398
10399 if (VECTOR_MODE_P (mode))
10400 {
10401 if (speed)
10402 {
10403 /* FNEG. */
10404 *cost += extra_cost->vect.alu;
10405 }
10406 return false;
10407 }
10408
10409 if (GET_MODE_CLASS (mode) == MODE_INT)
10410 {
10411 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10412 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10413 {
10414 /* CSETM. */
10415 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10416 return true;
10417 }
10418
10419 /* Cost this as SUB wzr, X. */
10420 op0 = CONST0_RTX (mode);
10421 op1 = XEXP (x, 0);
10422 goto cost_minus;
10423 }
10424
10425 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10426 {
10427 /* Support (neg(fma...)) as a single instruction only if
10428 sign of zeros is unimportant. This matches the decision
10429 making in aarch64.md. */
10430 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10431 {
10432 /* FNMADD. */
10433 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10434 return true;
10435 }
10436 if (GET_CODE (op0) == MULT)
10437 {
10438 /* FNMUL. */
10439 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10440 return true;
10441 }
10442 if (speed)
10443 /* FNEG. */
10444 *cost += extra_cost->fp[mode == DFmode].neg;
10445 return false;
10446 }
10447
10448 return false;
10449
10450 case CLRSB:
10451 case CLZ:
10452 if (speed)
10453 {
10454 if (VECTOR_MODE_P (mode))
10455 *cost += extra_cost->vect.alu;
10456 else
10457 *cost += extra_cost->alu.clz;
10458 }
10459
10460 return false;
10461
10462 case COMPARE:
10463 op0 = XEXP (x, 0);
10464 op1 = XEXP (x, 1);
10465
10466 if (op1 == const0_rtx
10467 && GET_CODE (op0) == AND)
10468 {
10469 x = op0;
10470 mode = GET_MODE (op0);
10471 goto cost_logic;
10472 }
10473
10474 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10475 {
10476 /* TODO: A write to the CC flags possibly costs extra; this
10477 needs encoding in the cost tables. */
10478
10479 mode = GET_MODE (op0);
10480 /* ANDS. */
10481 if (GET_CODE (op0) == AND)
10482 {
10483 x = op0;
10484 goto cost_logic;
10485 }
10486
10487 if (GET_CODE (op0) == PLUS)
10488 {
10489 /* ADDS (and CMN alias). */
10490 x = op0;
10491 goto cost_plus;
10492 }
10493
10494 if (GET_CODE (op0) == MINUS)
10495 {
10496 /* SUBS. */
10497 x = op0;
10498 goto cost_minus;
10499 }
10500
10501 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10502 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10503 && CONST_INT_P (XEXP (op0, 2)))
10504 {
10505 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10506 Handle it here directly rather than going to cost_logic
10507 since we know the immediate generated for the TST is valid
10508 so we can avoid creating an intermediate rtx for it only
10509 for costing purposes. */
10510 if (speed)
10511 *cost += extra_cost->alu.logical;
10512
10513 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10514 ZERO_EXTRACT, 0, speed);
10515 return true;
10516 }
10517
10518 if (GET_CODE (op1) == NEG)
10519 {
10520 /* CMN. */
10521 if (speed)
10522 *cost += extra_cost->alu.arith;
10523
10524 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10525 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10526 return true;
10527 }
10528
10529 /* CMP.
10530
10531 Compare can freely swap the order of operands, and
10532 canonicalization puts the more complex operation first.
10533 But the integer MINUS logic expects the shift/extend
10534 operation in op1. */
10535 if (! (REG_P (op0)
10536 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10537 {
10538 op0 = XEXP (x, 1);
10539 op1 = XEXP (x, 0);
10540 }
10541 goto cost_minus;
10542 }
10543
10544 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10545 {
10546 /* FCMP. */
10547 if (speed)
10548 *cost += extra_cost->fp[mode == DFmode].compare;
10549
10550 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10551 {
10552 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10553 /* FCMP supports constant 0.0 for no extra cost. */
10554 return true;
10555 }
10556 return false;
10557 }
10558
10559 if (VECTOR_MODE_P (mode))
10560 {
10561 /* Vector compare. */
10562 if (speed)
10563 *cost += extra_cost->vect.alu;
10564
10565 if (aarch64_float_const_zero_rtx_p (op1))
10566 {
10567 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10568 cost. */
10569 return true;
10570 }
10571 return false;
10572 }
10573 return false;
10574
10575 case MINUS:
10576 {
10577 op0 = XEXP (x, 0);
10578 op1 = XEXP (x, 1);
10579
10580 cost_minus:
10581 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10582
10583 /* Detect valid immediates. */
10584 if ((GET_MODE_CLASS (mode) == MODE_INT
10585 || (GET_MODE_CLASS (mode) == MODE_CC
10586 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10587 && CONST_INT_P (op1)
10588 && aarch64_uimm12_shift (INTVAL (op1)))
10589 {
10590 if (speed)
10591 /* SUB(S) (immediate). */
10592 *cost += extra_cost->alu.arith;
10593 return true;
10594 }
10595
10596 /* Look for SUB (extended register). */
10597 if (is_a <scalar_int_mode> (mode, &int_mode)
10598 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10599 {
10600 if (speed)
10601 *cost += extra_cost->alu.extend_arith;
10602
10603 op1 = aarch64_strip_extend (op1, true);
10604 *cost += rtx_cost (op1, VOIDmode,
10605 (enum rtx_code) GET_CODE (op1), 0, speed);
10606 return true;
10607 }
10608
10609 rtx new_op1 = aarch64_strip_extend (op1, false);
10610
10611 /* Cost this as an FMA-alike operation. */
10612 if ((GET_CODE (new_op1) == MULT
10613 || aarch64_shift_p (GET_CODE (new_op1)))
10614 && code != COMPARE)
10615 {
10616 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10617 (enum rtx_code) code,
10618 speed);
10619 return true;
10620 }
10621
10622 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10623
10624 if (speed)
10625 {
10626 if (VECTOR_MODE_P (mode))
10627 {
10628 /* Vector SUB. */
10629 *cost += extra_cost->vect.alu;
10630 }
10631 else if (GET_MODE_CLASS (mode) == MODE_INT)
10632 {
10633 /* SUB(S). */
10634 *cost += extra_cost->alu.arith;
10635 }
10636 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10637 {
10638 /* FSUB. */
10639 *cost += extra_cost->fp[mode == DFmode].addsub;
10640 }
10641 }
10642 return true;
10643 }
10644
10645 case PLUS:
10646 {
10647 rtx new_op0;
10648
10649 op0 = XEXP (x, 0);
10650 op1 = XEXP (x, 1);
10651
10652 cost_plus:
10653 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10654 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10655 {
10656 /* CSINC. */
10657 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10658 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10659 return true;
10660 }
10661
10662 if (GET_MODE_CLASS (mode) == MODE_INT
10663 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10664 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10665 {
10666 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10667
10668 if (speed)
10669 /* ADD (immediate). */
10670 *cost += extra_cost->alu.arith;
10671 return true;
10672 }
10673
10674 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10675
10676 /* Look for ADD (extended register). */
10677 if (is_a <scalar_int_mode> (mode, &int_mode)
10678 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10679 {
10680 if (speed)
10681 *cost += extra_cost->alu.extend_arith;
10682
10683 op0 = aarch64_strip_extend (op0, true);
10684 *cost += rtx_cost (op0, VOIDmode,
10685 (enum rtx_code) GET_CODE (op0), 0, speed);
10686 return true;
10687 }
10688
10689 /* Strip any extend, leave shifts behind as we will
10690 cost them through mult_cost. */
10691 new_op0 = aarch64_strip_extend (op0, false);
10692
10693 if (GET_CODE (new_op0) == MULT
10694 || aarch64_shift_p (GET_CODE (new_op0)))
10695 {
10696 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10697 speed);
10698 return true;
10699 }
10700
10701 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10702
10703 if (speed)
10704 {
10705 if (VECTOR_MODE_P (mode))
10706 {
10707 /* Vector ADD. */
10708 *cost += extra_cost->vect.alu;
10709 }
10710 else if (GET_MODE_CLASS (mode) == MODE_INT)
10711 {
10712 /* ADD. */
10713 *cost += extra_cost->alu.arith;
10714 }
10715 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10716 {
10717 /* FADD. */
10718 *cost += extra_cost->fp[mode == DFmode].addsub;
10719 }
10720 }
10721 return true;
10722 }
10723
10724 case BSWAP:
10725 *cost = COSTS_N_INSNS (1);
10726
10727 if (speed)
10728 {
10729 if (VECTOR_MODE_P (mode))
10730 *cost += extra_cost->vect.alu;
10731 else
10732 *cost += extra_cost->alu.rev;
10733 }
10734 return false;
10735
10736 case IOR:
10737 if (aarch_rev16_p (x))
10738 {
10739 *cost = COSTS_N_INSNS (1);
10740
10741 if (speed)
10742 {
10743 if (VECTOR_MODE_P (mode))
10744 *cost += extra_cost->vect.alu;
10745 else
10746 *cost += extra_cost->alu.rev;
10747 }
10748 return true;
10749 }
10750
10751 if (aarch64_extr_rtx_p (x, &op0, &op1))
10752 {
10753 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10754 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10755 if (speed)
10756 *cost += extra_cost->alu.shift;
10757
10758 return true;
10759 }
10760 /* Fall through. */
10761 case XOR:
10762 case AND:
10763 cost_logic:
10764 op0 = XEXP (x, 0);
10765 op1 = XEXP (x, 1);
10766
10767 if (VECTOR_MODE_P (mode))
10768 {
10769 if (speed)
10770 *cost += extra_cost->vect.alu;
10771 return true;
10772 }
10773
10774 if (code == AND
10775 && GET_CODE (op0) == MULT
10776 && CONST_INT_P (XEXP (op0, 1))
10777 && CONST_INT_P (op1)
10778 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10779 INTVAL (op1)) != 0)
10780 {
10781 /* This is a UBFM/SBFM. */
10782 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10783 if (speed)
10784 *cost += extra_cost->alu.bfx;
10785 return true;
10786 }
10787
10788 if (is_int_mode (mode, &int_mode))
10789 {
10790 if (CONST_INT_P (op1))
10791 {
10792 /* We have a mask + shift version of a UBFIZ
10793 i.e. the *andim_ashift<mode>_bfiz pattern. */
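/* As a purely illustrative example, an SImode (and (ashift x 3) 0x3f8)
   is the kind of RTL that matches here; it can be emitted as a single
   UBFIZ such as "ubfiz w0, w1, 3, 7", so only the inner operand of the
   shift plus one bfx-class ALU cost need to be accounted for.  */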
10794 if (GET_CODE (op0) == ASHIFT
10795 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10796 XEXP (op0, 1)))
10797 {
10798 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10799 (enum rtx_code) code, 0, speed);
10800 if (speed)
10801 *cost += extra_cost->alu.bfx;
10802
10803 return true;
10804 }
10805 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10806 {
10807 /* We possibly get the immediate for free; this is not
10808 modelled. */
10809 *cost += rtx_cost (op0, int_mode,
10810 (enum rtx_code) code, 0, speed);
10811 if (speed)
10812 *cost += extra_cost->alu.logical;
10813
10814 return true;
10815 }
10816 }
10817 else
10818 {
10819 rtx new_op0 = op0;
10820
10821 /* Handle ORN, EON, or BIC. */
10822 if (GET_CODE (op0) == NOT)
10823 op0 = XEXP (op0, 0);
10824
10825 new_op0 = aarch64_strip_shift (op0);
10826
10827 /* If we had a shift on op0 then this is a logical-shift-
10828 by-register/immediate operation. Otherwise, this is just
10829 a logical operation. */
10830 if (speed)
10831 {
10832 if (new_op0 != op0)
10833 {
10834 /* Shift by immediate. */
10835 if (CONST_INT_P (XEXP (op0, 1)))
10836 *cost += extra_cost->alu.log_shift;
10837 else
10838 *cost += extra_cost->alu.log_shift_reg;
10839 }
10840 else
10841 *cost += extra_cost->alu.logical;
10842 }
10843
10844 /* In both cases we want to cost both operands. */
10845 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10846 0, speed);
10847 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10848 1, speed);
10849
10850 return true;
10851 }
10852 }
10853 return false;
10854
10855 case NOT:
10856 x = XEXP (x, 0);
10857 op0 = aarch64_strip_shift (x);
10858
10859 if (VECTOR_MODE_P (mode))
10860 {
10861 /* Vector NOT. */
10862 *cost += extra_cost->vect.alu;
10863 return false;
10864 }
10865
10866 /* MVN-shifted-reg. */
10867 if (op0 != x)
10868 {
10869 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10870
10871 if (speed)
10872 *cost += extra_cost->alu.log_shift;
10873
10874 return true;
10875 }
10876 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10877 Handle the second form here taking care that 'a' in the above can
10878 be a shift. */
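/* For reference (this only restates the target semantics): EON Xd, Xn, Xm
   computes Xn ^ ~Xm, which is why both (xor (not a) b) and
   (not (xor a b)) can be costed as a single logical instruction,
   optionally with a shifted-register operand.  */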
10879 else if (GET_CODE (op0) == XOR)
10880 {
10881 rtx newop0 = XEXP (op0, 0);
10882 rtx newop1 = XEXP (op0, 1);
10883 rtx op0_stripped = aarch64_strip_shift (newop0);
10884
10885 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10886 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10887
10888 if (speed)
10889 {
10890 if (op0_stripped != newop0)
10891 *cost += extra_cost->alu.log_shift;
10892 else
10893 *cost += extra_cost->alu.logical;
10894 }
10895
10896 return true;
10897 }
10898 /* MVN. */
10899 if (speed)
10900 *cost += extra_cost->alu.logical;
10901
10902 return false;
10903
10904 case ZERO_EXTEND:
10905
10906 op0 = XEXP (x, 0);
10907 /* If a value is written in SI mode, then zero extended to DI
10908 mode, the operation will in general be free as a write to
10909 a 'w' register implicitly zeroes the upper bits of an 'x'
10910 register. However, if this is
10911
10912 (set (reg) (zero_extend (reg)))
10913
10914 we must cost the explicit register move. */
10915 if (mode == DImode
10916 && GET_MODE (op0) == SImode
10917 && outer == SET)
10918 {
10919 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10920
10921 /* If OP_COST is non-zero, then the cost of the zero extend
10922 is effectively the cost of the inner operation. Otherwise
10923 we have a MOV instruction and we take the cost from the MOV
10924 itself. This is true independently of whether we are
10925 optimizing for space or time. */
10926 if (op_cost)
10927 *cost = op_cost;
10928
10929 return true;
10930 }
10931 else if (MEM_P (op0))
10932 {
10933 /* All loads can zero extend to any size for free. */
10934 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10935 return true;
10936 }
10937
10938 op0 = aarch64_extend_bitfield_pattern_p (x);
10939 if (op0)
10940 {
10941 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10942 if (speed)
10943 *cost += extra_cost->alu.bfx;
10944 return true;
10945 }
10946
10947 if (speed)
10948 {
10949 if (VECTOR_MODE_P (mode))
10950 {
10951 /* UMOV. */
10952 *cost += extra_cost->vect.alu;
10953 }
10954 else
10955 {
10956 /* We generate an AND instead of UXTB/UXTH. */
10957 *cost += extra_cost->alu.logical;
10958 }
10959 }
10960 return false;
10961
10962 case SIGN_EXTEND:
10963 if (MEM_P (XEXP (x, 0)))
10964 {
10965 /* LDRSH. */
10966 if (speed)
10967 {
10968 rtx address = XEXP (XEXP (x, 0), 0);
10969 *cost += extra_cost->ldst.load_sign_extend;
10970
10971 *cost +=
10972 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10973 0, speed));
10974 }
10975 return true;
10976 }
10977
10978 op0 = aarch64_extend_bitfield_pattern_p (x);
10979 if (op0)
10980 {
10981 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10982 if (speed)
10983 *cost += extra_cost->alu.bfx;
10984 return true;
10985 }
10986
10987 if (speed)
10988 {
10989 if (VECTOR_MODE_P (mode))
10990 *cost += extra_cost->vect.alu;
10991 else
10992 *cost += extra_cost->alu.extend;
10993 }
10994 return false;
10995
10996 case ASHIFT:
10997 op0 = XEXP (x, 0);
10998 op1 = XEXP (x, 1);
10999
11000 if (CONST_INT_P (op1))
11001 {
11002 if (speed)
11003 {
11004 if (VECTOR_MODE_P (mode))
11005 {
11006 /* Vector shift (immediate). */
11007 *cost += extra_cost->vect.alu;
11008 }
11009 else
11010 {
11011 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11012 aliases. */
11013 *cost += extra_cost->alu.shift;
11014 }
11015 }
11016
11017 /* We can incorporate zero/sign extend for free. */
11018 if (GET_CODE (op0) == ZERO_EXTEND
11019 || GET_CODE (op0) == SIGN_EXTEND)
11020 op0 = XEXP (op0, 0);
11021
11022 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11023 return true;
11024 }
11025 else
11026 {
11027 if (VECTOR_MODE_P (mode))
11028 {
11029 if (speed)
11030 /* Vector shift (register). */
11031 *cost += extra_cost->vect.alu;
11032 }
11033 else
11034 {
11035 if (speed)
11036 /* LSLV. */
11037 *cost += extra_cost->alu.shift_reg;
11038
11039 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11040 && CONST_INT_P (XEXP (op1, 1))
11041 && known_eq (INTVAL (XEXP (op1, 1)),
11042 GET_MODE_BITSIZE (mode) - 1))
11043 {
11044 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11045 /* We already demanded XEXP (op1, 0) to be REG_P, so
11046 don't recurse into it. */
11047 return true;
11048 }
11049 }
11050 return false; /* All arguments need to be in registers. */
11051 }
11052
11053 case ROTATE:
11054 case ROTATERT:
11055 case LSHIFTRT:
11056 case ASHIFTRT:
11057 op0 = XEXP (x, 0);
11058 op1 = XEXP (x, 1);
11059
11060 if (CONST_INT_P (op1))
11061 {
11062 /* ASR (immediate) and friends. */
11063 if (speed)
11064 {
11065 if (VECTOR_MODE_P (mode))
11066 *cost += extra_cost->vect.alu;
11067 else
11068 *cost += extra_cost->alu.shift;
11069 }
11070
11071 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11072 return true;
11073 }
11074 else
11075 {
11076 if (VECTOR_MODE_P (mode))
11077 {
11078 if (speed)
11079 /* Vector shift (register). */
11080 *cost += extra_cost->vect.alu;
11081 }
11082 else
11083 {
11084 if (speed)
11085 /* ASR (register) and friends. */
11086 *cost += extra_cost->alu.shift_reg;
11087
11088 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11089 && CONST_INT_P (XEXP (op1, 1))
11090 && known_eq (INTVAL (XEXP (op1, 1)),
11091 GET_MODE_BITSIZE (mode) - 1))
11092 {
11093 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11094 /* We already demanded XEXP (op1, 0) to be REG_P, so
11095 don't recurse into it. */
11096 return true;
11097 }
11098 }
11099 return false; /* All arguments need to be in registers. */
11100 }
11101
11102 case SYMBOL_REF:
11103
11104 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11105 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11106 {
11107 /* LDR. */
11108 if (speed)
11109 *cost += extra_cost->ldst.load;
11110 }
11111 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11112 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11113 {
11114 /* ADRP, followed by ADD. */
11115 *cost += COSTS_N_INSNS (1);
11116 if (speed)
11117 *cost += 2 * extra_cost->alu.arith;
11118 }
11119 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11120 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11121 {
11122 /* ADR. */
11123 if (speed)
11124 *cost += extra_cost->alu.arith;
11125 }
11126
11127 if (flag_pic)
11128 {
11129 /* One extra load instruction, after accessing the GOT. */
11130 *cost += COSTS_N_INSNS (1);
11131 if (speed)
11132 *cost += extra_cost->ldst.load;
11133 }
11134 return true;
11135
11136 case HIGH:
11137 case LO_SUM:
11138 /* ADRP/ADD (immediate). */
11139 if (speed)
11140 *cost += extra_cost->alu.arith;
11141 return true;
11142
11143 case ZERO_EXTRACT:
11144 case SIGN_EXTRACT:
11145 /* UBFX/SBFX. */
11146 if (speed)
11147 {
11148 if (VECTOR_MODE_P (mode))
11149 *cost += extra_cost->vect.alu;
11150 else
11151 *cost += extra_cost->alu.bfx;
11152 }
11153
11154 /* We can trust that the immediates used will be correct (there
11155 are no by-register forms), so we need only cost op0. */
11156 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11157 return true;
11158
11159 case MULT:
11160 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11161 /* aarch64_rtx_mult_cost always handles recursion to its
11162 operands. */
11163 return true;
11164
11165 case MOD:
11166 /* We can expand signed mod by power of 2 using a NEGS, two parallel
11167 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
11168 an unconditional negate. This case should only ever be reached through
11169 the set_smod_pow2_cheap check in expmed.c. */
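/* As an illustrative sketch (register numbers chosen arbitrarily), an
   SImode "x % 4" is expected to expand along the lines of:

     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi

   which matches the four-instruction baseline set below.  */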
11170 if (CONST_INT_P (XEXP (x, 1))
11171 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11172 && (mode == SImode || mode == DImode))
11173 {
11174 /* We expand to 4 instructions. Reset the baseline. */
11175 *cost = COSTS_N_INSNS (4);
11176
11177 if (speed)
11178 *cost += 2 * extra_cost->alu.logical
11179 + 2 * extra_cost->alu.arith;
11180
11181 return true;
11182 }
11183
11184 /* Fall-through. */
11185 case UMOD:
11186 if (speed)
11187 {
11188 /* Slightly prefer UMOD over SMOD. */
11189 if (VECTOR_MODE_P (mode))
11190 *cost += extra_cost->vect.alu;
11191 else if (GET_MODE_CLASS (mode) == MODE_INT)
11192 *cost += (extra_cost->mult[mode == DImode].add
11193 + extra_cost->mult[mode == DImode].idiv
11194 + (code == MOD ? 1 : 0));
11195 }
11196 return false; /* All arguments need to be in registers. */
11197
11198 case DIV:
11199 case UDIV:
11200 case SQRT:
11201 if (speed)
11202 {
11203 if (VECTOR_MODE_P (mode))
11204 *cost += extra_cost->vect.alu;
11205 else if (GET_MODE_CLASS (mode) == MODE_INT)
11206 /* There is no integer SQRT, so only DIV and UDIV can get
11207 here. */
11208 *cost += (extra_cost->mult[mode == DImode].idiv
11209 /* Slightly prefer UDIV over SDIV. */
11210 + (code == DIV ? 1 : 0));
11211 else
11212 *cost += extra_cost->fp[mode == DFmode].div;
11213 }
11214 return false; /* All arguments need to be in registers. */
11215
11216 case IF_THEN_ELSE:
11217 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11218 XEXP (x, 2), cost, speed);
11219
11220 case EQ:
11221 case NE:
11222 case GT:
11223 case GTU:
11224 case LT:
11225 case LTU:
11226 case GE:
11227 case GEU:
11228 case LE:
11229 case LEU:
11230
11231 return false; /* All arguments must be in registers. */
11232
11233 case FMA:
11234 op0 = XEXP (x, 0);
11235 op1 = XEXP (x, 1);
11236 op2 = XEXP (x, 2);
11237
11238 if (speed)
11239 {
11240 if (VECTOR_MODE_P (mode))
11241 *cost += extra_cost->vect.alu;
11242 else
11243 *cost += extra_cost->fp[mode == DFmode].fma;
11244 }
11245
11246 /* FMSUB, FNMADD, and FNMSUB are free. */
11247 if (GET_CODE (op0) == NEG)
11248 op0 = XEXP (op0, 0);
11249
11250 if (GET_CODE (op2) == NEG)
11251 op2 = XEXP (op2, 0);
11252
11253 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11254 and the by-element operand as operand 0. */
11255 if (GET_CODE (op1) == NEG)
11256 op1 = XEXP (op1, 0);
11257
11258 /* Catch vector-by-element operations. The by-element operand can
11259 either be (vec_duplicate (vec_select (x))) or just
11260 (vec_select (x)), depending on whether we are multiplying by
11261 a vector or a scalar.
11262
11263 Canonicalization is not very good in these cases: FMA4 will put the
11264 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11265 if (GET_CODE (op0) == VEC_DUPLICATE)
11266 op0 = XEXP (op0, 0);
11267 else if (GET_CODE (op1) == VEC_DUPLICATE)
11268 op1 = XEXP (op1, 0);
11269
11270 if (GET_CODE (op0) == VEC_SELECT)
11271 op0 = XEXP (op0, 0);
11272 else if (GET_CODE (op1) == VEC_SELECT)
11273 op1 = XEXP (op1, 0);
11274
11275 /* If the remaining parameters are not registers,
11276 get the cost to put them into registers. */
11277 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11278 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11279 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11280 return true;
11281
11282 case FLOAT:
11283 case UNSIGNED_FLOAT:
11284 if (speed)
11285 *cost += extra_cost->fp[mode == DFmode].fromint;
11286 return false;
11287
11288 case FLOAT_EXTEND:
11289 if (speed)
11290 {
11291 if (VECTOR_MODE_P (mode))
11292 {
11293 /* Vector widening conversion. */
11294 *cost += extra_cost->vect.alu;
11295 }
11296 else
11297 *cost += extra_cost->fp[mode == DFmode].widen;
11298 }
11299 return false;
11300
11301 case FLOAT_TRUNCATE:
11302 if (speed)
11303 {
11304 if (VECTOR_MODE_P (mode))
11305 {
11306 /* Vector conversion. */
11307 *cost += extra_cost->vect.alu;
11308 }
11309 else
11310 *cost += extra_cost->fp[mode == DFmode].narrow;
11311 }
11312 return false;
11313
11314 case FIX:
11315 case UNSIGNED_FIX:
11316 x = XEXP (x, 0);
11317 /* Strip the rounding part. They will all be implemented
11318 by the fcvt* family of instructions anyway. */
11319 if (GET_CODE (x) == UNSPEC)
11320 {
11321 unsigned int uns_code = XINT (x, 1);
11322
11323 if (uns_code == UNSPEC_FRINTA
11324 || uns_code == UNSPEC_FRINTM
11325 || uns_code == UNSPEC_FRINTN
11326 || uns_code == UNSPEC_FRINTP
11327 || uns_code == UNSPEC_FRINTZ)
11328 x = XVECEXP (x, 0, 0);
11329 }
11330
11331 if (speed)
11332 {
11333 if (VECTOR_MODE_P (mode))
11334 *cost += extra_cost->vect.alu;
11335 else
11336 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11337 }
11338
11339 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11340 fixed-point fcvt. */
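/* For example (an illustrative case rather than an exhaustive one),
   (fix:SI (mult:SF x 4.0)) can be implemented as a single
   "fcvtzs w0, s0, #2", i.e. a fixed-point convert with two fractional
   bits, so only the first operand of the multiplication is costed.  */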
11341 if (GET_CODE (x) == MULT
11342 && ((VECTOR_MODE_P (mode)
11343 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11344 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11345 {
11346 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11347 0, speed);
11348 return true;
11349 }
11350
11351 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11352 return true;
11353
11354 case ABS:
11355 if (VECTOR_MODE_P (mode))
11356 {
11357 /* ABS (vector). */
11358 if (speed)
11359 *cost += extra_cost->vect.alu;
11360 }
11361 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11362 {
11363 op0 = XEXP (x, 0);
11364
11365 /* FABD, which is analogous to FADD. */
11366 if (GET_CODE (op0) == MINUS)
11367 {
11368 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11369 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11370 if (speed)
11371 *cost += extra_cost->fp[mode == DFmode].addsub;
11372
11373 return true;
11374 }
11375 /* Simple FABS is analogous to FNEG. */
11376 if (speed)
11377 *cost += extra_cost->fp[mode == DFmode].neg;
11378 }
11379 else
11380 {
11381 /* Integer ABS will either be split to
11382 two arithmetic instructions, or will be an ABS
11383 (scalar), which we don't model. */
11384 *cost = COSTS_N_INSNS (2);
11385 if (speed)
11386 *cost += 2 * extra_cost->alu.arith;
11387 }
11388 return false;
11389
11390 case SMAX:
11391 case SMIN:
11392 if (speed)
11393 {
11394 if (VECTOR_MODE_P (mode))
11395 *cost += extra_cost->vect.alu;
11396 else
11397 {
11398 /* FMAXNM/FMINNM/FMAX/FMIN.
11399 TODO: This may not be accurate for all implementations, but
11400 we do not model this in the cost tables. */
11401 *cost += extra_cost->fp[mode == DFmode].addsub;
11402 }
11403 }
11404 return false;
11405
11406 case UNSPEC:
11407 /* The floating point round to integer frint* instructions. */
11408 if (aarch64_frint_unspec_p (XINT (x, 1)))
11409 {
11410 if (speed)
11411 *cost += extra_cost->fp[mode == DFmode].roundint;
11412
11413 return false;
11414 }
11415
11416 if (XINT (x, 1) == UNSPEC_RBIT)
11417 {
11418 if (speed)
11419 *cost += extra_cost->alu.rev;
11420
11421 return false;
11422 }
11423 break;
11424
11425 case TRUNCATE:
11426
11427 /* Decompose <su>muldi3_highpart. */
11428 if (/* (truncate:DI */
11429 mode == DImode
11430 /* (lshiftrt:TI */
11431 && GET_MODE (XEXP (x, 0)) == TImode
11432 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11433 /* (mult:TI */
11434 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11435 /* (ANY_EXTEND:TI (reg:DI))
11436 (ANY_EXTEND:TI (reg:DI))) */
11437 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11438 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11439 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11440 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11441 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11442 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11443 /* (const_int 64) */
11444 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11445 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11446 {
11447 /* UMULH/SMULH. */
11448 if (speed)
11449 *cost += extra_cost->mult[mode == DImode].extend;
11450 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11451 mode, MULT, 0, speed);
11452 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11453 mode, MULT, 1, speed);
11454 return true;
11455 }
11456
11457 /* Fall through. */
11458 default:
11459 break;
11460 }
11461
11462 if (dump_file
11463 && flag_aarch64_verbose_cost)
11464 fprintf (dump_file,
11465 "\nFailed to cost RTX. Assuming default cost.\n");
11466
11467 return true;
11468 }
11469
11470 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11471 calculated for X. This cost is stored in *COST. Returns true
11472 if the total cost of X was calculated. */
11473 static bool
11474 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11475 int param, int *cost, bool speed)
11476 {
11477 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11478
11479 if (dump_file
11480 && flag_aarch64_verbose_cost)
11481 {
11482 print_rtl_single (dump_file, x);
11483 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11484 speed ? "Hot" : "Cold",
11485 *cost, result ? "final" : "partial");
11486 }
11487
11488 return result;
11489 }
11490
11491 static int
11492 aarch64_register_move_cost (machine_mode mode,
11493 reg_class_t from_i, reg_class_t to_i)
11494 {
11495 enum reg_class from = (enum reg_class) from_i;
11496 enum reg_class to = (enum reg_class) to_i;
11497 const struct cpu_regmove_cost *regmove_cost
11498 = aarch64_tune_params.regmove_cost;
11499
11500 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11501 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11502 to = GENERAL_REGS;
11503
11504 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11505 from = GENERAL_REGS;
11506
11507 /* Moving between GPR and stack cost is the same as GP2GP. */
11508 if ((from == GENERAL_REGS && to == STACK_REG)
11509 || (to == GENERAL_REGS && from == STACK_REG))
11510 return regmove_cost->GP2GP;
11511
11512 /* To/From the stack register, we move via the gprs. */
11513 if (to == STACK_REG || from == STACK_REG)
11514 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11515 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11516
11517 if (known_eq (GET_MODE_SIZE (mode), 16))
11518 {
11519 /* 128-bit operations on general registers require 2 instructions. */
11520 if (from == GENERAL_REGS && to == GENERAL_REGS)
11521 return regmove_cost->GP2GP * 2;
11522 else if (from == GENERAL_REGS)
11523 return regmove_cost->GP2FP * 2;
11524 else if (to == GENERAL_REGS)
11525 return regmove_cost->FP2GP * 2;
11526
11527 /* When AdvSIMD instructions are disabled it is not possible to move
11528 a 128-bit value directly between Q registers. This is handled in
11529 secondary reload. A general register is used as a scratch to move
11530 the upper DI value, and the lower DI value is moved directly,
11531 hence the cost is the sum of three moves. */
11532 if (! TARGET_SIMD)
11533 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11534
11535 return regmove_cost->FP2FP;
11536 }
11537
11538 if (from == GENERAL_REGS && to == GENERAL_REGS)
11539 return regmove_cost->GP2GP;
11540 else if (from == GENERAL_REGS)
11541 return regmove_cost->GP2FP;
11542 else if (to == GENERAL_REGS)
11543 return regmove_cost->FP2GP;
11544
11545 return regmove_cost->FP2FP;
11546 }
11547
11548 static int
11549 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11550 reg_class_t rclass ATTRIBUTE_UNUSED,
11551 bool in ATTRIBUTE_UNUSED)
11552 {
11553 return aarch64_tune_params.memmov_cost;
11554 }
11555
11556 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11557 to optimize 1.0/sqrt. */
11558
11559 static bool
11560 use_rsqrt_p (machine_mode mode)
11561 {
11562 return (!flag_trapping_math
11563 && flag_unsafe_math_optimizations
11564 && ((aarch64_tune_params.approx_modes->recip_sqrt
11565 & AARCH64_APPROX_MODE (mode))
11566 || flag_mrecip_low_precision_sqrt));
11567 }
11568
11569 /* Function to decide when to use the approximate reciprocal square root
11570 builtin. */
11571
11572 static tree
11573 aarch64_builtin_reciprocal (tree fndecl)
11574 {
11575 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11576
11577 if (!use_rsqrt_p (mode))
11578 return NULL_TREE;
11579 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11580 }
11581
11582 /* Emit instruction sequence to compute either the approximate square root
11583 or its approximate reciprocal, depending on the flag RECP, and return
11584 whether the sequence was emitted or not. */
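/* A descriptive summary of the sequence below: FRSQRTE provides an
   initial estimate e of 1/sqrt(SRC), and each FRSQRTS step computes
   (3 - SRC * e * e) / 2, so the estimate is refined as
   e' = e * frsqrts (SRC, e * e); the final accuracy therefore depends
   on the iteration count chosen below.  */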
11585
11586 bool
11587 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11588 {
11589 machine_mode mode = GET_MODE (dst);
11590
11591 if (GET_MODE_INNER (mode) == HFmode)
11592 {
11593 gcc_assert (!recp);
11594 return false;
11595 }
11596
11597 if (!recp)
11598 {
11599 if (!(flag_mlow_precision_sqrt
11600 || (aarch64_tune_params.approx_modes->sqrt
11601 & AARCH64_APPROX_MODE (mode))))
11602 return false;
11603
11604 if (flag_finite_math_only
11605 || flag_trapping_math
11606 || !flag_unsafe_math_optimizations
11607 || optimize_function_for_size_p (cfun))
11608 return false;
11609 }
11610 else
11611 /* Caller assumes we cannot fail. */
11612 gcc_assert (use_rsqrt_p (mode));
11613
11614 machine_mode mmsk = mode_for_int_vector (mode).require ();
11615 rtx xmsk = gen_reg_rtx (mmsk);
11616 if (!recp)
11617 /* When calculating the approximate square root, compare the
11618 argument with 0.0 and create a mask. */
11619 emit_insn (gen_rtx_SET (xmsk,
11620 gen_rtx_NEG (mmsk,
11621 gen_rtx_EQ (mmsk, src,
11622 CONST0_RTX (mode)))));
11623
11624 /* Estimate the approximate reciprocal square root. */
11625 rtx xdst = gen_reg_rtx (mode);
11626 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11627
11628 /* Iterate over the series twice for SF and thrice for DF. */
11629 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11630
11631 /* Optionally iterate over the series once less for faster performance
11632 while sacrificing the accuracy. */
11633 if ((recp && flag_mrecip_low_precision_sqrt)
11634 || (!recp && flag_mlow_precision_sqrt))
11635 iterations--;
11636
11637 /* Iterate over the series to calculate the approximate reciprocal square
11638 root. */
11639 rtx x1 = gen_reg_rtx (mode);
11640 while (iterations--)
11641 {
11642 rtx x2 = gen_reg_rtx (mode);
11643 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11644
11645 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11646
11647 if (iterations > 0)
11648 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11649 }
11650
11651 if (!recp)
11652 {
11653 /* Qualify the approximate reciprocal square root when the argument is
11654 0.0 by squashing the intermediary result to 0.0. */
11655 rtx xtmp = gen_reg_rtx (mmsk);
11656 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11657 gen_rtx_SUBREG (mmsk, xdst, 0)));
11658 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11659
11660 /* Calculate the approximate square root. */
11661 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11662 }
11663
11664 /* Finalize the approximation. */
11665 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11666
11667 return true;
11668 }
11669
11670 /* Emit the instruction sequence to compute the approximation for the division
11671 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
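/* A descriptive summary of the sequence below: FRECPE provides an
   initial estimate e of 1/DEN, each FRECPS step computes 2 - DEN * e,
   and the estimate is refined as e' = e * frecps (DEN, e) before the
   final multiplication by NUM.  */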
11672
11673 bool
11674 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11675 {
11676 machine_mode mode = GET_MODE (quo);
11677
11678 if (GET_MODE_INNER (mode) == HFmode)
11679 return false;
11680
11681 bool use_approx_division_p = (flag_mlow_precision_div
11682 || (aarch64_tune_params.approx_modes->division
11683 & AARCH64_APPROX_MODE (mode)));
11684
11685 if (!flag_finite_math_only
11686 || flag_trapping_math
11687 || !flag_unsafe_math_optimizations
11688 || optimize_function_for_size_p (cfun)
11689 || !use_approx_division_p)
11690 return false;
11691
11692 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11693 return false;
11694
11695 /* Estimate the approximate reciprocal. */
11696 rtx xrcp = gen_reg_rtx (mode);
11697 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11698
11699 /* Iterate over the series twice for SF and thrice for DF. */
11700 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11701
11702 /* Optionally iterate over the series once less for faster performance,
11703 while sacrificing the accuracy. */
11704 if (flag_mlow_precision_div)
11705 iterations--;
11706
11707 /* Iterate over the series to calculate the approximate reciprocal. */
11708 rtx xtmp = gen_reg_rtx (mode);
11709 while (iterations--)
11710 {
11711 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11712
11713 if (iterations > 0)
11714 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11715 }
11716
11717 if (num != CONST1_RTX (mode))
11718 {
11719 /* As the approximate reciprocal of DEN is already calculated, only
11720 calculate the approximate division when NUM is not 1.0. */
11721 rtx xnum = force_reg (mode, num);
11722 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11723 }
11724
11725 /* Finalize the approximation. */
11726 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11727 return true;
11728 }
11729
11730 /* Return the number of instructions that can be issued per cycle. */
11731 static int
11732 aarch64_sched_issue_rate (void)
11733 {
11734 return aarch64_tune_params.issue_rate;
11735 }
11736
11737 static int
11738 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11739 {
11740 int issue_rate = aarch64_sched_issue_rate ();
11741
11742 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11743 }
11744
11745
11746 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11747 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11748 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11749
11750 static int
11751 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11752 int ready_index)
11753 {
11754 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11755 }
11756
11757
11758 /* Vectorizer cost model target hooks. */
11759
11760 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11761 static int
11762 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11763 tree vectype,
11764 int misalign ATTRIBUTE_UNUSED)
11765 {
11766 unsigned elements;
11767 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11768 bool fp = false;
11769
11770 if (vectype != NULL)
11771 fp = FLOAT_TYPE_P (vectype);
11772
11773 switch (type_of_cost)
11774 {
11775 case scalar_stmt:
11776 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11777
11778 case scalar_load:
11779 return costs->scalar_load_cost;
11780
11781 case scalar_store:
11782 return costs->scalar_store_cost;
11783
11784 case vector_stmt:
11785 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11786
11787 case vector_load:
11788 return costs->vec_align_load_cost;
11789
11790 case vector_store:
11791 return costs->vec_store_cost;
11792
11793 case vec_to_scalar:
11794 return costs->vec_to_scalar_cost;
11795
11796 case scalar_to_vec:
11797 return costs->scalar_to_vec_cost;
11798
11799 case unaligned_load:
11800 case vector_gather_load:
11801 return costs->vec_unalign_load_cost;
11802
11803 case unaligned_store:
11804 case vector_scatter_store:
11805 return costs->vec_unalign_store_cost;
11806
11807 case cond_branch_taken:
11808 return costs->cond_taken_branch_cost;
11809
11810 case cond_branch_not_taken:
11811 return costs->cond_not_taken_branch_cost;
11812
11813 case vec_perm:
11814 return costs->vec_permute_cost;
11815
11816 case vec_promote_demote:
11817 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11818
11819 case vec_construct:
11820 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11821 return elements / 2 + 1;
11822
11823 default:
11824 gcc_unreachable ();
11825 }
11826 }
11827
11828 /* Implement targetm.vectorize.add_stmt_cost. */
11829 static unsigned
11830 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11831 struct _stmt_vec_info *stmt_info, int misalign,
11832 enum vect_cost_model_location where)
11833 {
11834 unsigned *cost = (unsigned *) data;
11835 unsigned retval = 0;
11836
11837 if (flag_vect_cost_model)
11838 {
11839 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11840 int stmt_cost =
11841 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11842
11843 /* Statements in an inner loop relative to the loop being
11844 vectorized are weighted more heavily. The value here is
11845 arbitrary and could potentially be improved with analysis. */
11846 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11847 count *= 50; /* FIXME */
11848
11849 retval = (unsigned) (count * stmt_cost);
11850 cost[where] += retval;
11851 }
11852
11853 return retval;
11854 }
11855
11856 static void initialize_aarch64_code_model (struct gcc_options *);
11857
11858 /* Parse the TO_PARSE string and put the architecture struct that it
11859 selects into RES and the architectural features into ISA_FLAGS.
11860 Return an aarch64_parse_opt_result describing the parse result.
11861 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11862 When the TO_PARSE string contains an invalid extension,
11863 a copy of the string is created and stored to INVALID_EXTENSION. */
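/* For example (values given purely as an illustration), TO_PARSE might
   be "armv8.2-a" or "armv8.2-a+crypto+nofp"; the authoritative names
   come from the all_architectures table and the extension list.  */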
11864
11865 static enum aarch64_parse_opt_result
11866 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11867 uint64_t *isa_flags, std::string *invalid_extension)
11868 {
11869 const char *ext;
11870 const struct processor *arch;
11871 size_t len;
11872
11873 ext = strchr (to_parse, '+');
11874
11875 if (ext != NULL)
11876 len = ext - to_parse;
11877 else
11878 len = strlen (to_parse);
11879
11880 if (len == 0)
11881 return AARCH64_PARSE_MISSING_ARG;
11882
11883
11884 /* Loop through the list of supported ARCHes to find a match. */
11885 for (arch = all_architectures; arch->name != NULL; arch++)
11886 {
11887 if (strlen (arch->name) == len
11888 && strncmp (arch->name, to_parse, len) == 0)
11889 {
11890 uint64_t isa_temp = arch->flags;
11891
11892 if (ext != NULL)
11893 {
11894 /* TO_PARSE string contains at least one extension. */
11895 enum aarch64_parse_opt_result ext_res
11896 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11897
11898 if (ext_res != AARCH64_PARSE_OK)
11899 return ext_res;
11900 }
11901 /* Extension parsing was successful. Confirm the result
11902 arch and ISA flags. */
11903 *res = arch;
11904 *isa_flags = isa_temp;
11905 return AARCH64_PARSE_OK;
11906 }
11907 }
11908
11909 /* ARCH name not found in list. */
11910 return AARCH64_PARSE_INVALID_ARG;
11911 }
11912
11913 /* Parse the TO_PARSE string and put the result tuning in RES and the
11914 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11915 describing the parse result. If there is an error parsing, RES and
11916 ISA_FLAGS are left unchanged.
11917 When the TO_PARSE string contains an invalid extension,
11918 a copy of the string is created and stored to INVALID_EXTENSION. */
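/* For example (values given purely as an illustration), TO_PARSE might
   be "cortex-a57" or "cortex-a57+nocrypto"; the authoritative names
   come from the all_cores table and the extension list.  */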
11919
11920 static enum aarch64_parse_opt_result
11921 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11922 uint64_t *isa_flags, std::string *invalid_extension)
11923 {
11924 const char *ext;
11925 const struct processor *cpu;
11926 size_t len;
11927
11928 ext = strchr (to_parse, '+');
11929
11930 if (ext != NULL)
11931 len = ext - to_parse;
11932 else
11933 len = strlen (to_parse);
11934
11935 if (len == 0)
11936 return AARCH64_PARSE_MISSING_ARG;
11937
11938
11939 /* Loop through the list of supported CPUs to find a match. */
11940 for (cpu = all_cores; cpu->name != NULL; cpu++)
11941 {
11942 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11943 {
11944 uint64_t isa_temp = cpu->flags;
11945
11946
11947 if (ext != NULL)
11948 {
11949 /* TO_PARSE string contains at least one extension. */
11950 enum aarch64_parse_opt_result ext_res
11951 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11952
11953 if (ext_res != AARCH64_PARSE_OK)
11954 return ext_res;
11955 }
11956 /* Extension parsing was successful. Confirm the result
11957 cpu and ISA flags. */
11958 *res = cpu;
11959 *isa_flags = isa_temp;
11960 return AARCH64_PARSE_OK;
11961 }
11962 }
11963
11964 /* CPU name not found in list. */
11965 return AARCH64_PARSE_INVALID_ARG;
11966 }
11967
11968 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11969 Return an aarch64_parse_opt_result describing the parse result.
11970 If the parsing fails, RES does not change. */
11971
11972 static enum aarch64_parse_opt_result
11973 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11974 {
11975 const struct processor *cpu;
11976
11977 /* Loop through the list of supported CPUs to find a match. */
11978 for (cpu = all_cores; cpu->name != NULL; cpu++)
11979 {
11980 if (strcmp (cpu->name, to_parse) == 0)
11981 {
11982 *res = cpu;
11983 return AARCH64_PARSE_OK;
11984 }
11985 }
11986
11987 /* CPU name not found in list. */
11988 return AARCH64_PARSE_INVALID_ARG;
11989 }
11990
11991 /* Parse TOKEN, which has length LENGTH to see if it is an option
11992 described in FLAG. If it is, return the index bit for that fusion type.
11993 If not, error (printing OPTION_NAME) and return zero. */
11994
11995 static unsigned int
11996 aarch64_parse_one_option_token (const char *token,
11997 size_t length,
11998 const struct aarch64_flag_desc *flag,
11999 const char *option_name)
12000 {
12001 for (; flag->name != NULL; flag++)
12002 {
12003 if (length == strlen (flag->name)
12004 && !strncmp (flag->name, token, length))
12005 return flag->flag;
12006 }
12007
12008 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12009 return 0;
12010 }
12011
12012 /* Parse OPTION which is a comma-separated list of flags to enable.
12013 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12014 default state we inherit from the CPU tuning structures. OPTION_NAME
12015 gives the top-level option we are parsing in the -moverride string,
12016 for use in error messages. */
12017
12018 static unsigned int
12019 aarch64_parse_boolean_options (const char *option,
12020 const struct aarch64_flag_desc *flags,
12021 unsigned int initial_state,
12022 const char *option_name)
12023 {
12024 const char separator = '.';
12025 const char* specs = option;
12026 const char* ntoken = option;
12027 unsigned int found_flags = initial_state;
12028
12029 while ((ntoken = strchr (specs, separator)))
12030 {
12031 size_t token_length = ntoken - specs;
12032 unsigned token_ops = aarch64_parse_one_option_token (specs,
12033 token_length,
12034 flags,
12035 option_name);
12036 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12037 in the token stream, reset the supported operations. So:
12038
12039 adrp+add.cmp+branch.none.adrp+add
12040
12041 would have the result of turning on only adrp+add fusion. */
12042 if (!token_ops)
12043 found_flags = 0;
12044
12045 found_flags |= token_ops;
12046 specs = ++ntoken;
12047 }
12048
12049 /* The string ended with a trailing separator, which is ill-formed. */
12050 if (!(*specs))
12051 {
12052 error ("%s string ill-formed\n", option_name);
12053 return 0;
12054 }
12055
12056 /* We still have one more token to parse. */
12057 size_t token_length = strlen (specs);
12058 unsigned token_ops = aarch64_parse_one_option_token (specs,
12059 token_length,
12060 flags,
12061 option_name);
12062 if (!token_ops)
12063 found_flags = 0;
12064
12065 found_flags |= token_ops;
12066 return found_flags;
12067 }
12068
12069 /* Support for overriding instruction fusion. */
12070
12071 static void
12072 aarch64_parse_fuse_string (const char *fuse_string,
12073 struct tune_params *tune)
12074 {
12075 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12076 aarch64_fusible_pairs,
12077 tune->fusible_ops,
12078 "fuse=");
12079 }
12080
12081 /* Support for overriding other tuning flags. */
12082
12083 static void
12084 aarch64_parse_tune_string (const char *tune_string,
12085 struct tune_params *tune)
12086 {
12087 tune->extra_tuning_flags
12088 = aarch64_parse_boolean_options (tune_string,
12089 aarch64_tuning_flags,
12090 tune->extra_tuning_flags,
12091 "tune=");
12092 }
12093
12094 /* Parse the sve_width tuning moverride string in TUNE_STRING.
12095 Accept the valid SVE vector widths allowed by
12096 aarch64_sve_vector_bits_enum and use it to override sve_width
12097 in TUNE. */
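/* For instance, "-moverride=sve_width=256" (an illustrative invocation)
   would set TUNE->sve_width to SVE_256.  */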
12098
12099 static void
12100 aarch64_parse_sve_width_string (const char *tune_string,
12101 struct tune_params *tune)
12102 {
12103 int width = -1;
12104
12105 int n = sscanf (tune_string, "%d", &width);
12106 if (n == EOF)
12107 {
12108 error ("invalid format for sve_width");
12109 return;
12110 }
12111 switch (width)
12112 {
12113 case SVE_128:
12114 case SVE_256:
12115 case SVE_512:
12116 case SVE_1024:
12117 case SVE_2048:
12118 break;
12119 default:
12120 error ("invalid sve_width value: %d", width);
12121 }
12122 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12123 }
12124
12125 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
12126 we understand. If it is, extract the option string and hand it off to
12127 the appropriate function. */
12128
12129 void
12130 aarch64_parse_one_override_token (const char* token,
12131 size_t length,
12132 struct tune_params *tune)
12133 {
12134 const struct aarch64_tuning_override_function *fn
12135 = aarch64_tuning_override_functions;
12136
12137 const char *option_part = strchr (token, '=');
12138 if (!option_part)
12139 {
12140 error ("tuning string missing in option (%s)", token);
12141 return;
12142 }
12143
12144 /* Get the length of the option name. */
12145 length = option_part - token;
12146 /* Skip the '=' to get to the option string. */
12147 option_part++;
12148
12149 for (; fn->name != NULL; fn++)
12150 {
12151 if (!strncmp (fn->name, token, length))
12152 {
12153 fn->parse_override (option_part, tune);
12154 return;
12155 }
12156 }
12157
12158 error ("unknown tuning option (%s)",token);
12159 return;
12160 }
12161
12162 /* Set the default TLS size and clamp it to what the code model in OPTS allows. */
12163
12164 static void
12165 initialize_aarch64_tls_size (struct gcc_options *opts)
12166 {
12167 if (aarch64_tls_size == 0)
12168 aarch64_tls_size = 24;
12169
12170 switch (opts->x_aarch64_cmodel_var)
12171 {
12172 case AARCH64_CMODEL_TINY:
12173 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
12174 needs two instructions to address, so we clamp the size to 24. */
12175 if (aarch64_tls_size > 24)
12176 aarch64_tls_size = 24;
12177 break;
12178 case AARCH64_CMODEL_SMALL:
12179 /* The maximum TLS size allowed under small is 4G. */
12180 if (aarch64_tls_size > 32)
12181 aarch64_tls_size = 32;
12182 break;
12183 case AARCH64_CMODEL_LARGE:
12184 /* The maximum TLS size allowed under large is 16E.
12185 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12186 if (aarch64_tls_size > 48)
12187 aarch64_tls_size = 48;
12188 break;
12189 default:
12190 gcc_unreachable ();
12191 }
12192
12193 return;
12194 }
12195
12196 /* Parse STRING looking for options in the format:
12197 string :: option:string
12198 option :: name=substring
12199 name :: {a-z}
12200 substring :: defined by option. */
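/* For instance (an illustrative value rather than a recommendation),
   "-moverride=fuse=adrp+add.cmp+branch:sve_width=512" contains two
   options separated by ':', each handled by the matching entry in
   aarch64_tuning_override_functions.  */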
12201
12202 static void
12203 aarch64_parse_override_string (const char* input_string,
12204 struct tune_params* tune)
12205 {
12206 const char separator = ':';
12207 size_t string_length = strlen (input_string) + 1;
12208 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12209 char *string = string_root;
12210 strncpy (string, input_string, string_length);
12211 string[string_length - 1] = '\0';
12212
12213 char* ntoken = string;
12214
12215 while ((ntoken = strchr (string, separator)))
12216 {
12217 size_t token_length = ntoken - string;
12218 /* Make this substring look like a string. */
12219 *ntoken = '\0';
12220 aarch64_parse_one_override_token (string, token_length, tune);
12221 string = ++ntoken;
12222 }
12223
12224 /* One last option to parse. */
12225 aarch64_parse_one_override_token (string, strlen (string), tune);
12226 free (string_root);
12227 }
12228
12229
12230 static void
12231 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12232 {
12233 if (accepted_branch_protection_string)
12234 {
12235 opts->x_aarch64_branch_protection_string
12236 = xstrdup (accepted_branch_protection_string);
12237 }
12238
12239 /* PR 70044: We have to be careful about being called multiple times for the
12240 same function. This means all changes should be repeatable. */
12241
12242 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12243 Disable the frame pointer flag so the mid-end will not use a frame
12244 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12245 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12246 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12247 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12248 if (opts->x_flag_omit_frame_pointer == 0)
12249 opts->x_flag_omit_frame_pointer = 2;
12250
12251 /* If not optimizing for size, set the default
12252 alignment to what the target wants. */
12253 if (!opts->x_optimize_size)
12254 {
12255 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12256 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12257 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12258 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12259 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12260 opts->x_str_align_functions = aarch64_tune_params.function_align;
12261 }
12262
12263 /* We default to no pc-relative literal loads. */
12264
12265 aarch64_pcrelative_literal_loads = false;
12266
12267 /* If -mpc-relative-literal-loads is set on the command line, this
12268 implies that the user asked for PC relative literal loads. */
12269 if (opts->x_pcrelative_literal_loads == 1)
12270 aarch64_pcrelative_literal_loads = true;
12271
12272 /* In the tiny memory model it makes no sense to disallow PC relative
12273 literal pool loads. */
12274 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12275 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12276 aarch64_pcrelative_literal_loads = true;
12277
12278 /* When enabling the lower precision Newton series for the square root, also
12279 enable it for the reciprocal square root, since the latter is an
12280 intermediary step for the former. */
12281 if (flag_mlow_precision_sqrt)
12282 flag_mrecip_low_precision_sqrt = true;
12283 }
12284
12285 /* 'Unpack' the internal tuning structs and update the options
12286 in OPTS. The caller must have set up selected_tune and selected_arch
12287 as all the other target-specific codegen decisions are
12288 derived from them. */
12289
12290 void
12291 aarch64_override_options_internal (struct gcc_options *opts)
12292 {
12293 aarch64_tune_flags = selected_tune->flags;
12294 aarch64_tune = selected_tune->sched_core;
12295 /* Make a copy of the tuning parameters attached to the core, which
12296 we may later overwrite. */
12297 aarch64_tune_params = *(selected_tune->tune);
12298 aarch64_architecture_version = selected_arch->architecture_version;
12299
12300 if (opts->x_aarch64_override_tune_string)
12301 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12302 &aarch64_tune_params);
12303
12304 /* This target defaults to strict volatile bitfields. */
12305 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12306 opts->x_flag_strict_volatile_bitfields = 1;
12307
12308 if (aarch64_stack_protector_guard == SSP_GLOBAL
12309 && opts->x_aarch64_stack_protector_guard_offset_str)
12310 {
12311 error ("incompatible options %<-mstack-protector-guard=global%> and "
12312 "%<-mstack-protector-guard-offset=%s%>",
12313 aarch64_stack_protector_guard_offset_str);
12314 }
12315
12316 if (aarch64_stack_protector_guard == SSP_SYSREG
12317 && !(opts->x_aarch64_stack_protector_guard_offset_str
12318 && opts->x_aarch64_stack_protector_guard_reg_str))
12319 {
12320 error ("both %<-mstack-protector-guard-offset%> and "
12321 "%<-mstack-protector-guard-reg%> must be used "
12322 "with %<-mstack-protector-guard=sysreg%>");
12323 }
12324
12325 if (opts->x_aarch64_stack_protector_guard_reg_str)
12326 {
12327 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12328 error ("specify a system register with a small string length.");
12329 }
12330
12331 if (opts->x_aarch64_stack_protector_guard_offset_str)
12332 {
12333 char *end;
12334 const char *str = aarch64_stack_protector_guard_offset_str;
12335 errno = 0;
12336 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12337 if (!*str || *end || errno)
12338 error ("%qs is not a valid offset in %qs", str,
12339 "-mstack-protector-guard-offset=");
12340 aarch64_stack_protector_guard_offset = offs;
12341 }
12342
12343 initialize_aarch64_code_model (opts);
12344 initialize_aarch64_tls_size (opts);
12345
12346 int queue_depth = 0;
12347 switch (aarch64_tune_params.autoprefetcher_model)
12348 {
12349 case tune_params::AUTOPREFETCHER_OFF:
12350 queue_depth = -1;
12351 break;
12352 case tune_params::AUTOPREFETCHER_WEAK:
12353 queue_depth = 0;
12354 break;
12355 case tune_params::AUTOPREFETCHER_STRONG:
12356 queue_depth = max_insn_queue_index + 1;
12357 break;
12358 default:
12359 gcc_unreachable ();
12360 }
12361
12362 /* We don't mind passing in global_options_set here as we don't use
12363 the *options_set structs anyway. */
12364 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12365 queue_depth,
12366 opts->x_param_values,
12367 global_options_set.x_param_values);
12368
12369 /* Set up parameters to be used in prefetching algorithm. Do not
12370 override the defaults unless we are tuning for a core we have
12371 researched values for. */
12372 if (aarch64_tune_params.prefetch->num_slots > 0)
12373 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12374 aarch64_tune_params.prefetch->num_slots,
12375 opts->x_param_values,
12376 global_options_set.x_param_values);
12377 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12378 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12379 aarch64_tune_params.prefetch->l1_cache_size,
12380 opts->x_param_values,
12381 global_options_set.x_param_values);
12382 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12383 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12384 aarch64_tune_params.prefetch->l1_cache_line_size,
12385 opts->x_param_values,
12386 global_options_set.x_param_values);
12387 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12388 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12389 aarch64_tune_params.prefetch->l2_cache_size,
12390 opts->x_param_values,
12391 global_options_set.x_param_values);
12392 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12393 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12394 0,
12395 opts->x_param_values,
12396 global_options_set.x_param_values);
12397 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12398 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12399 aarch64_tune_params.prefetch->minimum_stride,
12400 opts->x_param_values,
12401 global_options_set.x_param_values);
12402
12403 /* Use the alternative scheduling-pressure algorithm by default. */
12404 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12405 opts->x_param_values,
12406 global_options_set.x_param_values);
12407
12408 /* If the user hasn't changed it via configure then set the default to 64 KB
12409 for the backend. */
12410 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12411 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12412 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12413 opts->x_param_values,
12414 global_options_set.x_param_values);
12415
12416 /* Validate the guard size. */
12417 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12418
12419 /* Enforce that the probing interval is the same as the guard size so the
12420 mid-end does the right thing. */
12421 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12422 guard_size,
12423 opts->x_param_values,
12424 global_options_set.x_param_values);
12425
12426 /* The maybe_set calls won't update the value if the user has explicitly set
12427 one, which means we need to validate that the probing interval and guard size
12428 are equal. */
12429 int probe_interval
12430 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12431 if (guard_size != probe_interval)
12432 error ("stack clash guard size %<%d%> must be equal to probing interval "
12433 "%<%d%>", guard_size, probe_interval);
12434
12435 /* Enable software prefetching at the specified optimization level for
12436 CPUs that have prefetch. Lower the optimization level threshold by 1
12437 when profiling is enabled. */
12438 if (opts->x_flag_prefetch_loop_arrays < 0
12439 && !opts->x_optimize_size
12440 && aarch64_tune_params.prefetch->default_opt_level >= 0
12441 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12442 opts->x_flag_prefetch_loop_arrays = 1;
12443
12444 if (opts->x_aarch64_arch_string == NULL)
12445 opts->x_aarch64_arch_string = selected_arch->name;
12446 if (opts->x_aarch64_cpu_string == NULL)
12447 opts->x_aarch64_cpu_string = selected_cpu->name;
12448 if (opts->x_aarch64_tune_string == NULL)
12449 opts->x_aarch64_tune_string = selected_tune->name;
12450
12451 aarch64_override_options_after_change_1 (opts);
12452 }
12453
12454 /* Print a hint with a suggestion for a core or architecture name that
12455 most closely resembles what the user passed in STR. ARCH is true if
12456 the user is asking for an architecture name. ARCH is false if the user
12457 is asking for a core name. */
12458
12459 static void
12460 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12461 {
12462 auto_vec<const char *> candidates;
12463 const struct processor *entry = arch ? all_architectures : all_cores;
12464 for (; entry->name != NULL; entry++)
12465 candidates.safe_push (entry->name);
12466
12467 #ifdef HAVE_LOCAL_CPU_DETECT
12468 /* Add also "native" as possible value. */
12469 if (arch)
12470 candidates.safe_push ("native");
12471 #endif
12472
12473 char *s;
12474 const char *hint = candidates_list_and_hint (str, s, candidates);
12475 if (hint)
12476 inform (input_location, "valid arguments are: %s;"
12477 " did you mean %qs?", s, hint);
12478 else
12479 inform (input_location, "valid arguments are: %s", s);
12480
12481 XDELETEVEC (s);
12482 }
12483
12484 /* Print a hint with a suggestion for a core name that most closely resembles
12485 what the user passed in STR. */
12486
12487 inline static void
12488 aarch64_print_hint_for_core (const char *str)
12489 {
12490 aarch64_print_hint_for_core_or_arch (str, false);
12491 }
12492
12493 /* Print a hint with a suggestion for an architecture name that most closely
12494 resembles what the user passed in STR. */
12495
12496 inline static void
12497 aarch64_print_hint_for_arch (const char *str)
12498 {
12499 aarch64_print_hint_for_core_or_arch (str, true);
12500 }
12501
12502
12503 /* Print a hint with a suggestion for an extension name
12504 that most closely resembles what the user passed in STR. */
12505
12506 void
12507 aarch64_print_hint_for_extensions (const std::string &str)
12508 {
12509 auto_vec<const char *> candidates;
12510 aarch64_get_all_extension_candidates (&candidates);
12511 char *s;
12512 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12513 if (hint)
12514 inform (input_location, "valid arguments are: %s;"
12515 " did you mean %qs?", s, hint);
12516 else
12517 inform (input_location, "valid arguments are: %s", s);
12518
12519 XDELETEVEC (s);
12520 }
12521
12522 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12523 specified in STR and throw errors if appropriate. Put the results, if
12524 they are valid, in RES and ISA_FLAGS. Return whether the option is
12525 valid. */
12526
12527 static bool
12528 aarch64_validate_mcpu (const char *str, const struct processor **res,
12529 uint64_t *isa_flags)
12530 {
12531 std::string invalid_extension;
12532 enum aarch64_parse_opt_result parse_res
12533 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12534
12535 if (parse_res == AARCH64_PARSE_OK)
12536 return true;
12537
12538 switch (parse_res)
12539 {
12540 case AARCH64_PARSE_MISSING_ARG:
12541 error ("missing cpu name in %<-mcpu=%s%>", str);
12542 break;
12543 case AARCH64_PARSE_INVALID_ARG:
12544 error ("unknown value %qs for %<-mcpu%>", str);
12545 aarch64_print_hint_for_core (str);
12546 break;
12547 case AARCH64_PARSE_INVALID_FEATURE:
12548 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12549 invalid_extension.c_str (), str);
12550 aarch64_print_hint_for_extensions (invalid_extension);
12551 break;
12552 default:
12553 gcc_unreachable ();
12554 }
12555
12556 return false;
12557 }
12558
12559 /* Parses CONST_STR for branch protection features specified in
12560 aarch64_branch_protect_types, and sets any global variables required. Returns
12561 the parsing result and assigns LAST_STR to the last processed token from
12562 CONST_STR so that it can be used for error reporting. */
12563
12564 static enum
12565 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12566 char** last_str)
12567 {
12568 char *str_root = xstrdup (const_str);
12569 char* token_save = NULL;
12570 char *str = strtok_r (str_root, "+", &token_save);
12571 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12572 if (!str)
12573 res = AARCH64_PARSE_MISSING_ARG;
12574 else
12575 {
12576 char *next_str = strtok_r (NULL, "+", &token_save);
12577 /* Reset the branch protection features to their defaults. */
12578 aarch64_handle_no_branch_protection (NULL, NULL);
12579
12580 while (str && res == AARCH64_PARSE_OK)
12581 {
12582 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12583 bool found = false;
12584 /* Search for this type. */
12585 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12586 {
12587 if (strcmp (str, type->name) == 0)
12588 {
12589 found = true;
12590 res = type->handler (str, next_str);
12591 str = next_str;
12592 next_str = strtok_r (NULL, "+", &token_save);
12593 }
12594 else
12595 type++;
12596 }
12597 if (found && res == AARCH64_PARSE_OK)
12598 {
12599 bool found_subtype = true;
12600 /* Loop through each token until we find one that isn't a
12601 subtype. */
12602 while (found_subtype)
12603 {
12604 found_subtype = false;
12605 const aarch64_branch_protect_type *subtype = type->subtypes;
12606 /* Search for the subtype. */
12607 while (str && subtype && subtype->name && !found_subtype
12608 && res == AARCH64_PARSE_OK)
12609 {
12610 if (strcmp (str, subtype->name) == 0)
12611 {
12612 found_subtype = true;
12613 res = subtype->handler (str, next_str);
12614 str = next_str;
12615 next_str = strtok_r (NULL, "+", &token_save);
12616 }
12617 else
12618 subtype++;
12619 }
12620 }
12621 }
12622 else if (!found)
12623 res = AARCH64_PARSE_INVALID_ARG;
12624 }
12625 }
12626 /* Copy the last processed token into the argument to pass it back.
12627 Used by option and attribute validation to print the offending token. */
12628 if (last_str)
12629 {
12630 if (str) strcpy (*last_str, str);
12631 else *last_str = NULL;
12632 }
12633 if (res == AARCH64_PARSE_OK)
12634 {
12635 /* If needed, alloc the accepted string then copy in const_str.
12636 Used by override_option_after_change_1. */
12637 if (!accepted_branch_protection_string)
12638 accepted_branch_protection_string = (char *) xmalloc (
12639 BRANCH_PROTECT_STR_MAX
12640 + 1);
12641 strncpy (accepted_branch_protection_string, const_str,
12642 BRANCH_PROTECT_STR_MAX + 1);
12643 /* Forcibly null-terminate. */
12644 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12645 }
12646 return res;
12647 }
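/* As an illustrative example of the grammar accepted above (the exact token
   names come from aarch64_branch_protect_types): a string such as
   "pac-ret+leaf+bti" is split on '+'; "pac-ret" and "bti" are matched as
   top-level types, while "leaf" is recognised as a subtype of the preceding
   "pac-ret" token.  */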
12648
12649 static bool
12650 aarch64_validate_mbranch_protection (const char *const_str)
12651 {
12652 char *str = (char *) xmalloc (strlen (const_str) + 1);
12653 enum aarch64_parse_opt_result res =
12654 aarch64_parse_branch_protection (const_str, &str);
12655 if (res == AARCH64_PARSE_INVALID_ARG)
12656 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12657 else if (res == AARCH64_PARSE_MISSING_ARG)
12658 error ("missing argument for %<-mbranch-protection=%>");
12659 free (str);
12660 return res == AARCH64_PARSE_OK;
12661 }
12662
12663 /* Validate a command-line -march option. Parse the arch and extensions
12664 (if any) specified in STR and throw errors if appropriate. Put the
12665 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12666 option is valid. */
12667
12668 static bool
12669 aarch64_validate_march (const char *str, const struct processor **res,
12670 uint64_t *isa_flags)
12671 {
12672 std::string invalid_extension;
12673 enum aarch64_parse_opt_result parse_res
12674 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12675
12676 if (parse_res == AARCH64_PARSE_OK)
12677 return true;
12678
12679 switch (parse_res)
12680 {
12681 case AARCH64_PARSE_MISSING_ARG:
12682 error ("missing arch name in %<-march=%s%>", str);
12683 break;
12684 case AARCH64_PARSE_INVALID_ARG:
12685 error ("unknown value %qs for %<-march%>", str);
12686 aarch64_print_hint_for_arch (str);
12687 break;
12688 case AARCH64_PARSE_INVALID_FEATURE:
12689 error ("invalid feature modifier %qs in %<-march=%s%>",
12690 invalid_extension.c_str (), str);
12691 aarch64_print_hint_for_extensions (invalid_extension);
12692 break;
12693 default:
12694 gcc_unreachable ();
12695 }
12696
12697 return false;
12698 }
12699
12700 /* Validate a command-line -mtune option. Parse the cpu
12701 specified in STR and throw errors if appropriate. Put the
12702 result, if it is valid, in RES. Return whether the option is
12703 valid. */
12704
12705 static bool
12706 aarch64_validate_mtune (const char *str, const struct processor **res)
12707 {
12708 enum aarch64_parse_opt_result parse_res
12709 = aarch64_parse_tune (str, res);
12710
12711 if (parse_res == AARCH64_PARSE_OK)
12712 return true;
12713
12714 switch (parse_res)
12715 {
12716 case AARCH64_PARSE_MISSING_ARG:
12717 error ("missing cpu name in %<-mtune=%s%>", str);
12718 break;
12719 case AARCH64_PARSE_INVALID_ARG:
12720 error ("unknown value %qs for %<-mtune%>", str);
12721 aarch64_print_hint_for_core (str);
12722 break;
12723 default:
12724 gcc_unreachable ();
12725 }
12726 return false;
12727 }
12728
12729 /* Return the CPU corresponding to the enum CPU.
12730 If it doesn't specify a cpu, return the default. */
12731
12732 static const struct processor *
12733 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12734 {
12735 if (cpu != aarch64_none)
12736 return &all_cores[cpu];
12737
12738 /* The & 0x3f is to extract the bottom 6 bits that encode the
12739 default cpu as selected by the --with-cpu GCC configure option
12740 in config.gcc.
12741 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12742 flags mechanism should be reworked to make it more sane. */
12743 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12744 }
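/* Note on the encoding used above: the bottom 6 bits of TARGET_CPU_DEFAULT
   index the configure-time default CPU in all_cores, while the remaining
   bits (extracted later as TARGET_CPU_DEFAULT >> 6) hold that CPU's default
   ISA flags.  */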
12745
12746 /* Return the architecture corresponding to the enum ARCH.
12747 If it doesn't specify a valid architecture, return the default. */
12748
12749 static const struct processor *
12750 aarch64_get_arch (enum aarch64_arch arch)
12751 {
12752 if (arch != aarch64_no_arch)
12753 return &all_architectures[arch];
12754
12755 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12756
12757 return &all_architectures[cpu->arch];
12758 }
12759
12760 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12761
12762 static poly_uint16
12763 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12764 {
12765 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12766 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12767 deciding which .md file patterns to use and when deciding whether
12768 something is a legitimate address or constant. */
12769 if (value == SVE_SCALABLE || value == SVE_128)
12770 return poly_uint16 (2, 2);
12771 else
12772 return (int) value / 64;
12773 }
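/* For example, -msve-vector-bits=256 yields a constant VG of 256 / 64 = 4,
   whereas both -msve-vector-bits=scalable and -msve-vector-bits=128 yield
   the runtime-variable poly_uint16 (2, 2).  */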
12774
12775 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12776 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12777 tuning structs. In particular it must set selected_tune and
12778 aarch64_isa_flags that define the available ISA features and tuning
12779 decisions. It must also set selected_arch as this will be used to
12780 output the .arch asm tags for each function. */
12781
12782 static void
12783 aarch64_override_options (void)
12784 {
12785 uint64_t cpu_isa = 0;
12786 uint64_t arch_isa = 0;
12787 aarch64_isa_flags = 0;
12788
12789 bool valid_cpu = true;
12790 bool valid_tune = true;
12791 bool valid_arch = true;
12792
12793 selected_cpu = NULL;
12794 selected_arch = NULL;
12795 selected_tune = NULL;
12796
12797 if (aarch64_branch_protection_string)
12798 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12799
12800 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12801 If either of -march or -mtune is given, they override their
12802 respective component of -mcpu. */
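/* Illustrative example: "-mcpu=cortex-a57" selects both the Cortex-A57
   tuning and the architecture that core implements; adding an explicit
   "-march=..." replaces only the architecture/ISA part, and "-mtune=..."
   replaces only the tuning part.  */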
12803 if (aarch64_cpu_string)
12804 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12805 &cpu_isa);
12806
12807 if (aarch64_arch_string)
12808 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12809 &arch_isa);
12810
12811 if (aarch64_tune_string)
12812 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12813
12814 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12815 SUBTARGET_OVERRIDE_OPTIONS;
12816 #endif
12817
12818 /* If the user did not specify a processor, choose the default
12819 one for them. This will be the CPU set during configuration using
12820 --with-cpu, otherwise it is "generic". */
12821 if (!selected_cpu)
12822 {
12823 if (selected_arch)
12824 {
12825 selected_cpu = &all_cores[selected_arch->ident];
12826 aarch64_isa_flags = arch_isa;
12827 explicit_arch = selected_arch->arch;
12828 }
12829 else
12830 {
12831 /* Get default configure-time CPU. */
12832 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12833 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12834 }
12835
12836 if (selected_tune)
12837 explicit_tune_core = selected_tune->ident;
12838 }
12839 /* If both -mcpu and -march are specified check that they are architecturally
12840 compatible, warn if they're not and prefer the -march ISA flags. */
12841 else if (selected_arch)
12842 {
12843 if (selected_arch->arch != selected_cpu->arch)
12844 {
12845 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12846 all_architectures[selected_cpu->arch].name,
12847 selected_arch->name);
12848 }
12849 aarch64_isa_flags = arch_isa;
12850 explicit_arch = selected_arch->arch;
12851 explicit_tune_core = selected_tune ? selected_tune->ident
12852 : selected_cpu->ident;
12853 }
12854 else
12855 {
12856 /* -mcpu but no -march. */
12857 aarch64_isa_flags = cpu_isa;
12858 explicit_tune_core = selected_tune ? selected_tune->ident
12859 : selected_cpu->ident;
12860 gcc_assert (selected_cpu);
12861 selected_arch = &all_architectures[selected_cpu->arch];
12862 explicit_arch = selected_arch->arch;
12863 }
12864
12865 /* Set the arch as well, as we will need it when outputting
12866 the .arch directive in assembly. */
12867 if (!selected_arch)
12868 {
12869 gcc_assert (selected_cpu);
12870 selected_arch = &all_architectures[selected_cpu->arch];
12871 }
12872
12873 if (!selected_tune)
12874 selected_tune = selected_cpu;
12875
12876 if (aarch64_enable_bti == 2)
12877 {
12878 #ifdef TARGET_ENABLE_BTI
12879 aarch64_enable_bti = 1;
12880 #else
12881 aarch64_enable_bti = 0;
12882 #endif
12883 }
12884
12885 /* Return address signing is currently not supported for ILP32 targets. For
12886 LP64 targets use the configured option in the absence of a command-line
12887 option for -mbranch-protection. */
12888 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12889 {
12890 #ifdef TARGET_ENABLE_PAC_RET
12891 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12892 #else
12893 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12894 #endif
12895 }
12896
12897 #ifndef HAVE_AS_MABI_OPTION
12898 /* The compiler may have been configured with 2.23.* binutils, which does
12899 not have support for ILP32. */
12900 if (TARGET_ILP32)
12901 error ("assembler does not support %<-mabi=ilp32%>");
12902 #endif
12903
12904 /* Convert -msve-vector-bits to a VG count. */
12905 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12906
12907 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12908 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12909
12910 /* Make sure we properly set up the explicit options. */
12911 if ((aarch64_cpu_string && valid_cpu)
12912 || (aarch64_tune_string && valid_tune))
12913 gcc_assert (explicit_tune_core != aarch64_none);
12914
12915 if ((aarch64_cpu_string && valid_cpu)
12916 || (aarch64_arch_string && valid_arch))
12917 gcc_assert (explicit_arch != aarch64_no_arch);
12918
12919 /* The pass to insert speculation tracking runs before
12920 shrink-wrapping and the latter does not know how to update the
12921 tracking status. So disable it in this case. */
12922 if (aarch64_track_speculation)
12923 flag_shrink_wrap = 0;
12924
12925 aarch64_override_options_internal (&global_options);
12926
12927 /* Save these options as the default ones in case we push and pop them later
12928 while processing functions with potential target attributes. */
12929 target_option_default_node = target_option_current_node
12930 = build_target_option_node (&global_options);
12931 }
12932
12933 /* Implement targetm.override_options_after_change. */
12934
12935 static void
12936 aarch64_override_options_after_change (void)
12937 {
12938 aarch64_override_options_after_change_1 (&global_options);
12939 }
12940
12941 static struct machine_function *
12942 aarch64_init_machine_status (void)
12943 {
12944 struct machine_function *machine;
12945 machine = ggc_cleared_alloc<machine_function> ();
12946 return machine;
12947 }
12948
12949 void
12950 aarch64_init_expanders (void)
12951 {
12952 init_machine_status = aarch64_init_machine_status;
12953 }
12954
12955 /* A checking mechanism for the implementation of the various code models. */
12956 static void
12957 initialize_aarch64_code_model (struct gcc_options *opts)
12958 {
12959 if (opts->x_flag_pic)
12960 {
12961 switch (opts->x_aarch64_cmodel_var)
12962 {
12963 case AARCH64_CMODEL_TINY:
12964 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12965 break;
12966 case AARCH64_CMODEL_SMALL:
12967 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12968 aarch64_cmodel = (flag_pic == 2
12969 ? AARCH64_CMODEL_SMALL_PIC
12970 : AARCH64_CMODEL_SMALL_SPIC);
12971 #else
12972 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12973 #endif
12974 break;
12975 case AARCH64_CMODEL_LARGE:
12976 sorry ("code model %qs with %<-f%s%>", "large",
12977 opts->x_flag_pic > 1 ? "PIC" : "pic");
12978 break;
12979 default:
12980 gcc_unreachable ();
12981 }
12982 }
12983 else
12984 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12985 }
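/* For example, with -mcmodel=small: -fpic selects AARCH64_CMODEL_SMALL_SPIC
   (the 28K-range GOT variant used by aarch64_classify_symbol) and -fPIC
   selects AARCH64_CMODEL_SMALL_PIC (the 4G-range variant), provided the
   assembler supports the small PIC relocations; otherwise both map to
   AARCH64_CMODEL_SMALL_PIC.  */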
12986
12987 /* Implement TARGET_OPTION_SAVE. */
12988
12989 static void
12990 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12991 {
12992 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12993 ptr->x_aarch64_branch_protection_string
12994 = opts->x_aarch64_branch_protection_string;
12995 }
12996
12997 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12998 using the information saved in PTR. */
12999
13000 static void
13001 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13002 {
13003 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13004 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13005 opts->x_explicit_arch = ptr->x_explicit_arch;
13006 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13007 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13008 opts->x_aarch64_branch_protection_string
13009 = ptr->x_aarch64_branch_protection_string;
13010 if (opts->x_aarch64_branch_protection_string)
13011 {
13012 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13013 NULL);
13014 }
13015
13016 aarch64_override_options_internal (opts);
13017 }
13018
13019 /* Implement TARGET_OPTION_PRINT. */
13020
13021 static void
13022 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13023 {
13024 const struct processor *cpu
13025 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13026 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13027 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13028 std::string extension
13029 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13030
13031 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13032 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13033 arch->name, extension.c_str ());
13034 }
13035
13036 static GTY(()) tree aarch64_previous_fndecl;
13037
13038 void
13039 aarch64_reset_previous_fndecl (void)
13040 {
13041 aarch64_previous_fndecl = NULL;
13042 }
13043
13044 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13045 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13046 make sure optab availability predicates are recomputed when necessary. */
13047
13048 void
13049 aarch64_save_restore_target_globals (tree new_tree)
13050 {
13051 if (TREE_TARGET_GLOBALS (new_tree))
13052 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13053 else if (new_tree == target_option_default_node)
13054 restore_target_globals (&default_target_globals);
13055 else
13056 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13057 }
13058
13059 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13060 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13061 of the function, if such exists. This function may be called multiple
13062 times on a single function so use aarch64_previous_fndecl to avoid
13063 setting up identical state. */
13064
13065 static void
13066 aarch64_set_current_function (tree fndecl)
13067 {
13068 if (!fndecl || fndecl == aarch64_previous_fndecl)
13069 return;
13070
13071 tree old_tree = (aarch64_previous_fndecl
13072 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13073 : NULL_TREE);
13074
13075 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13076
13077 /* If current function has no attributes but the previous one did,
13078 use the default node. */
13079 if (!new_tree && old_tree)
13080 new_tree = target_option_default_node;
13081
13082 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13083 the default have been handled by aarch64_save_restore_target_globals from
13084 aarch64_pragma_target_parse. */
13085 if (old_tree == new_tree)
13086 return;
13087
13088 aarch64_previous_fndecl = fndecl;
13089
13090 /* First set the target options. */
13091 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13092
13093 aarch64_save_restore_target_globals (new_tree);
13094 }
13095
13096 /* Enum describing the various ways we can handle attributes.
13097 In many cases we can reuse the generic option handling machinery. */
13098
13099 enum aarch64_attr_opt_type
13100 {
13101 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13102 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13103 aarch64_attr_enum, /* Attribute sets an enum variable. */
13104 aarch64_attr_custom /* Attribute requires a custom handling function. */
13105 };
13106
13107 /* All the information needed to handle a target attribute.
13108 NAME is the name of the attribute.
13109 ATTR_TYPE specifies the type of behavior of the attribute as described
13110 in the definition of enum aarch64_attr_opt_type.
13111 ALLOW_NEG is true if the attribute supports a "no-" form.
13112 HANDLER is the function that takes the attribute string as an argument.
13113 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13114 OPT_NUM is the enum specifying the option that the attribute modifies.
13115 This is needed for attributes that mirror the behavior of a command-line
13116 option, that is, one whose ATTR_TYPE is aarch64_attr_mask, aarch64_attr_bool or
13117 aarch64_attr_enum. */
13118
13119 struct aarch64_attribute_info
13120 {
13121 const char *name;
13122 enum aarch64_attr_opt_type attr_type;
13123 bool allow_neg;
13124 bool (*handler) (const char *);
13125 enum opt_code opt_num;
13126 };
13127
13128 /* Handle the ARCH_STR argument to the arch= target attribute. */
13129
13130 static bool
13131 aarch64_handle_attr_arch (const char *str)
13132 {
13133 const struct processor *tmp_arch = NULL;
13134 std::string invalid_extension;
13135 enum aarch64_parse_opt_result parse_res
13136 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13137
13138 if (parse_res == AARCH64_PARSE_OK)
13139 {
13140 gcc_assert (tmp_arch);
13141 selected_arch = tmp_arch;
13142 explicit_arch = selected_arch->arch;
13143 return true;
13144 }
13145
13146 switch (parse_res)
13147 {
13148 case AARCH64_PARSE_MISSING_ARG:
13149 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13150 break;
13151 case AARCH64_PARSE_INVALID_ARG:
13152 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13153 aarch64_print_hint_for_arch (str);
13154 break;
13155 case AARCH64_PARSE_INVALID_FEATURE:
13156 error ("invalid feature modifier %s of value (\"%s\") in "
13157 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13158 aarch64_print_hint_for_extensions (invalid_extension);
13159 break;
13160 default:
13161 gcc_unreachable ();
13162 }
13163
13164 return false;
13165 }
13166
13167 /* Handle the argument CPU_STR to the cpu= target attribute. */
13168
13169 static bool
13170 aarch64_handle_attr_cpu (const char *str)
13171 {
13172 const struct processor *tmp_cpu = NULL;
13173 std::string invalid_extension;
13174 enum aarch64_parse_opt_result parse_res
13175 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13176
13177 if (parse_res == AARCH64_PARSE_OK)
13178 {
13179 gcc_assert (tmp_cpu);
13180 selected_tune = tmp_cpu;
13181 explicit_tune_core = selected_tune->ident;
13182
13183 selected_arch = &all_architectures[tmp_cpu->arch];
13184 explicit_arch = selected_arch->arch;
13185 return true;
13186 }
13187
13188 switch (parse_res)
13189 {
13190 case AARCH64_PARSE_MISSING_ARG:
13191 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13192 break;
13193 case AARCH64_PARSE_INVALID_ARG:
13194 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13195 aarch64_print_hint_for_core (str);
13196 break;
13197 case AARCH64_PARSE_INVALID_FEATURE:
13198 error ("invalid feature modifier %s of value (\"%s\") in "
13199 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13200 aarch64_print_hint_for_extensions (invalid_extension);
13201 break;
13202 default:
13203 gcc_unreachable ();
13204 }
13205
13206 return false;
13207 }
13208
13209 /* Handle the argument STR to the branch-protection= attribute. */
13210
13211 static bool
13212 aarch64_handle_attr_branch_protection (const char* str)
13213 {
13214 char *err_str = (char *) xmalloc (strlen (str) + 1);
13215 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13216 &err_str);
13217 bool success = false;
13218 switch (res)
13219 {
13220 case AARCH64_PARSE_MISSING_ARG:
13221 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13222 " attribute");
13223 break;
13224 case AARCH64_PARSE_INVALID_ARG:
13225 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13226 "=\")%> pragma or attribute", err_str);
13227 break;
13228 case AARCH64_PARSE_OK:
13229 success = true;
13230 /* Fall through. */
13231 case AARCH64_PARSE_INVALID_FEATURE:
13232 break;
13233 default:
13234 gcc_unreachable ();
13235 }
13236 free (err_str);
13237 return success;
13238 }
13239
13240 /* Handle the argument STR to the tune= target attribute. */
13241
13242 static bool
13243 aarch64_handle_attr_tune (const char *str)
13244 {
13245 const struct processor *tmp_tune = NULL;
13246 enum aarch64_parse_opt_result parse_res
13247 = aarch64_parse_tune (str, &tmp_tune);
13248
13249 if (parse_res == AARCH64_PARSE_OK)
13250 {
13251 gcc_assert (tmp_tune);
13252 selected_tune = tmp_tune;
13253 explicit_tune_core = selected_tune->ident;
13254 return true;
13255 }
13256
13257 switch (parse_res)
13258 {
13259 case AARCH64_PARSE_INVALID_ARG:
13260 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13261 aarch64_print_hint_for_core (str);
13262 break;
13263 default:
13264 gcc_unreachable ();
13265 }
13266
13267 return false;
13268 }
13269
13270 /* Parse an architecture extensions target attribute string specified in STR.
13271 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13272 if successful. Update aarch64_isa_flags to reflect the ISA features
13273 modified. */
13274
13275 static bool
13276 aarch64_handle_attr_isa_flags (char *str)
13277 {
13278 enum aarch64_parse_opt_result parse_res;
13279 uint64_t isa_flags = aarch64_isa_flags;
13280
13281 /* We allow "+nothing" in the beginning to clear out all architectural
13282 features if the user wants to handpick specific features. */
13283 if (strncmp ("+nothing", str, 8) == 0)
13284 {
13285 isa_flags = 0;
13286 str += 8;
13287 }
13288
13289 std::string invalid_extension;
13290 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13291
13292 if (parse_res == AARCH64_PARSE_OK)
13293 {
13294 aarch64_isa_flags = isa_flags;
13295 return true;
13296 }
13297
13298 switch (parse_res)
13299 {
13300 case AARCH64_PARSE_MISSING_ARG:
13301 error ("missing value in %<target()%> pragma or attribute");
13302 break;
13303
13304 case AARCH64_PARSE_INVALID_FEATURE:
13305 error ("invalid feature modifier %s of value (\"%s\") in "
13306 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13307 break;
13308
13309 default:
13310 gcc_unreachable ();
13311 }
13312
13313 return false;
13314 }
13315
13316 /* The target attributes that we support. On top of these we also support just
13317 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13318 handled explicitly in aarch64_process_one_target_attr. */
13319
13320 static const struct aarch64_attribute_info aarch64_attributes[] =
13321 {
13322 { "general-regs-only", aarch64_attr_mask, false, NULL,
13323 OPT_mgeneral_regs_only },
13324 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13325 OPT_mfix_cortex_a53_835769 },
13326 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13327 OPT_mfix_cortex_a53_843419 },
13328 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13329 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13330 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13331 OPT_momit_leaf_frame_pointer },
13332 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13333 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13334 OPT_march_ },
13335 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13336 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13337 OPT_mtune_ },
13338 { "branch-protection", aarch64_attr_custom, false,
13339 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13340 { "sign-return-address", aarch64_attr_enum, false, NULL,
13341 OPT_msign_return_address_ },
13342 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13343 };
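/* Illustrative example of how the table above is used: an attribute such as
   __attribute__ ((target ("arch=armv8-a+crc,no-strict-align,tune=cortex-a57")))
   is split on ',' by aarch64_process_target_attr; "arch=..." and "tune=..."
   go through their custom handlers, while "no-strict-align" uses the negated
   form permitted by the "strict-align" table entry.  */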
13344
13345 /* Parse ARG_STR which contains the definition of one target attribute.
13346 Show appropriate errors if any or return true if the attribute is valid. */
13347
13348 static bool
13349 aarch64_process_one_target_attr (char *arg_str)
13350 {
13351 bool invert = false;
13352
13353 size_t len = strlen (arg_str);
13354
13355 if (len == 0)
13356 {
13357 error ("malformed %<target()%> pragma or attribute");
13358 return false;
13359 }
13360
13361 char *str_to_check = (char *) alloca (len + 1);
13362 strcpy (str_to_check, arg_str);
13363
13364 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13365 It is easier to detect and handle it explicitly here rather than going
13366 through the machinery for the rest of the target attributes in this
13367 function. */
13368 if (*str_to_check == '+')
13369 return aarch64_handle_attr_isa_flags (str_to_check);
13370
13371 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13372 {
13373 invert = true;
13374 str_to_check += 3;
13375 }
13376 char *arg = strchr (str_to_check, '=');
13377
13378 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13379 and point ARG to "foo". */
13380 if (arg)
13381 {
13382 *arg = '\0';
13383 arg++;
13384 }
13385 const struct aarch64_attribute_info *p_attr;
13386 bool found = false;
13387 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13388 {
13389 /* If the names don't match up, or the user has given an argument
13390 to an attribute that doesn't accept one, or didn't give an argument
13391 to an attribute that expects one, fail to match. */
13392 if (strcmp (str_to_check, p_attr->name) != 0)
13393 continue;
13394
13395 found = true;
13396 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13397 || p_attr->attr_type == aarch64_attr_enum;
13398
13399 if (attr_need_arg_p ^ (arg != NULL))
13400 {
13401 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13402 return false;
13403 }
13404
13405 /* If the name matches but the attribute does not allow "no-" versions
13406 then we can't match. */
13407 if (invert && !p_attr->allow_neg)
13408 {
13409 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13410 return false;
13411 }
13412
13413 switch (p_attr->attr_type)
13414 {
13415 /* Has a custom handler registered.
13416 For example, cpu=, arch=, tune=. */
13417 case aarch64_attr_custom:
13418 gcc_assert (p_attr->handler);
13419 if (!p_attr->handler (arg))
13420 return false;
13421 break;
13422
13423 /* Either set or unset a boolean option. */
13424 case aarch64_attr_bool:
13425 {
13426 struct cl_decoded_option decoded;
13427
13428 generate_option (p_attr->opt_num, NULL, !invert,
13429 CL_TARGET, &decoded);
13430 aarch64_handle_option (&global_options, &global_options_set,
13431 &decoded, input_location);
13432 break;
13433 }
13434 /* Set or unset a bit in the target_flags. aarch64_handle_option
13435 should know what mask to apply given the option number. */
13436 case aarch64_attr_mask:
13437 {
13438 struct cl_decoded_option decoded;
13439 /* We only need to specify the option number.
13440 aarch64_handle_option will know which mask to apply. */
13441 decoded.opt_index = p_attr->opt_num;
13442 decoded.value = !invert;
13443 aarch64_handle_option (&global_options, &global_options_set,
13444 &decoded, input_location);
13445 break;
13446 }
13447 /* Use the option setting machinery to set an option to an enum. */
13448 case aarch64_attr_enum:
13449 {
13450 gcc_assert (arg);
13451 bool valid;
13452 int value;
13453 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13454 &value, CL_TARGET);
13455 if (valid)
13456 {
13457 set_option (&global_options, NULL, p_attr->opt_num, value,
13458 NULL, DK_UNSPECIFIED, input_location,
13459 global_dc);
13460 }
13461 else
13462 {
13463 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13464 }
13465 break;
13466 }
13467 default:
13468 gcc_unreachable ();
13469 }
13470 }
13471
13472 /* If we reached here we either have found an attribute and validated
13473 it or didn't match any. If we matched an attribute but its arguments
13474 were malformed we will have returned false already. */
13475 return found;
13476 }
13477
13478 /* Count how many times the character C appears in
13479 NULL-terminated string STR. */
13480
13481 static unsigned int
13482 num_occurences_in_str (char c, char *str)
13483 {
13484 unsigned int res = 0;
13485 while (*str != '\0')
13486 {
13487 if (*str == c)
13488 res++;
13489
13490 str++;
13491 }
13492
13493 return res;
13494 }
13495
13496 /* Parse the tree in ARGS that contains the target attribute information
13497 and update the global target options space. */
13498
13499 bool
13500 aarch64_process_target_attr (tree args)
13501 {
13502 if (TREE_CODE (args) == TREE_LIST)
13503 {
13504 do
13505 {
13506 tree head = TREE_VALUE (args);
13507 if (head)
13508 {
13509 if (!aarch64_process_target_attr (head))
13510 return false;
13511 }
13512 args = TREE_CHAIN (args);
13513 } while (args);
13514
13515 return true;
13516 }
13517
13518 if (TREE_CODE (args) != STRING_CST)
13519 {
13520 error ("attribute %<target%> argument not a string");
13521 return false;
13522 }
13523
13524 size_t len = strlen (TREE_STRING_POINTER (args));
13525 char *str_to_check = (char *) alloca (len + 1);
13526 strcpy (str_to_check, TREE_STRING_POINTER (args));
13527
13528 if (len == 0)
13529 {
13530 error ("malformed %<target()%> pragma or attribute");
13531 return false;
13532 }
13533
13534 /* Used to catch empty strings between commas, i.e.
13535 attribute ((target ("attr1,,attr2"))). */
13536 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13537
13538 /* Handle multiple target attributes separated by ','. */
13539 char *token = strtok_r (str_to_check, ",", &str_to_check);
13540
13541 unsigned int num_attrs = 0;
13542 while (token)
13543 {
13544 num_attrs++;
13545 if (!aarch64_process_one_target_attr (token))
13546 {
13547 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13548 return false;
13549 }
13550
13551 token = strtok_r (NULL, ",", &str_to_check);
13552 }
13553
13554 if (num_attrs != num_commas + 1)
13555 {
13556 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13557 return false;
13558 }
13559
13560 return true;
13561 }
13562
13563 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13564 process attribute ((target ("..."))). */
13565
13566 static bool
13567 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13568 {
13569 struct cl_target_option cur_target;
13570 bool ret;
13571 tree old_optimize;
13572 tree new_target, new_optimize;
13573 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13574
13575 /* If what we're processing is the current pragma string then the
13576 target option node is already stored in target_option_current_node
13577 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13578 having to re-parse the string. This is especially useful to keep
13579 arm_neon.h compile times down since that header contains a lot
13580 of intrinsics enclosed in pragmas. */
13581 if (!existing_target && args == current_target_pragma)
13582 {
13583 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13584 return true;
13585 }
13586 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13587
13588 old_optimize = build_optimization_node (&global_options);
13589 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13590
13591 /* If the function changed the optimization levels as well as setting
13592 target options, start with the optimizations specified. */
13593 if (func_optimize && func_optimize != old_optimize)
13594 cl_optimization_restore (&global_options,
13595 TREE_OPTIMIZATION (func_optimize));
13596
13597 /* Save the current target options to restore at the end. */
13598 cl_target_option_save (&cur_target, &global_options);
13599
13600 /* If fndecl already has some target attributes applied to it, unpack
13601 them so that we add this attribute on top of them, rather than
13602 overwriting them. */
13603 if (existing_target)
13604 {
13605 struct cl_target_option *existing_options
13606 = TREE_TARGET_OPTION (existing_target);
13607
13608 if (existing_options)
13609 cl_target_option_restore (&global_options, existing_options);
13610 }
13611 else
13612 cl_target_option_restore (&global_options,
13613 TREE_TARGET_OPTION (target_option_current_node));
13614
13615 ret = aarch64_process_target_attr (args);
13616
13617 /* Set up any additional state. */
13618 if (ret)
13619 {
13620 aarch64_override_options_internal (&global_options);
13621 /* Initialize SIMD builtins if we haven't already.
13622 Set current_target_pragma to NULL for the duration so that
13623 the builtin initialization code doesn't try to tag the functions
13624 being built with the attributes specified by any current pragma, thus
13625 going into an infinite recursion. */
13626 if (TARGET_SIMD)
13627 {
13628 tree saved_current_target_pragma = current_target_pragma;
13629 current_target_pragma = NULL;
13630 aarch64_init_simd_builtins ();
13631 current_target_pragma = saved_current_target_pragma;
13632 }
13633 new_target = build_target_option_node (&global_options);
13634 }
13635 else
13636 new_target = NULL;
13637
13638 new_optimize = build_optimization_node (&global_options);
13639
13640 if (fndecl && ret)
13641 {
13642 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13643
13644 if (old_optimize != new_optimize)
13645 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13646 }
13647
13648 cl_target_option_restore (&global_options, &cur_target);
13649
13650 if (old_optimize != new_optimize)
13651 cl_optimization_restore (&global_options,
13652 TREE_OPTIMIZATION (old_optimize));
13653 return ret;
13654 }
13655
13656 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13657 tri-bool options (yes, no, don't care) and the default value is
13658 DEF, determine whether to reject inlining. */
13659
13660 static bool
13661 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13662 int dont_care, int def)
13663 {
13664 /* If the callee doesn't care, always allow inlining. */
13665 if (callee == dont_care)
13666 return true;
13667
13668 /* If the caller doesn't care, always allow inlining. */
13669 if (caller == dont_care)
13670 return true;
13671
13672 /* Otherwise, allow inlining if either the callee and caller values
13673 agree, or if the callee is using the default value. */
13674 return (callee == caller || callee == def);
13675 }
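/* For example, with DONT_CARE == 2 and DEF == 1 (the
   -momit-leaf-frame-pointer case below): caller == 0, callee == 2 is
   allowed; caller == 0, callee == 1 is allowed because the callee matches
   the default; caller == 1, callee == 0 is rejected.  */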
13676
13677 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13678 to inline CALLEE into CALLER based on target-specific info.
13679 Make sure that the caller and callee have compatible architectural
13680 features. Then go through the other possible target attributes
13681 and see if they can block inlining. Try not to reject always_inline
13682 callees unless they are incompatible architecturally. */
13683
13684 static bool
13685 aarch64_can_inline_p (tree caller, tree callee)
13686 {
13687 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13688 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13689
13690 struct cl_target_option *caller_opts
13691 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13692 : target_option_default_node);
13693
13694 struct cl_target_option *callee_opts
13695 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13696 : target_option_default_node);
13697
13698 /* Callee's ISA flags should be a subset of the caller's. */
13699 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13700 != callee_opts->x_aarch64_isa_flags)
13701 return false;
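/* For instance, a callee compiled with an extra ISA bit such as +crc cannot
   be inlined into a caller built without that feature, since the caller's
   flags would not cover the callee's.  */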
13702
13703 /* Allow non-strict aligned functions inlining into strict
13704 aligned ones. */
13705 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13706 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13707 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13708 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13709 return false;
13710
13711 bool always_inline = lookup_attribute ("always_inline",
13712 DECL_ATTRIBUTES (callee));
13713
13714 /* If the architectural features match up and the callee is always_inline
13715 then the other attributes don't matter. */
13716 if (always_inline)
13717 return true;
13718
13719 if (caller_opts->x_aarch64_cmodel_var
13720 != callee_opts->x_aarch64_cmodel_var)
13721 return false;
13722
13723 if (caller_opts->x_aarch64_tls_dialect
13724 != callee_opts->x_aarch64_tls_dialect)
13725 return false;
13726
13727 /* Honour explicit requests to workaround errata. */
13728 if (!aarch64_tribools_ok_for_inlining_p (
13729 caller_opts->x_aarch64_fix_a53_err835769,
13730 callee_opts->x_aarch64_fix_a53_err835769,
13731 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13732 return false;
13733
13734 if (!aarch64_tribools_ok_for_inlining_p (
13735 caller_opts->x_aarch64_fix_a53_err843419,
13736 callee_opts->x_aarch64_fix_a53_err843419,
13737 2, TARGET_FIX_ERR_A53_843419))
13738 return false;
13739
13740 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13741 caller and callee and they don't match up, reject inlining. */
13742 if (!aarch64_tribools_ok_for_inlining_p (
13743 caller_opts->x_flag_omit_leaf_frame_pointer,
13744 callee_opts->x_flag_omit_leaf_frame_pointer,
13745 2, 1))
13746 return false;
13747
13748 /* If the callee has specific tuning overrides, respect them. */
13749 if (callee_opts->x_aarch64_override_tune_string != NULL
13750 && caller_opts->x_aarch64_override_tune_string == NULL)
13751 return false;
13752
13753 /* If the user specified tuning override strings for the
13754 caller and callee and they don't match up, reject inlining.
13755 We just do a string compare here, we don't analyze the meaning
13756 of the string, as it would be too costly for little gain. */
13757 if (callee_opts->x_aarch64_override_tune_string
13758 && caller_opts->x_aarch64_override_tune_string
13759 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13760 caller_opts->x_aarch64_override_tune_string) != 0))
13761 return false;
13762
13763 return true;
13764 }
13765
13766 /* Return true if SYMBOL_REF X binds locally. */
13767
13768 static bool
13769 aarch64_symbol_binds_local_p (const_rtx x)
13770 {
13771 return (SYMBOL_REF_DECL (x)
13772 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13773 : SYMBOL_REF_LOCAL_P (x));
13774 }
13775
13776 /* Return true if SYMBOL_REF X is thread local */
13777 static bool
13778 aarch64_tls_symbol_p (rtx x)
13779 {
13780 if (! TARGET_HAVE_TLS)
13781 return false;
13782
13783 if (GET_CODE (x) != SYMBOL_REF)
13784 return false;
13785
13786 return SYMBOL_REF_TLS_MODEL (x) != 0;
13787 }
13788
13789 /* Classify a TLS symbol into one of the TLS kinds. */
13790 enum aarch64_symbol_type
13791 aarch64_classify_tls_symbol (rtx x)
13792 {
13793 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13794
13795 switch (tls_kind)
13796 {
13797 case TLS_MODEL_GLOBAL_DYNAMIC:
13798 case TLS_MODEL_LOCAL_DYNAMIC:
13799 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13800
13801 case TLS_MODEL_INITIAL_EXEC:
13802 switch (aarch64_cmodel)
13803 {
13804 case AARCH64_CMODEL_TINY:
13805 case AARCH64_CMODEL_TINY_PIC:
13806 return SYMBOL_TINY_TLSIE;
13807 default:
13808 return SYMBOL_SMALL_TLSIE;
13809 }
13810
13811 case TLS_MODEL_LOCAL_EXEC:
13812 if (aarch64_tls_size == 12)
13813 return SYMBOL_TLSLE12;
13814 else if (aarch64_tls_size == 24)
13815 return SYMBOL_TLSLE24;
13816 else if (aarch64_tls_size == 32)
13817 return SYMBOL_TLSLE32;
13818 else if (aarch64_tls_size == 48)
13819 return SYMBOL_TLSLE48;
13820 else
13821 gcc_unreachable ();
13822
13823 case TLS_MODEL_EMULATED:
13824 case TLS_MODEL_NONE:
13825 return SYMBOL_FORCE_TO_MEM;
13826
13827 default:
13828 gcc_unreachable ();
13829 }
13830 }
13831
13832 /* Return the correct method for accessing X + OFFSET, where X is either
13833 a SYMBOL_REF or LABEL_REF. */
13834
13835 enum aarch64_symbol_type
13836 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13837 {
13838 if (GET_CODE (x) == LABEL_REF)
13839 {
13840 switch (aarch64_cmodel)
13841 {
13842 case AARCH64_CMODEL_LARGE:
13843 return SYMBOL_FORCE_TO_MEM;
13844
13845 case AARCH64_CMODEL_TINY_PIC:
13846 case AARCH64_CMODEL_TINY:
13847 return SYMBOL_TINY_ABSOLUTE;
13848
13849 case AARCH64_CMODEL_SMALL_SPIC:
13850 case AARCH64_CMODEL_SMALL_PIC:
13851 case AARCH64_CMODEL_SMALL:
13852 return SYMBOL_SMALL_ABSOLUTE;
13853
13854 default:
13855 gcc_unreachable ();
13856 }
13857 }
13858
13859 if (GET_CODE (x) == SYMBOL_REF)
13860 {
13861 if (aarch64_tls_symbol_p (x))
13862 return aarch64_classify_tls_symbol (x);
13863
13864 switch (aarch64_cmodel)
13865 {
13866 case AARCH64_CMODEL_TINY:
13867 /* When we retrieve symbol + offset address, we have to make sure
13868 the offset does not cause overflow of the final address. But
13869 we have no way of knowing the address of symbol at compile time
13870 so we can't accurately say if the distance between the PC and
13871 symbol + offset is outside the addressable range of +/-1M in the
13872 TINY code model. So we rely on images not being greater than
13873 1M, cap the offset at 1M, and require anything beyond 1M to
13874 be loaded using an alternative mechanism. Furthermore, if the
13875 symbol is a weak reference to something that isn't known to
13876 resolve to a symbol in this module, then force to memory. */
13877 if ((SYMBOL_REF_WEAK (x)
13878 && !aarch64_symbol_binds_local_p (x))
13879 || !IN_RANGE (offset, -1048575, 1048575))
13880 return SYMBOL_FORCE_TO_MEM;
13881 return SYMBOL_TINY_ABSOLUTE;
13882
13883 case AARCH64_CMODEL_SMALL:
13884 /* Same reasoning as the tiny code model, but the offset cap here is
13885 4G. */
13886 if ((SYMBOL_REF_WEAK (x)
13887 && !aarch64_symbol_binds_local_p (x))
13888 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13889 HOST_WIDE_INT_C (4294967264)))
13890 return SYMBOL_FORCE_TO_MEM;
13891 return SYMBOL_SMALL_ABSOLUTE;
13892
13893 case AARCH64_CMODEL_TINY_PIC:
13894 if (!aarch64_symbol_binds_local_p (x))
13895 return SYMBOL_TINY_GOT;
13896 return SYMBOL_TINY_ABSOLUTE;
13897
13898 case AARCH64_CMODEL_SMALL_SPIC:
13899 case AARCH64_CMODEL_SMALL_PIC:
13900 if (!aarch64_symbol_binds_local_p (x))
13901 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13902 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13903 return SYMBOL_SMALL_ABSOLUTE;
13904
13905 case AARCH64_CMODEL_LARGE:
13906 /* This is alright even in PIC code as the constant
13907 pool reference is always PC relative and within
13908 the same translation unit. */
13909 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13910 return SYMBOL_SMALL_ABSOLUTE;
13911 else
13912 return SYMBOL_FORCE_TO_MEM;
13913
13914 default:
13915 gcc_unreachable ();
13916 }
13917 }
13918
13919 /* By default push everything into the constant pool. */
13920 return SYMBOL_FORCE_TO_MEM;
13921 }
13922
13923 bool
13924 aarch64_constant_address_p (rtx x)
13925 {
13926 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13927 }
13928
13929 bool
13930 aarch64_legitimate_pic_operand_p (rtx x)
13931 {
13932 if (GET_CODE (x) == SYMBOL_REF
13933 || (GET_CODE (x) == CONST
13934 && GET_CODE (XEXP (x, 0)) == PLUS
13935 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13936 return false;
13937
13938 return true;
13939 }
13940
13941 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13942 that should be rematerialized rather than spilled. */
13943
13944 static bool
13945 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13946 {
13947 /* Support CSE and rematerialization of common constants. */
13948 if (CONST_INT_P (x)
13949 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13950 || GET_CODE (x) == CONST_VECTOR)
13951 return true;
13952
13953 /* Do not allow vector struct mode constants for Advanced SIMD.
13954 We could support 0 and -1 easily, but they need support in
13955 aarch64-simd.md. */
13956 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13957 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13958 return false;
13959
13960 /* Only accept variable-length vector constants if they can be
13961 handled directly.
13962
13963 ??? It would be possible to handle rematerialization of other
13964 constants via secondary reloads. */
13965 if (vec_flags & VEC_ANY_SVE)
13966 return aarch64_simd_valid_immediate (x, NULL);
13967
13968 if (GET_CODE (x) == HIGH)
13969 x = XEXP (x, 0);
13970
13971 /* Accept polynomial constants that can be calculated by using the
13972 destination of a move as the sole temporary. Constants that
13973 require a second temporary cannot be rematerialized (they can't be
13974 forced to memory and also aren't legitimate constants). */
13975 poly_int64 offset;
13976 if (poly_int_rtx_p (x, &offset))
13977 return aarch64_offset_temporaries (false, offset) <= 1;
13978
13979 /* If an offset is being added to something else, we need to allow the
13980 base to be moved into the destination register, meaning that there
13981 are no free temporaries for the offset. */
13982 x = strip_offset (x, &offset);
13983 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13984 return false;
13985
13986 /* Do not allow const (plus (anchor_symbol, const_int)). */
13987 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13988 return false;
13989
13990 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13991 so spilling them is better than rematerialization. */
13992 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13993 return true;
13994
13995 /* Label references are always constant. */
13996 if (GET_CODE (x) == LABEL_REF)
13997 return true;
13998
13999 return false;
14000 }
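/* Some illustrative consequences of the rules above: plain CONST_INTs,
   floating-point CONST_DOUBLEs and CONST_VECTORs are accepted up front;
   Advanced SIMD structure-mode constants are rejected; a non-TLS SYMBOL_REF
   counts as a legitimate constant, while a TLS symbol is spilled instead.  */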
14001
14002 rtx
14003 aarch64_load_tp (rtx target)
14004 {
14005 if (!target
14006 || GET_MODE (target) != Pmode
14007 || !register_operand (target, Pmode))
14008 target = gen_reg_rtx (Pmode);
14009
14010 /* Can return in any reg. */
14011 emit_insn (gen_aarch64_load_tp_hard (target));
14012 return target;
14013 }
14014
14015 /* On AAPCS systems, this is the "struct __va_list". */
14016 static GTY(()) tree va_list_type;
14017
14018 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14019 Return the type to use as __builtin_va_list.
14020
14021 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14022
14023 struct __va_list
14024 {
14025 void *__stack;
14026 void *__gr_top;
14027 void *__vr_top;
14028 int __gr_offs;
14029 int __vr_offs;
14030 }; */
14031
14032 static tree
14033 aarch64_build_builtin_va_list (void)
14034 {
14035 tree va_list_name;
14036 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14037
14038 /* Create the type. */
14039 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14040 /* Give it the required name. */
14041 va_list_name = build_decl (BUILTINS_LOCATION,
14042 TYPE_DECL,
14043 get_identifier ("__va_list"),
14044 va_list_type);
14045 DECL_ARTIFICIAL (va_list_name) = 1;
14046 TYPE_NAME (va_list_type) = va_list_name;
14047 TYPE_STUB_DECL (va_list_type) = va_list_name;
14048
14049 /* Create the fields. */
14050 f_stack = build_decl (BUILTINS_LOCATION,
14051 FIELD_DECL, get_identifier ("__stack"),
14052 ptr_type_node);
14053 f_grtop = build_decl (BUILTINS_LOCATION,
14054 FIELD_DECL, get_identifier ("__gr_top"),
14055 ptr_type_node);
14056 f_vrtop = build_decl (BUILTINS_LOCATION,
14057 FIELD_DECL, get_identifier ("__vr_top"),
14058 ptr_type_node);
14059 f_groff = build_decl (BUILTINS_LOCATION,
14060 FIELD_DECL, get_identifier ("__gr_offs"),
14061 integer_type_node);
14062 f_vroff = build_decl (BUILTINS_LOCATION,
14063 FIELD_DECL, get_identifier ("__vr_offs"),
14064 integer_type_node);
14065
14066 /* Tell tree-stdarg pass about our internal offset fields.
14067 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14068 purposes, to identify whether the code is updating the va_list internal
14069 offset fields in an irregular way. */
14070 va_list_gpr_counter_field = f_groff;
14071 va_list_fpr_counter_field = f_vroff;
14072
14073 DECL_ARTIFICIAL (f_stack) = 1;
14074 DECL_ARTIFICIAL (f_grtop) = 1;
14075 DECL_ARTIFICIAL (f_vrtop) = 1;
14076 DECL_ARTIFICIAL (f_groff) = 1;
14077 DECL_ARTIFICIAL (f_vroff) = 1;
14078
14079 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14080 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14081 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14082 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14083 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14084
14085 TYPE_FIELDS (va_list_type) = f_stack;
14086 DECL_CHAIN (f_stack) = f_grtop;
14087 DECL_CHAIN (f_grtop) = f_vrtop;
14088 DECL_CHAIN (f_vrtop) = f_groff;
14089 DECL_CHAIN (f_groff) = f_vroff;
14090
14091 /* Compute its layout. */
14092 layout_type (va_list_type);
14093
14094 return va_list_type;
14095 }
14096
14097 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14098 static void
14099 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14100 {
14101 const CUMULATIVE_ARGS *cum;
14102 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14103 tree stack, grtop, vrtop, groff, vroff;
14104 tree t;
14105 int gr_save_area_size = cfun->va_list_gpr_size;
14106 int vr_save_area_size = cfun->va_list_fpr_size;
14107 int vr_offset;
14108
14109 cum = &crtl->args.info;
14110 if (cfun->va_list_gpr_size)
14111 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14112 cfun->va_list_gpr_size);
14113 if (cfun->va_list_fpr_size)
14114 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14115 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14116
14117 if (!TARGET_FLOAT)
14118 {
14119 gcc_assert (cum->aapcs_nvrn == 0);
14120 vr_save_area_size = 0;
14121 }
14122
14123 f_stack = TYPE_FIELDS (va_list_type_node);
14124 f_grtop = DECL_CHAIN (f_stack);
14125 f_vrtop = DECL_CHAIN (f_grtop);
14126 f_groff = DECL_CHAIN (f_vrtop);
14127 f_vroff = DECL_CHAIN (f_groff);
14128
14129 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14130 NULL_TREE);
14131 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14132 NULL_TREE);
14133 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14134 NULL_TREE);
14135 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14136 NULL_TREE);
14137 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14138 NULL_TREE);
14139
14140 /* Emit code to initialize STACK, which points to the next varargs stack
14141 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14142 by named arguments. STACK is 8-byte aligned. */
14143 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14144 if (cum->aapcs_stack_size > 0)
14145 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14146 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14147 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14148
14149 /* Emit code to initialize GRTOP, the top of the GR save area.
14150 virtual_incoming_args_rtx should have been 16 byte aligned. */
14151 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14152 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14153 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14154
14155 /* Emit code to initialize VRTOP, the top of the VR save area.
14156 This address is gr_save_area_bytes below GRTOP, rounded
14157 down to the next 16-byte boundary. */
14158 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14159 vr_offset = ROUND_UP (gr_save_area_size,
14160 STACK_BOUNDARY / BITS_PER_UNIT);
14161
14162 if (vr_offset)
14163 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14164 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14165 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14166
14167 /* Emit code to initialize GROFF, the offset from GRTOP of the
14168 next GPR argument. */
14169 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14170 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14171 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14172
14173 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14174 of the next VR argument. */
14175 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14176 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14177 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14178 }
14179
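/* Worked example (an assumption for illustration; it holds only with FP
   enabled and when the tree-stdarg pass has not shrunk the save areas):
   for "void f (int n, ...)", one GP register (x0) is named, so

     gr_save_area_size = (8 - 1) * 8  = 56
     vr_save_area_size = (8 - 0) * 16 = 128

   and va_start leaves __gr_offs = -56, __vr_offs = -128,
   __gr_top = the virtual incoming args pointer and
   __vr_top = __gr_top - ROUND_UP (56, 16) = __gr_top - 64.  */
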
14180 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14181
14182 static tree
14183 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14184 gimple_seq *post_p ATTRIBUTE_UNUSED)
14185 {
14186 tree addr;
14187 bool indirect_p;
14188 bool is_ha; /* is HFA or HVA. */
14189 bool dw_align; /* double-word align. */
14190 machine_mode ag_mode = VOIDmode;
14191 int nregs;
14192 machine_mode mode;
14193
14194 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14195 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14196 HOST_WIDE_INT size, rsize, adjust, align;
14197 tree t, u, cond1, cond2;
14198
14199 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14200 if (indirect_p)
14201 type = build_pointer_type (type);
14202
14203 mode = TYPE_MODE (type);
14204
14205 f_stack = TYPE_FIELDS (va_list_type_node);
14206 f_grtop = DECL_CHAIN (f_stack);
14207 f_vrtop = DECL_CHAIN (f_grtop);
14208 f_groff = DECL_CHAIN (f_vrtop);
14209 f_vroff = DECL_CHAIN (f_groff);
14210
14211 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14212 f_stack, NULL_TREE);
14213 size = int_size_in_bytes (type);
14214
14215 bool abi_break;
14216 align
14217 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14218
14219 dw_align = false;
14220 adjust = 0;
14221 if (aarch64_vfp_is_call_or_return_candidate (mode,
14222 type,
14223 &ag_mode,
14224 &nregs,
14225 &is_ha))
14226 {
14227 /* No frontends can create types with variable-sized modes, so we
14228 shouldn't be asked to pass or return them. */
14229 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14230
14231 /* TYPE passed in fp/simd registers. */
14232 if (!TARGET_FLOAT)
14233 aarch64_err_no_fpadvsimd (mode);
14234
14235 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14236 unshare_expr (valist), f_vrtop, NULL_TREE);
14237 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14238 unshare_expr (valist), f_vroff, NULL_TREE);
14239
14240 rsize = nregs * UNITS_PER_VREG;
14241
14242 if (is_ha)
14243 {
14244 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14245 adjust = UNITS_PER_VREG - ag_size;
14246 }
14247 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14248 && size < UNITS_PER_VREG)
14249 {
14250 adjust = UNITS_PER_VREG - size;
14251 }
14252 }
14253 else
14254 {
14255 /* TYPE passed in general registers. */
14256 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14257 unshare_expr (valist), f_grtop, NULL_TREE);
14258 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14259 unshare_expr (valist), f_groff, NULL_TREE);
14260 rsize = ROUND_UP (size, UNITS_PER_WORD);
14261 nregs = rsize / UNITS_PER_WORD;
14262
14263 if (align > 8)
14264 {
14265 if (abi_break && warn_psabi)
14266 inform (input_location, "parameter passing for argument of type "
14267 "%qT changed in GCC 9.1", type);
14268 dw_align = true;
14269 }
14270
14271 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14272 && size < UNITS_PER_WORD)
14273 {
14274 adjust = UNITS_PER_WORD - size;
14275 }
14276 }
14277
14278 /* Get a local temporary for the field value. */
14279 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14280
14281 /* Emit code to branch if off >= 0. */
14282 t = build2 (GE_EXPR, boolean_type_node, off,
14283 build_int_cst (TREE_TYPE (off), 0));
14284 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14285
14286 if (dw_align)
14287 {
14288 /* Emit: offs = (offs + 15) & -16. */
14289 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14290 build_int_cst (TREE_TYPE (off), 15));
14291 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14292 build_int_cst (TREE_TYPE (off), -16));
14293 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14294 }
14295 else
14296 roundup = NULL;
14297
14298 /* Update ap.__[g|v]r_offs */
14299 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14300 build_int_cst (TREE_TYPE (off), rsize));
14301 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14302
14303 /* Chain the roundup (if any) before the offset update. */
14304 if (roundup)
14305 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14306
14307 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14308 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14309 build_int_cst (TREE_TYPE (f_off), 0));
14310 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14311
14312 /* String up: make sure the assignment happens before the use. */
14313 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14314 COND_EXPR_ELSE (cond1) = t;
14315
14316 /* Prepare the trees handling the argument that is passed on the stack;
14317 the top level node will store in ON_STACK. */
14318 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14319 if (align > 8)
14320 {
14321 /* if (alignof(type) > 8) arg = (arg + 15) & -16; */
14322 t = fold_build_pointer_plus_hwi (arg, 15);
14323 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14324 build_int_cst (TREE_TYPE (t), -16));
14325 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14326 }
14327 else
14328 roundup = NULL;
14329 /* Advance ap.__stack */
14330 t = fold_build_pointer_plus_hwi (arg, size + 7);
14331 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14332 build_int_cst (TREE_TYPE (t), -8));
14333 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14334 /* String up roundup and advance. */
14335 if (roundup)
14336 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14337 /* String up with arg */
14338 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14339 /* Big-endianness related address adjustment. */
14340 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14341 && size < UNITS_PER_WORD)
14342 {
14343 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14344 size_int (UNITS_PER_WORD - size));
14345 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14346 }
14347
14348 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14349 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14350
14351 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14352 t = off;
14353 if (adjust)
14354 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14355 build_int_cst (TREE_TYPE (off), adjust));
14356
14357 t = fold_convert (sizetype, t);
14358 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14359
14360 if (is_ha)
14361 {
14362 /* type ha; // treat as "struct {ftype field[n];}"
14363 ... [computing offs]
14364 for (i = 0; i < nregs; ++i, offs += 16)
14365 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14366 return ha; */
14367 int i;
14368 tree tmp_ha, field_t, field_ptr_t;
14369
14370 /* Declare a local variable. */
14371 tmp_ha = create_tmp_var_raw (type, "ha");
14372 gimple_add_tmp_var (tmp_ha);
14373
14374 /* Establish the base type. */
14375 switch (ag_mode)
14376 {
14377 case E_SFmode:
14378 field_t = float_type_node;
14379 field_ptr_t = float_ptr_type_node;
14380 break;
14381 case E_DFmode:
14382 field_t = double_type_node;
14383 field_ptr_t = double_ptr_type_node;
14384 break;
14385 case E_TFmode:
14386 field_t = long_double_type_node;
14387 field_ptr_t = long_double_ptr_type_node;
14388 break;
14389 case E_HFmode:
14390 field_t = aarch64_fp16_type_node;
14391 field_ptr_t = aarch64_fp16_ptr_type_node;
14392 break;
14393 case E_V2SImode:
14394 case E_V4SImode:
14395 {
14396 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14397 field_t = build_vector_type_for_mode (innertype, ag_mode);
14398 field_ptr_t = build_pointer_type (field_t);
14399 }
14400 break;
14401 default:
14402 gcc_assert (0);
14403 }
14404
14405 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14406 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14407 addr = t;
14408 t = fold_convert (field_ptr_t, addr);
14409 t = build2 (MODIFY_EXPR, field_t,
14410 build1 (INDIRECT_REF, field_t, tmp_ha),
14411 build1 (INDIRECT_REF, field_t, t));
14412
14413 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14414 for (i = 1; i < nregs; ++i)
14415 {
14416 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14417 u = fold_convert (field_ptr_t, addr);
14418 u = build2 (MODIFY_EXPR, field_t,
14419 build2 (MEM_REF, field_t, tmp_ha,
14420 build_int_cst (field_ptr_t,
14421 (i *
14422 int_size_in_bytes (field_t)))),
14423 build1 (INDIRECT_REF, field_t, u));
14424 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14425 }
14426
14427 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14428 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14429 }
14430
14431 COND_EXPR_ELSE (cond2) = t;
14432 addr = fold_convert (build_pointer_type (type), cond1);
14433 addr = build_va_arg_indirect_ref (addr);
14434
14435 if (indirect_p)
14436 addr = build_va_arg_indirect_ref (addr);
14437
14438 return addr;
14439 }
14440
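/* A rough sketch of the gimplified va_arg sequence for a plain integer
   TYPE (an illustrative assumption; it ignores over-alignment, big-endian
   adjustments and the HFA copy loop handled above):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                  // GP save area already exhausted
     ap.__gr_offs = off + rsize;       // rsize = size rounded up to 8
     if (ap.__gr_offs > 0)
       goto on_stack;                  // this argument did not fit either
     addr = ap.__gr_top + off;         // argument is in the save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (char *) (((uintptr_t) addr + size + 7) & -8);
   done:
     result = *(TYPE *) addr;  */
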
14441 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14442
14443 static void
14444 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14445 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14446 int no_rtl)
14447 {
14448 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14449 CUMULATIVE_ARGS local_cum;
14450 int gr_saved = cfun->va_list_gpr_size;
14451 int vr_saved = cfun->va_list_fpr_size;
14452
14453 /* The caller has advanced CUM up to, but not beyond, the last named
14454 argument. Advance a local copy of CUM past the last "real" named
14455 argument, to find out how many registers are left over. */
14456 local_cum = *cum;
14457 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14458
14459 /* Find out how many registers we need to save.
14460 Honor the tree-stdarg analysis results. */
14461 if (cfun->va_list_gpr_size)
14462 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14463 cfun->va_list_gpr_size / UNITS_PER_WORD);
14464 if (cfun->va_list_fpr_size)
14465 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14466 cfun->va_list_fpr_size / UNITS_PER_VREG);
14467
14468 if (!TARGET_FLOAT)
14469 {
14470 gcc_assert (local_cum.aapcs_nvrn == 0);
14471 vr_saved = 0;
14472 }
14473
14474 if (!no_rtl)
14475 {
14476 if (gr_saved > 0)
14477 {
14478 rtx ptr, mem;
14479
14480 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14481 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14482 - gr_saved * UNITS_PER_WORD);
14483 mem = gen_frame_mem (BLKmode, ptr);
14484 set_mem_alias_set (mem, get_varargs_alias_set ());
14485
14486 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14487 mem, gr_saved);
14488 }
14489 if (vr_saved > 0)
14490 {
14491 /* We can't use move_block_from_reg, because it will use
14492 the wrong mode, storing D regs only. */
14493 machine_mode mode = TImode;
14494 int off, i, vr_start;
14495
14496 /* Set OFF to the offset from virtual_incoming_args_rtx of
14497 the first vector register. The VR save area lies below
14498 the GR one, and is aligned to 16 bytes. */
14499 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14500 STACK_BOUNDARY / BITS_PER_UNIT);
14501 off -= vr_saved * UNITS_PER_VREG;
14502
14503 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14504 for (i = 0; i < vr_saved; ++i)
14505 {
14506 rtx ptr, mem;
14507
14508 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14509 mem = gen_frame_mem (mode, ptr);
14510 set_mem_alias_set (mem, get_varargs_alias_set ());
14511 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14512 off += UNITS_PER_VREG;
14513 }
14514 }
14515 }
14516
14517 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14518 any complication of having crtl->args.pretend_args_size changed. */
14519 cfun->machine->frame.saved_varargs_size
14520 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14521 STACK_BOUNDARY / BITS_PER_UNIT)
14522 + vr_saved * UNITS_PER_VREG);
14523 }
14524
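/* Resulting layout of the varargs save area (illustration only):

     virtual_incoming_args_rtx ->      named stack arguments live above this
       x<ncrn> ... x7                  gr_saved * 8 bytes, ending here
     next 16-byte boundary below ->    this is what __vr_top points at
       q<nvrn> ... q7                  vr_saved * 16 bytes, ending here  */

/* Implement TARGET_CONDITIONAL_REGISTER_USAGE.  */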
14525 static void
14526 aarch64_conditional_register_usage (void)
14527 {
14528 int i;
14529 if (!TARGET_FLOAT)
14530 {
14531 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14532 {
14533 fixed_regs[i] = 1;
14534 call_used_regs[i] = 1;
14535 }
14536 }
14537 if (!TARGET_SVE)
14538 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14539 {
14540 fixed_regs[i] = 1;
14541 call_used_regs[i] = 1;
14542 }
14543
14544 /* When tracking speculation, we need a couple of call-clobbered registers
14545 to track the speculation state. It would be nice to just use
14546 IP0 and IP1, but currently there are numerous places that just
14547 assume these registers are free for other uses (eg pointer
14548 authentication). */
14549 if (aarch64_track_speculation)
14550 {
14551 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14552 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14553 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14554 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14555 }
14556 }
14557
14558 /* Walk down the type tree of TYPE counting consecutive base elements.
14559 If *MODEP is VOIDmode, then set it to the first valid floating point
14560 type. If a non-floating point type is found, or if a floating point
14561 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14562 otherwise return the count in the sub-tree. */
14563 static int
14564 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14565 {
14566 machine_mode mode;
14567 HOST_WIDE_INT size;
14568
14569 switch (TREE_CODE (type))
14570 {
14571 case REAL_TYPE:
14572 mode = TYPE_MODE (type);
14573 if (mode != DFmode && mode != SFmode
14574 && mode != TFmode && mode != HFmode)
14575 return -1;
14576
14577 if (*modep == VOIDmode)
14578 *modep = mode;
14579
14580 if (*modep == mode)
14581 return 1;
14582
14583 break;
14584
14585 case COMPLEX_TYPE:
14586 mode = TYPE_MODE (TREE_TYPE (type));
14587 if (mode != DFmode && mode != SFmode
14588 && mode != TFmode && mode != HFmode)
14589 return -1;
14590
14591 if (*modep == VOIDmode)
14592 *modep = mode;
14593
14594 if (*modep == mode)
14595 return 2;
14596
14597 break;
14598
14599 case VECTOR_TYPE:
14600 /* Use V2SImode and V4SImode as representatives of all 64-bit
14601 and 128-bit vector types. */
14602 size = int_size_in_bytes (type);
14603 switch (size)
14604 {
14605 case 8:
14606 mode = V2SImode;
14607 break;
14608 case 16:
14609 mode = V4SImode;
14610 break;
14611 default:
14612 return -1;
14613 }
14614
14615 if (*modep == VOIDmode)
14616 *modep = mode;
14617
14618 /* Vector modes are considered to be opaque: two vectors are
14619 equivalent for the purposes of being homogeneous aggregates
14620 if they are the same size. */
14621 if (*modep == mode)
14622 return 1;
14623
14624 break;
14625
14626 case ARRAY_TYPE:
14627 {
14628 int count;
14629 tree index = TYPE_DOMAIN (type);
14630
14631 /* Can't handle incomplete types nor sizes that are not
14632 fixed. */
14633 if (!COMPLETE_TYPE_P (type)
14634 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14635 return -1;
14636
14637 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14638 if (count == -1
14639 || !index
14640 || !TYPE_MAX_VALUE (index)
14641 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14642 || !TYPE_MIN_VALUE (index)
14643 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14644 || count < 0)
14645 return -1;
14646
14647 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14648 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14649
14650 /* There must be no padding. */
14651 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14652 count * GET_MODE_BITSIZE (*modep)))
14653 return -1;
14654
14655 return count;
14656 }
14657
14658 case RECORD_TYPE:
14659 {
14660 int count = 0;
14661 int sub_count;
14662 tree field;
14663
14664 /* Can't handle incomplete types nor sizes that are not
14665 fixed. */
14666 if (!COMPLETE_TYPE_P (type)
14667 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14668 return -1;
14669
14670 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14671 {
14672 if (TREE_CODE (field) != FIELD_DECL)
14673 continue;
14674
14675 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14676 if (sub_count < 0)
14677 return -1;
14678 count += sub_count;
14679 }
14680
14681 /* There must be no padding. */
14682 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14683 count * GET_MODE_BITSIZE (*modep)))
14684 return -1;
14685
14686 return count;
14687 }
14688
14689 case UNION_TYPE:
14690 case QUAL_UNION_TYPE:
14691 {
14692 /* These aren't very interesting except in a degenerate case. */
14693 int count = 0;
14694 int sub_count;
14695 tree field;
14696
14697 /* Can't handle incomplete types nor sizes that are not
14698 fixed. */
14699 if (!COMPLETE_TYPE_P (type)
14700 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14701 return -1;
14702
14703 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14704 {
14705 if (TREE_CODE (field) != FIELD_DECL)
14706 continue;
14707
14708 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14709 if (sub_count < 0)
14710 return -1;
14711 count = count > sub_count ? count : sub_count;
14712 }
14713
14714 /* There must be no padding. */
14715 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14716 count * GET_MODE_BITSIZE (*modep)))
14717 return -1;
14718
14719 return count;
14720 }
14721
14722 default:
14723 break;
14724 }
14725
14726 return -1;
14727 }
14728
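/* Some examples of the classification above (illustrative only), with
   *MODEP starting out as VOIDmode:

     struct { float x, y, z; }         -> 3, *modep == SFmode
     double[2]                         -> 2, *modep == DFmode
     _Complex float                    -> 2, *modep == SFmode
     struct { float x; double y; }     -> -1 (mismatched base types)
     struct { float x; int i; }        -> -1 (non-floating-point member)  */
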
14729 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14730 type as described in AAPCS64 \S 4.1.2.
14731
14732 See the comment above aarch64_composite_type_p for the notes on MODE. */
14733
14734 static bool
14735 aarch64_short_vector_p (const_tree type,
14736 machine_mode mode)
14737 {
14738 poly_int64 size = -1;
14739
14740 if (type && TREE_CODE (type) == VECTOR_TYPE)
14741 size = int_size_in_bytes (type);
14742 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14743 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14744 size = GET_MODE_SIZE (mode);
14745
14746 return known_eq (size, 8) || known_eq (size, 16);
14747 }
14748
14749 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14750 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14751 array types. The C99 floating-point complex types are also considered
14752 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14753 types, which are GCC extensions and out of the scope of AAPCS64, are
14754 treated as composite types here as well.
14755
14756 Note that MODE itself is not sufficient in determining whether a type
14757 is such a composite type or not. This is because
14758 stor-layout.c:compute_record_mode may have already changed the MODE
14759 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14760 structure with only one field may have its MODE set to the mode of the
14761 field. Also an integer mode whose size matches the size of the
14762 RECORD_TYPE type may be used to substitute the original mode
14763 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14764 solely relied on. */
14765
14766 static bool
14767 aarch64_composite_type_p (const_tree type,
14768 machine_mode mode)
14769 {
14770 if (aarch64_short_vector_p (type, mode))
14771 return false;
14772
14773 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14774 return true;
14775
14776 if (mode == BLKmode
14777 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14778 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14779 return true;
14780
14781 return false;
14782 }
14783
14784 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14785 shall be passed or returned in simd/fp register(s) (providing these
14786 parameter passing registers are available).
14787
14788 Upon successful return, *COUNT returns the number of needed registers,
14789 *BASE_MODE returns the mode of the individual register and when IS_HAF
14790 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14791 floating-point aggregate or a homogeneous short-vector aggregate. */
14792
14793 static bool
14794 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14795 const_tree type,
14796 machine_mode *base_mode,
14797 int *count,
14798 bool *is_ha)
14799 {
14800 machine_mode new_mode = VOIDmode;
14801 bool composite_p = aarch64_composite_type_p (type, mode);
14802
14803 if (is_ha != NULL) *is_ha = false;
14804
14805 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14806 || aarch64_short_vector_p (type, mode))
14807 {
14808 *count = 1;
14809 new_mode = mode;
14810 }
14811 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14812 {
14813 if (is_ha != NULL) *is_ha = true;
14814 *count = 2;
14815 new_mode = GET_MODE_INNER (mode);
14816 }
14817 else if (type && composite_p)
14818 {
14819 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14820
14821 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14822 {
14823 if (is_ha != NULL) *is_ha = true;
14824 *count = ag_count;
14825 }
14826 else
14827 return false;
14828 }
14829 else
14830 return false;
14831
14832 *base_mode = new_mode;
14833 return true;
14834 }
14835
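/* For example (illustration only): a struct of four floats is a
   homogeneous floating-point aggregate, so the function above returns
   true with *count == 4, *base_mode == SFmode and *is_ha set; a
   _Complex double yields *count == 2 and *base_mode == DFmode; a struct
   of five floats exceeds HA_MAX_NUM_FLDS and is rejected.  */
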
14836 /* Implement TARGET_STRUCT_VALUE_RTX. */
14837
14838 static rtx
14839 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14840 int incoming ATTRIBUTE_UNUSED)
14841 {
14842 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14843 }
14844
14845 /* Implements target hook vector_mode_supported_p. */
14846 static bool
14847 aarch64_vector_mode_supported_p (machine_mode mode)
14848 {
14849 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14850 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14851 }
14852
14853 /* Return the full-width SVE vector mode for element mode MODE, if one
14854 exists. */
14855 opt_machine_mode
14856 aarch64_full_sve_mode (scalar_mode mode)
14857 {
14858 switch (mode)
14859 {
14860 case E_DFmode:
14861 return VNx2DFmode;
14862 case E_SFmode:
14863 return VNx4SFmode;
14864 case E_HFmode:
14865 return VNx8HFmode;
14866 case E_DImode:
14867 return VNx2DImode;
14868 case E_SImode:
14869 return VNx4SImode;
14870 case E_HImode:
14871 return VNx8HImode;
14872 case E_QImode:
14873 return VNx16QImode;
14874 default:
14875 return opt_machine_mode ();
14876 }
14877 }
14878
14879 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14880 if it exists. */
14881 opt_machine_mode
14882 aarch64_vq_mode (scalar_mode mode)
14883 {
14884 switch (mode)
14885 {
14886 case E_DFmode:
14887 return V2DFmode;
14888 case E_SFmode:
14889 return V4SFmode;
14890 case E_HFmode:
14891 return V8HFmode;
14892 case E_SImode:
14893 return V4SImode;
14894 case E_HImode:
14895 return V8HImode;
14896 case E_QImode:
14897 return V16QImode;
14898 case E_DImode:
14899 return V2DImode;
14900 default:
14901 return opt_machine_mode ();
14902 }
14903 }
14904
14905 /* Return appropriate SIMD container
14906 for MODE within a vector of WIDTH bits. */
14907 static machine_mode
14908 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14909 {
14910 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14911 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14912
14913 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14914 if (TARGET_SIMD)
14915 {
14916 if (known_eq (width, 128))
14917 return aarch64_vq_mode (mode).else_mode (word_mode);
14918 else
14919 switch (mode)
14920 {
14921 case E_SFmode:
14922 return V2SFmode;
14923 case E_HFmode:
14924 return V4HFmode;
14925 case E_SImode:
14926 return V2SImode;
14927 case E_HImode:
14928 return V4HImode;
14929 case E_QImode:
14930 return V8QImode;
14931 default:
14932 break;
14933 }
14934 }
14935 return word_mode;
14936 }
14937
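/* For example (illustrative): without SVE, (SImode, 128) maps to V4SImode
   and (HFmode, 64) to V4HFmode; with SVE and WIDTH equal to the SVE vector
   width, SImode maps to VNx4SImode.  Unsupported combinations fall back to
   word_mode.  */
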
14938 /* Return the SVE or 128-bit container as the preferred SIMD mode for MODE. */
14939 static machine_mode
14940 aarch64_preferred_simd_mode (scalar_mode mode)
14941 {
14942 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14943 return aarch64_simd_container_mode (mode, bits);
14944 }
14945
14946 /* Return a list of possible vector sizes for the vectorizer
14947 to iterate over. */
14948 static void
14949 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14950 {
14951 if (TARGET_SVE)
14952 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14953 sizes->safe_push (16);
14954 sizes->safe_push (8);
14955 }
14956
14957 /* Implement TARGET_MANGLE_TYPE. */
14958
14959 static const char *
14960 aarch64_mangle_type (const_tree type)
14961 {
14962 /* The AArch64 ABI documents say that "__va_list" has to be
14963 mangled as if it is in the "std" namespace. */
14964 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14965 return "St9__va_list";
14966
14967 /* Half-precision float. */
14968 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14969 return "Dh";
14970
14971 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14972 builtin types. */
14973 if (TYPE_NAME (type) != NULL)
14974 return aarch64_mangle_builtin_type (type);
14975
14976 /* Use the default mangling. */
14977 return NULL;
14978 }
14979
14980 /* Find the first rtx_insn before insn that will generate an assembly
14981 instruction. */
14982
14983 static rtx_insn *
14984 aarch64_prev_real_insn (rtx_insn *insn)
14985 {
14986 if (!insn)
14987 return NULL;
14988
14989 do
14990 {
14991 insn = prev_real_insn (insn);
14992 }
14993 while (insn && recog_memoized (insn) < 0);
14994
14995 return insn;
14996 }
14997
14998 static bool
14999 is_madd_op (enum attr_type t1)
15000 {
15001 unsigned int i;
15002 /* A number of these may be AArch32 only. */
15003 enum attr_type mlatypes[] = {
15004 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15005 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15006 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15007 };
15008
15009 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15010 {
15011 if (t1 == mlatypes[i])
15012 return true;
15013 }
15014
15015 return false;
15016 }
15017
15018 /* Check if there is a register dependency between a load and the insn
15019 for which we hold recog_data. */
15020
15021 static bool
15022 dep_between_memop_and_curr (rtx memop)
15023 {
15024 rtx load_reg;
15025 int opno;
15026
15027 gcc_assert (GET_CODE (memop) == SET);
15028
15029 if (!REG_P (SET_DEST (memop)))
15030 return false;
15031
15032 load_reg = SET_DEST (memop);
15033 for (opno = 1; opno < recog_data.n_operands; opno++)
15034 {
15035 rtx operand = recog_data.operand[opno];
15036 if (REG_P (operand)
15037 && reg_overlap_mentioned_p (load_reg, operand))
15038 return true;
15039
15040 }
15041 return false;
15042 }
15043
15044
15045 /* When working around the Cortex-A53 erratum 835769,
15046 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15047 instruction and has a preceding memory instruction such that a NOP
15048 should be inserted between them. */
15049
15050 bool
15051 aarch64_madd_needs_nop (rtx_insn* insn)
15052 {
15053 enum attr_type attr_type;
15054 rtx_insn *prev;
15055 rtx body;
15056
15057 if (!TARGET_FIX_ERR_A53_835769)
15058 return false;
15059
15060 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15061 return false;
15062
15063 attr_type = get_attr_type (insn);
15064 if (!is_madd_op (attr_type))
15065 return false;
15066
15067 prev = aarch64_prev_real_insn (insn);
15068 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15069 Restore recog state to INSN to avoid state corruption. */
15070 extract_constrain_insn_cached (insn);
15071
15072 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15073 return false;
15074
15075 body = single_set (prev);
15076
15077 /* If the previous insn is a memory op and there is no dependency between
15078 it and the DImode madd, emit a NOP between them. If body is NULL then we
15079 have a complex memory operation, probably a load/store pair.
15080 Be conservative for now and emit a NOP. */
15081 if (GET_MODE (recog_data.operand[0]) == DImode
15082 && (!body || !dep_between_memop_and_curr (body)))
15083 return true;
15084
15085 return false;
15086
15087 }
15088
15089
15090 /* Implement FINAL_PRESCAN_INSN. */
15091
15092 void
15093 aarch64_final_prescan_insn (rtx_insn *insn)
15094 {
15095 if (aarch64_madd_needs_nop (insn))
15096 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15097 }
15098
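/* For illustration (the concrete instructions are an assumption): with
   -mfix-cortex-a53-835769, a sequence such as

       ldr  x10, [sp, 16]
       madd x0, x1, x2, x3

   is emitted as

       ldr  x10, [sp, 16]
       nop  // between mem op and mult-accumulate
       madd x0, x1, x2, x3  */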
15099
15100 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15101 instruction. */
15102
15103 bool
15104 aarch64_sve_index_immediate_p (rtx base_or_step)
15105 {
15106 return (CONST_INT_P (base_or_step)
15107 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15108 }
15109
15110 /* Return true if X is a valid immediate for the SVE ADD and SUB
15111 instructions. Negate X first if NEGATE_P is true. */
15112
15113 bool
15114 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15115 {
15116 rtx elt;
15117
15118 if (!const_vec_duplicate_p (x, &elt)
15119 || !CONST_INT_P (elt))
15120 return false;
15121
15122 HOST_WIDE_INT val = INTVAL (elt);
15123 if (negate_p)
15124 val = -val;
15125 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15126
15127 if (val & 0xff)
15128 return IN_RANGE (val, 0, 0xff);
15129 return IN_RANGE (val, 0, 0xff00);
15130 }
15131
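/* So the accepted values are 0..255, or multiples of 256 up to 65280
   (matching the "#imm, LSL #8" form of the instructions).  For example
   (illustration): 7, 255 and 0x1200 are accepted, while 0x101 and
   0x10000 are not.  */
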
15132 /* Return true if X is a valid immediate operand for an SVE logical
15133 instruction such as AND. */
15134
15135 bool
15136 aarch64_sve_bitmask_immediate_p (rtx x)
15137 {
15138 rtx elt;
15139
15140 return (const_vec_duplicate_p (x, &elt)
15141 && CONST_INT_P (elt)
15142 && aarch64_bitmask_imm (INTVAL (elt),
15143 GET_MODE_INNER (GET_MODE (x))));
15144 }
15145
15146 /* Return true if X is a valid immediate for the SVE DUP and CPY
15147 instructions. */
15148
15149 bool
15150 aarch64_sve_dup_immediate_p (rtx x)
15151 {
15152 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15153 if (!CONST_INT_P (x))
15154 return false;
15155
15156 HOST_WIDE_INT val = INTVAL (x);
15157 if (val & 0xff)
15158 return IN_RANGE (val, -0x80, 0x7f);
15159 return IN_RANGE (val, -0x8000, 0x7f00);
15160 }
15161
15162 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15163 SIGNED_P says whether the operand is signed rather than unsigned. */
15164
15165 bool
15166 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15167 {
15168 rtx elt;
15169
15170 return (const_vec_duplicate_p (x, &elt)
15171 && CONST_INT_P (elt)
15172 && (signed_p
15173 ? IN_RANGE (INTVAL (elt), -16, 15)
15174 : IN_RANGE (INTVAL (elt), 0, 127)));
15175 }
15176
15177 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15178 instruction. Negate X first if NEGATE_P is true. */
15179
15180 bool
15181 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15182 {
15183 rtx elt;
15184 REAL_VALUE_TYPE r;
15185
15186 if (!const_vec_duplicate_p (x, &elt)
15187 || GET_CODE (elt) != CONST_DOUBLE)
15188 return false;
15189
15190 r = *CONST_DOUBLE_REAL_VALUE (elt);
15191
15192 if (negate_p)
15193 r = real_value_negate (&r);
15194
15195 if (real_equal (&r, &dconst1))
15196 return true;
15197 if (real_equal (&r, &dconsthalf))
15198 return true;
15199 return false;
15200 }
15201
15202 /* Return true if X is a valid immediate operand for an SVE FMUL
15203 instruction. */
15204
15205 bool
15206 aarch64_sve_float_mul_immediate_p (rtx x)
15207 {
15208 rtx elt;
15209
15210 return (const_vec_duplicate_p (x, &elt)
15211 && GET_CODE (elt) == CONST_DOUBLE
15212 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15213 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15214 }
15215
15216 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15217 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15218 is nonnull, use it to describe valid immediates. */
15219 static bool
15220 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15221 simd_immediate_info *info,
15222 enum simd_immediate_check which,
15223 simd_immediate_info::insn_type insn)
15224 {
15225 /* Try a 4-byte immediate with LSL. */
15226 for (unsigned int shift = 0; shift < 32; shift += 8)
15227 if ((val32 & (0xff << shift)) == val32)
15228 {
15229 if (info)
15230 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15231 simd_immediate_info::LSL, shift);
15232 return true;
15233 }
15234
15235 /* Try a 2-byte immediate with LSL. */
15236 unsigned int imm16 = val32 & 0xffff;
15237 if (imm16 == (val32 >> 16))
15238 for (unsigned int shift = 0; shift < 16; shift += 8)
15239 if ((imm16 & (0xff << shift)) == imm16)
15240 {
15241 if (info)
15242 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15243 simd_immediate_info::LSL, shift);
15244 return true;
15245 }
15246
15247 /* Try a 4-byte immediate with MSL, except for cases that MVN
15248 can handle. */
15249 if (which == AARCH64_CHECK_MOV)
15250 for (unsigned int shift = 8; shift < 24; shift += 8)
15251 {
15252 unsigned int low = (1 << shift) - 1;
15253 if (((val32 & (0xff << shift)) | low) == val32)
15254 {
15255 if (info)
15256 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15257 simd_immediate_info::MSL, shift);
15258 return true;
15259 }
15260 }
15261
15262 return false;
15263 }
15264
15265 /* Return true if replicating VAL64 is a valid immediate for the
15266 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15267 use it to describe valid immediates. */
15268 static bool
15269 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15270 simd_immediate_info *info,
15271 enum simd_immediate_check which)
15272 {
15273 unsigned int val32 = val64 & 0xffffffff;
15274 unsigned int val16 = val64 & 0xffff;
15275 unsigned int val8 = val64 & 0xff;
15276
15277 if (val32 == (val64 >> 32))
15278 {
15279 if ((which & AARCH64_CHECK_ORR) != 0
15280 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15281 simd_immediate_info::MOV))
15282 return true;
15283
15284 if ((which & AARCH64_CHECK_BIC) != 0
15285 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15286 simd_immediate_info::MVN))
15287 return true;
15288
15289 /* Try using a replicated byte. */
15290 if (which == AARCH64_CHECK_MOV
15291 && val16 == (val32 >> 16)
15292 && val8 == (val16 >> 8))
15293 {
15294 if (info)
15295 *info = simd_immediate_info (QImode, val8);
15296 return true;
15297 }
15298 }
15299
15300 /* Try using a bit-to-bytemask. */
15301 if (which == AARCH64_CHECK_MOV)
15302 {
15303 unsigned int i;
15304 for (i = 0; i < 64; i += 8)
15305 {
15306 unsigned char byte = (val64 >> i) & 0xff;
15307 if (byte != 0 && byte != 0xff)
15308 break;
15309 }
15310 if (i == 64)
15311 {
15312 if (info)
15313 *info = simd_immediate_info (DImode, val64);
15314 return true;
15315 }
15316 }
15317 return false;
15318 }
15319
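/* Two examples (illustrative): a repeating value of 0x0000004300000043
   is matched as an SImode MOV of 0x43 with no shift, while
   0xffffffff00000000 is caught by the bit-to-bytemask test above (every
   byte is either 0x00 or 0xff) and handled as a 64-bit immediate.  */
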
15320 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15321 instruction. If INFO is nonnull, use it to describe valid immediates. */
15322
15323 static bool
15324 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15325 simd_immediate_info *info)
15326 {
15327 scalar_int_mode mode = DImode;
15328 unsigned int val32 = val64 & 0xffffffff;
15329 if (val32 == (val64 >> 32))
15330 {
15331 mode = SImode;
15332 unsigned int val16 = val32 & 0xffff;
15333 if (val16 == (val32 >> 16))
15334 {
15335 mode = HImode;
15336 unsigned int val8 = val16 & 0xff;
15337 if (val8 == (val16 >> 8))
15338 mode = QImode;
15339 }
15340 }
15341 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15342 if (IN_RANGE (val, -0x80, 0x7f))
15343 {
15344 /* DUP with no shift. */
15345 if (info)
15346 *info = simd_immediate_info (mode, val);
15347 return true;
15348 }
15349 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15350 {
15351 /* DUP with LSL #8. */
15352 if (info)
15353 *info = simd_immediate_info (mode, val);
15354 return true;
15355 }
15356 if (aarch64_bitmask_imm (val64, mode))
15357 {
15358 /* DUPM. */
15359 if (info)
15360 *info = simd_immediate_info (mode, val);
15361 return true;
15362 }
15363 return false;
15364 }
15365
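/* Examples (illustrative): replicating the byte 0x01 gives
   val64 == 0x0101010101010101, matched as a QImode DUP of 1; replicating
   the 16-bit value 0x1200 is matched as a DUP with LSL #8; replicating
   the 16-bit value 0x00ff fails both DUP checks but is a valid bitmask
   immediate, so it is matched as a DUPM.  */
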
15366 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15367 it to describe valid immediates. */
15368
15369 static bool
15370 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15371 {
15372 if (x == CONST0_RTX (GET_MODE (x)))
15373 {
15374 if (info)
15375 *info = simd_immediate_info (DImode, 0);
15376 return true;
15377 }
15378
15379 /* Analyze the value as a VNx16BImode. This should be relatively
15380 efficient, since rtx_vector_builder has enough built-in capacity
15381 to store all VLA predicate constants without needing the heap. */
15382 rtx_vector_builder builder;
15383 if (!aarch64_get_sve_pred_bits (builder, x))
15384 return false;
15385
15386 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15387 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15388 {
15389 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15390 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15391 if (pattern != AARCH64_NUM_SVPATTERNS)
15392 {
15393 if (info)
15394 {
15395 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15396 *info = simd_immediate_info (int_mode, pattern);
15397 }
15398 return true;
15399 }
15400 }
15401 return false;
15402 }
15403
15404 /* Return true if OP is a valid SIMD immediate for the operation
15405 described by WHICH. If INFO is nonnull, use it to describe valid
15406 immediates. */
15407 bool
15408 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15409 enum simd_immediate_check which)
15410 {
15411 machine_mode mode = GET_MODE (op);
15412 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15413 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15414 return false;
15415
15416 if (vec_flags & VEC_SVE_PRED)
15417 return aarch64_sve_pred_valid_immediate (op, info);
15418
15419 scalar_mode elt_mode = GET_MODE_INNER (mode);
15420 rtx base, step;
15421 unsigned int n_elts;
15422 if (GET_CODE (op) == CONST_VECTOR
15423 && CONST_VECTOR_DUPLICATE_P (op))
15424 n_elts = CONST_VECTOR_NPATTERNS (op);
15425 else if ((vec_flags & VEC_SVE_DATA)
15426 && const_vec_series_p (op, &base, &step))
15427 {
15428 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15429 if (!aarch64_sve_index_immediate_p (base)
15430 || !aarch64_sve_index_immediate_p (step))
15431 return false;
15432
15433 if (info)
15434 *info = simd_immediate_info (elt_mode, base, step);
15435 return true;
15436 }
15437 else if (GET_CODE (op) == CONST_VECTOR
15438 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15439 /* N_ELTS set above. */;
15440 else
15441 return false;
15442
15443 scalar_float_mode elt_float_mode;
15444 if (n_elts == 1
15445 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15446 {
15447 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15448 if (aarch64_float_const_zero_rtx_p (elt)
15449 || aarch64_float_const_representable_p (elt))
15450 {
15451 if (info)
15452 *info = simd_immediate_info (elt_float_mode, elt);
15453 return true;
15454 }
15455 }
15456
15457 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15458 if (elt_size > 8)
15459 return false;
15460
15461 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15462
15463 /* Expand the vector constant out into a byte vector, with the least
15464 significant byte of the register first. */
15465 auto_vec<unsigned char, 16> bytes;
15466 bytes.reserve (n_elts * elt_size);
15467 for (unsigned int i = 0; i < n_elts; i++)
15468 {
15469 /* The vector is provided in gcc endian-neutral fashion.
15470 For aarch64_be Advanced SIMD, it must be laid out in the vector
15471 register in reverse order. */
15472 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15473 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15474
15475 if (elt_mode != elt_int_mode)
15476 elt = gen_lowpart (elt_int_mode, elt);
15477
15478 if (!CONST_INT_P (elt))
15479 return false;
15480
15481 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15482 for (unsigned int byte = 0; byte < elt_size; byte++)
15483 {
15484 bytes.quick_push (elt_val & 0xff);
15485 elt_val >>= BITS_PER_UNIT;
15486 }
15487 }
15488
15489 /* The immediate must repeat every eight bytes. */
15490 unsigned int nbytes = bytes.length ();
15491 for (unsigned i = 8; i < nbytes; ++i)
15492 if (bytes[i] != bytes[i - 8])
15493 return false;
15494
15495 /* Get the repeating 8-byte value as an integer. No endian correction
15496 is needed here because bytes is already in lsb-first order. */
15497 unsigned HOST_WIDE_INT val64 = 0;
15498 for (unsigned int i = 0; i < 8; i++)
15499 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15500 << (i * BITS_PER_UNIT));
15501
15502 if (vec_flags & VEC_SVE_DATA)
15503 return aarch64_sve_valid_immediate (val64, info);
15504 else
15505 return aarch64_advsimd_valid_immediate (val64, info, which);
15506 }
15507
15508 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15509 has a step in the range of INDEX. Return the index expression if so,
15510 otherwise return null. */
15511 rtx
15512 aarch64_check_zero_based_sve_index_immediate (rtx x)
15513 {
15514 rtx base, step;
15515 if (const_vec_series_p (x, &base, &step)
15516 && base == const0_rtx
15517 && aarch64_sve_index_immediate_p (step))
15518 return step;
15519 return NULL_RTX;
15520 }
15521
15522 /* Check whether immediate shift constants are within range. */
15523 bool
15524 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15525 {
15526 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15527 if (left)
15528 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15529 else
15530 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15531 }
15532
15533 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15534 operation of width WIDTH at bit position POS. */
15535
15536 rtx
15537 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15538 {
15539 gcc_assert (CONST_INT_P (width));
15540 gcc_assert (CONST_INT_P (pos));
15541
15542 unsigned HOST_WIDE_INT mask
15543 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15544 return GEN_INT (mask << UINTVAL (pos));
15545 }
15546
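/* For example (illustration): WIDTH == 8 and POS == 16 give the mask
   ((1 << 8) - 1) << 16 == 0x00ff0000.  */

/* Return true if X is a valid constant or symbolic operand for a move
   into a register of mode MODE.  */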
15547 bool
15548 aarch64_mov_operand_p (rtx x, machine_mode mode)
15549 {
15550 if (GET_CODE (x) == HIGH
15551 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15552 return true;
15553
15554 if (CONST_INT_P (x))
15555 return true;
15556
15557 if (VECTOR_MODE_P (GET_MODE (x)))
15558 {
15559 /* Require predicate constants to be VNx16BI before RA, so that we
15560 force everything to have a canonical form. */
15561 if (!lra_in_progress
15562 && !reload_completed
15563 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15564 && GET_MODE (x) != VNx16BImode)
15565 return false;
15566
15567 return aarch64_simd_valid_immediate (x, NULL);
15568 }
15569
15570 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15571 return true;
15572
15573 if (aarch64_sve_cnt_immediate_p (x))
15574 return true;
15575
15576 return aarch64_classify_symbolic_expression (x)
15577 == SYMBOL_TINY_ABSOLUTE;
15578 }
15579
15580 /* Return a const_int vector of VAL. */
15581 rtx
15582 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15583 {
15584 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15585 return gen_const_vec_duplicate (mode, c);
15586 }
15587
15588 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15589
15590 bool
15591 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15592 {
15593 machine_mode vmode;
15594
15595 vmode = aarch64_simd_container_mode (mode, 64);
15596 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15597 return aarch64_simd_valid_immediate (op_v, NULL);
15598 }
15599
15600 /* Construct and return a PARALLEL RTX vector with elements numbering the
15601 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15602 the vector - from the perspective of the architecture. This does not
15603 line up with GCC's perspective on lane numbers, so we end up with
15604 different masks depending on our target endian-ness. The diagram
15605 below may help. We must draw the distinction when building masks
15606 which select one half of the vector. An instruction selecting
15607 architectural low-lanes for a big-endian target, must be described using
15608 a mask selecting GCC high-lanes.
15609
15610 Big-Endian Little-Endian
15611
15612 GCC 0 1 2 3 3 2 1 0
15613 | x | x | x | x | | x | x | x | x |
15614 Architecture 3 2 1 0 3 2 1 0
15615
15616 Low Mask: { 2, 3 } { 0, 1 }
15617 High Mask: { 0, 1 } { 2, 3 }
15618
15619 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15620
15621 rtx
15622 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15623 {
15624 rtvec v = rtvec_alloc (nunits / 2);
15625 int high_base = nunits / 2;
15626 int low_base = 0;
15627 int base;
15628 rtx t1;
15629 int i;
15630
15631 if (BYTES_BIG_ENDIAN)
15632 base = high ? low_base : high_base;
15633 else
15634 base = high ? high_base : low_base;
15635
15636 for (i = 0; i < nunits / 2; i++)
15637 RTVEC_ELT (v, i) = GEN_INT (base + i);
15638
15639 t1 = gen_rtx_PARALLEL (mode, v);
15640 return t1;
15641 }
15642
15643 /* Check OP for validity as a PARALLEL RTX vector with elements
15644 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15645 from the perspective of the architecture. See the diagram above
15646 aarch64_simd_vect_par_cnst_half for more details. */
15647
15648 bool
15649 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15650 bool high)
15651 {
15652 int nelts;
15653 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15654 return false;
15655
15656 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15657 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15658 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15659 int i = 0;
15660
15661 if (count_op != count_ideal)
15662 return false;
15663
15664 for (i = 0; i < count_ideal; i++)
15665 {
15666 rtx elt_op = XVECEXP (op, 0, i);
15667 rtx elt_ideal = XVECEXP (ideal, 0, i);
15668
15669 if (!CONST_INT_P (elt_op)
15670 || INTVAL (elt_ideal) != INTVAL (elt_op))
15671 return false;
15672 }
15673 return true;
15674 }
15675
15676 /* Return a PARALLEL containing NELTS elements, with element I equal
15677 to BASE + I * STEP. */
15678
15679 rtx
15680 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15681 {
15682 rtvec vec = rtvec_alloc (nelts);
15683 for (unsigned int i = 0; i < nelts; ++i)
15684 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15685 return gen_rtx_PARALLEL (VOIDmode, vec);
15686 }
15687
15688 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15689 series with step STEP. */
15690
15691 bool
15692 aarch64_stepped_int_parallel_p (rtx op, int step)
15693 {
15694 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15695 return false;
15696
15697 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15698 for (int i = 1; i < XVECLEN (op, 0); ++i)
15699 if (!CONST_INT_P (XVECEXP (op, 0, i))
15700 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15701 return false;
15702
15703 return true;
15704 }
15705
15706 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15707 HIGH (exclusive). */
15708 void
15709 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15710 const_tree exp)
15711 {
15712 HOST_WIDE_INT lane;
15713 gcc_assert (CONST_INT_P (operand));
15714 lane = INTVAL (operand);
15715
15716 if (lane < low || lane >= high)
15717 {
15718 if (exp)
15719 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15720 else
15721 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15722 }
15723 }
15724
15725 /* Perform endian correction on lane number N, which indexes a vector
15726 of mode MODE, and return the result as an SImode rtx. */
15727
15728 rtx
15729 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15730 {
15731 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15732 }
15733
15734 /* Return TRUE if OP is a MEM operand with a valid vector addressing mode (plain register or post-increment). */
15735
15736 bool
15737 aarch64_simd_mem_operand_p (rtx op)
15738 {
15739 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15740 || REG_P (XEXP (op, 0)));
15741 }
15742
15743 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15744
15745 bool
15746 aarch64_sve_ld1r_operand_p (rtx op)
15747 {
15748 struct aarch64_address_info addr;
15749 scalar_mode mode;
15750
15751 return (MEM_P (op)
15752 && is_a <scalar_mode> (GET_MODE (op), &mode)
15753 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15754 && addr.type == ADDRESS_REG_IMM
15755 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15756 }
15757
15758 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15759 bool
15760 aarch64_sve_ld1rq_operand_p (rtx op)
15761 {
15762 struct aarch64_address_info addr;
15763 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15764 if (!MEM_P (op)
15765 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15766 return false;
15767
15768 if (addr.type == ADDRESS_REG_IMM)
15769 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15770
15771 if (addr.type == ADDRESS_REG_REG)
15772 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15773
15774 return false;
15775 }
15776
15777 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15778 The conditions for STR are the same. */
15779 bool
15780 aarch64_sve_ldr_operand_p (rtx op)
15781 {
15782 struct aarch64_address_info addr;
15783
15784 return (MEM_P (op)
15785 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15786 false, ADDR_QUERY_ANY)
15787 && addr.type == ADDRESS_REG_IMM);
15788 }
15789
15790 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15791 We need to be able to access the individual pieces, so the range
15792 is different from LD[234] and ST[234]. */
15793 bool
15794 aarch64_sve_struct_memory_operand_p (rtx op)
15795 {
15796 if (!MEM_P (op))
15797 return false;
15798
15799 machine_mode mode = GET_MODE (op);
15800 struct aarch64_address_info addr;
15801 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15802 ADDR_QUERY_ANY)
15803 || addr.type != ADDRESS_REG_IMM)
15804 return false;
15805
15806 poly_int64 first = addr.const_offset;
15807 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15808 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15809 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15810 }
15811
15812 /* Emit a register copy from operand to operand, taking care not to
15813 early-clobber source registers in the process.
15814
15815 COUNT is the number of components into which the copy needs to be
15816 decomposed. */
15817 void
15818 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15819 unsigned int count)
15820 {
15821 unsigned int i;
15822 int rdest = REGNO (operands[0]);
15823 int rsrc = REGNO (operands[1]);
15824
15825 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15826 || rdest < rsrc)
15827 for (i = 0; i < count; i++)
15828 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15829 gen_rtx_REG (mode, rsrc + i));
15830 else
15831 for (i = 0; i < count; i++)
15832 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15833 gen_rtx_REG (mode, rsrc + count - i - 1));
15834 }
15835
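/* For example (illustrative): copying the two-vector value in {v1, v2}
   to {v2, v3} overlaps and has rdest > rsrc, so the loop above copies
   backwards (v3 <- v2, then v2 <- v1) and the source v2 is read before
   it is overwritten.  */
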
15836 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15837 one of the VSTRUCT modes: OI, CI, or XI. */
15838 int
15839 aarch64_simd_attr_length_rglist (machine_mode mode)
15840 {
15841 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15842 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15843 }
15844
15845 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15846 alignment of a vector to 128 bits. SVE predicates have an alignment of
15847 16 bits. */
15848 static HOST_WIDE_INT
15849 aarch64_simd_vector_alignment (const_tree type)
15850 {
15851 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15852 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15853 be set for non-predicate vectors of booleans. Modes are the most
15854 direct way we have of identifying real SVE predicate types. */
15855 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15856 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15857 }
15858
15859 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15860 static poly_uint64
15861 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15862 {
15863 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15864 {
15865 /* If the length of the vector is fixed, try to align to that length,
15866 otherwise don't try to align at all. */
15867 HOST_WIDE_INT result;
15868 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15869 result = TYPE_ALIGN (TREE_TYPE (type));
15870 return result;
15871 }
15872 return TYPE_ALIGN (type);
15873 }
15874
15875 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15876 static bool
15877 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15878 {
15879 if (is_packed)
15880 return false;
15881
15882 /* For fixed-length vectors, check that the vectorizer will aim for
15883 full-vector alignment. This isn't true for generic GCC vectors
15884 that are wider than the ABI maximum of 128 bits. */
15885 poly_uint64 preferred_alignment =
15886 aarch64_vectorize_preferred_vector_alignment (type);
15887 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15888 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15889 preferred_alignment))
15890 return false;
15891
15892 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15893 return true;
15894 }
15895
15896 /* Return true if the vector misalignment factor is supported by the
15897 target. */
15898 static bool
15899 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15900 const_tree type, int misalignment,
15901 bool is_packed)
15902 {
15903 if (TARGET_SIMD && STRICT_ALIGNMENT)
15904 {
15905 /* Return false if the movmisalign pattern is not supported for this mode. */
15906 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15907 return false;
15908
15909 /* Misalignment factor is unknown at compile time. */
15910 if (misalignment == -1)
15911 return false;
15912 }
15913 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15914 is_packed);
15915 }
15916
15917 /* If VALS is a vector constant that can be loaded into a register
15918 using DUP, generate instructions to do so and return an RTX to
15919 assign to the register. Otherwise return NULL_RTX. */
15920 static rtx
15921 aarch64_simd_dup_constant (rtx vals)
15922 {
15923 machine_mode mode = GET_MODE (vals);
15924 machine_mode inner_mode = GET_MODE_INNER (mode);
15925 rtx x;
15926
15927 if (!const_vec_duplicate_p (vals, &x))
15928 return NULL_RTX;
15929
15930 /* We can load this constant by using DUP and a constant in a
15931 single ARM register. This will be cheaper than a vector
15932 load. */
15933 x = copy_to_mode_reg (inner_mode, x);
15934 return gen_vec_duplicate (mode, x);
15935 }
15936
15937
15938 /* Generate code to load VALS, which is a PARALLEL containing only
15939 constants (for vec_init) or CONST_VECTOR, efficiently into a
15940 register. Returns an RTX to copy into the register, or NULL_RTX
15941 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15942 static rtx
15943 aarch64_simd_make_constant (rtx vals)
15944 {
15945 machine_mode mode = GET_MODE (vals);
15946 rtx const_dup;
15947 rtx const_vec = NULL_RTX;
15948 int n_const = 0;
15949 int i;
15950
15951 if (GET_CODE (vals) == CONST_VECTOR)
15952 const_vec = vals;
15953 else if (GET_CODE (vals) == PARALLEL)
15954 {
15955 /* A CONST_VECTOR must contain only CONST_INTs and
15956 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15957 Only store valid constants in a CONST_VECTOR. */
15958 int n_elts = XVECLEN (vals, 0);
15959 for (i = 0; i < n_elts; ++i)
15960 {
15961 rtx x = XVECEXP (vals, 0, i);
15962 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15963 n_const++;
15964 }
15965 if (n_const == n_elts)
15966 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15967 }
15968 else
15969 gcc_unreachable ();
15970
15971 if (const_vec != NULL_RTX
15972 && aarch64_simd_valid_immediate (const_vec, NULL))
15973 /* Load using MOVI/MVNI. */
15974 return const_vec;
15975 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15976 /* Loaded using DUP. */
15977 return const_dup;
15978 else if (const_vec != NULL_RTX)
15979 /* Load from constant pool. We cannot take advantage of single-cycle
15980 LD1 because we need a PC-relative addressing mode. */
15981 return const_vec;
15982 else
15983 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15984 We cannot construct an initializer. */
15985 return NULL_RTX;
15986 }
15987
15988 /* Expand a vector initialisation sequence, such that TARGET is
15989 initialised to contain VALS. */
15990
15991 void
15992 aarch64_expand_vector_init (rtx target, rtx vals)
15993 {
15994 machine_mode mode = GET_MODE (target);
15995 scalar_mode inner_mode = GET_MODE_INNER (mode);
15996 /* The number of vector elements. */
15997 int n_elts = XVECLEN (vals, 0);
15998 /* The number of vector elements which are not constant. */
15999 int n_var = 0;
16000 rtx any_const = NULL_RTX;
16001 /* The first element of vals. */
16002 rtx v0 = XVECEXP (vals, 0, 0);
16003 bool all_same = true;
16004
16005 /* This is a special vec_init<M><N> where N is not an element mode but a
16006 vector mode with half the elements of M. We expect to find two entries
16007      of mode N in VALS and we must put their concatenation into TARGET.  */
16008 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16009 {
16010 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16011 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16012 rtx lo = XVECEXP (vals, 0, 0);
16013 rtx hi = XVECEXP (vals, 0, 1);
16014 machine_mode narrow_mode = GET_MODE (lo);
16015 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16016 gcc_assert (narrow_mode == GET_MODE (hi));
16017
16018 /* When we want to concatenate a half-width vector with zeroes we can
16019 use the aarch64_combinez[_be] patterns. Just make sure that the
16020 zeroes are in the right half. */
16021 if (BYTES_BIG_ENDIAN
16022 && aarch64_simd_imm_zero (lo, narrow_mode)
16023 && general_operand (hi, narrow_mode))
16024 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16025 else if (!BYTES_BIG_ENDIAN
16026 && aarch64_simd_imm_zero (hi, narrow_mode)
16027 && general_operand (lo, narrow_mode))
16028 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16029 else
16030 {
16031 /* Else create the two half-width registers and combine them. */
16032 if (!REG_P (lo))
16033 lo = force_reg (GET_MODE (lo), lo);
16034 if (!REG_P (hi))
16035 hi = force_reg (GET_MODE (hi), hi);
16036
16037 if (BYTES_BIG_ENDIAN)
16038 std::swap (lo, hi);
16039 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16040 }
16041 return;
16042 }
16043
16044 /* Count the number of variable elements to initialise. */
16045 for (int i = 0; i < n_elts; ++i)
16046 {
16047 rtx x = XVECEXP (vals, 0, i);
16048 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16049 ++n_var;
16050 else
16051 any_const = x;
16052
16053 all_same &= rtx_equal_p (x, v0);
16054 }
16055
16056 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16057 how best to handle this. */
16058 if (n_var == 0)
16059 {
16060 rtx constant = aarch64_simd_make_constant (vals);
16061 if (constant != NULL_RTX)
16062 {
16063 emit_move_insn (target, constant);
16064 return;
16065 }
16066 }
16067
16068 /* Splat a single non-constant element if we can. */
16069 if (all_same)
16070 {
16071 rtx x = copy_to_mode_reg (inner_mode, v0);
16072 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16073 return;
16074 }
16075
16076 enum insn_code icode = optab_handler (vec_set_optab, mode);
16077 gcc_assert (icode != CODE_FOR_nothing);
16078
16079 /* If there are only variable elements, try to optimize
16080 the insertion using dup for the most common element
16081 followed by insertions. */
16082
16083 /* The algorithm will fill matches[*][0] with the earliest matching element,
16084 and matches[X][1] with the count of duplicate elements (if X is the
16085 earliest element which has duplicates). */
16086
16087 if (n_var == n_elts && n_elts <= 16)
16088 {
16089 int matches[16][2] = {0};
16090 for (int i = 0; i < n_elts; i++)
16091 {
16092 for (int j = 0; j <= i; j++)
16093 {
16094 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16095 {
16096 matches[i][0] = j;
16097 matches[j][1]++;
16098 break;
16099 }
16100 }
16101 }
16102 int maxelement = 0;
16103 int maxv = 0;
16104 for (int i = 0; i < n_elts; i++)
16105 if (matches[i][1] > maxv)
16106 {
16107 maxelement = i;
16108 maxv = matches[i][1];
16109 }
16110
16111 /* Create a duplicate of the most common element, unless all elements
16112 are equally useless to us, in which case just immediately set the
16113 vector register using the first element. */
16114
16115 if (maxv == 1)
16116 {
16117 /* For vectors of two 64-bit elements, we can do even better. */
16118 if (n_elts == 2
16119 && (inner_mode == E_DImode
16120 || inner_mode == E_DFmode))
16121
16122 {
16123 rtx x0 = XVECEXP (vals, 0, 0);
16124 rtx x1 = XVECEXP (vals, 0, 1);
16125 /* Combine can pick up this case, but handling it directly
16126 here leaves clearer RTL.
16127
16128 This is load_pair_lanes<mode>, and also gives us a clean-up
16129 for store_pair_lanes<mode>. */
16130 if (memory_operand (x0, inner_mode)
16131 && memory_operand (x1, inner_mode)
16132 && !STRICT_ALIGNMENT
16133 && rtx_equal_p (XEXP (x1, 0),
16134 plus_constant (Pmode,
16135 XEXP (x0, 0),
16136 GET_MODE_SIZE (inner_mode))))
16137 {
16138 rtx t;
16139 if (inner_mode == DFmode)
16140 t = gen_load_pair_lanesdf (target, x0, x1);
16141 else
16142 t = gen_load_pair_lanesdi (target, x0, x1);
16143 emit_insn (t);
16144 return;
16145 }
16146 }
16147 /* The subreg-move sequence below will move into lane zero of the
16148 vector register. For big-endian we want that position to hold
16149 the last element of VALS. */
16150 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16151 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16152 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16153 }
16154 else
16155 {
16156 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16157 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16158 }
16159
16160 /* Insert the rest. */
16161 for (int i = 0; i < n_elts; i++)
16162 {
16163 rtx x = XVECEXP (vals, 0, i);
16164 if (matches[i][0] == maxelement)
16165 continue;
16166 x = copy_to_mode_reg (inner_mode, x);
16167 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16168 }
16169 return;
16170 }
16171
16172 /* Initialise a vector which is part-variable. We want to first try
16173 to build those lanes which are constant in the most efficient way we
16174 can. */
16175 if (n_var != n_elts)
16176 {
16177 rtx copy = copy_rtx (vals);
16178
16179 /* Load constant part of vector. We really don't care what goes into the
16180 parts we will overwrite, but we're more likely to be able to load the
16181 constant efficiently if it has fewer, larger, repeating parts
16182 (see aarch64_simd_valid_immediate). */
16183 for (int i = 0; i < n_elts; i++)
16184 {
16185 rtx x = XVECEXP (vals, 0, i);
16186 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16187 continue;
16188 rtx subst = any_const;
16189 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16190 {
16191 /* Look in the copied vector, as more elements are const. */
16192 rtx test = XVECEXP (copy, 0, i ^ bit);
16193 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16194 {
16195 subst = test;
16196 break;
16197 }
16198 }
16199 XVECEXP (copy, 0, i) = subst;
16200 }
16201 aarch64_expand_vector_init (target, copy);
16202 }
16203
16204 /* Insert the variable lanes directly. */
16205 for (int i = 0; i < n_elts; i++)
16206 {
16207 rtx x = XVECEXP (vals, 0, i);
16208 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16209 continue;
16210 x = copy_to_mode_reg (inner_mode, x);
16211 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16212 }
16213 }
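/* As a rough illustration of the strategy above (register numbers are purely
   for exposition), initialising a V4SI vector { x, y, x, x } from general
   registers is expanded along the lines of:

	dup	v0.4s, w0		// splat the most common element, x
	ins	v0.s[1], w1		// then insert the odd one out, y

   while a partly constant vector such as { x, 1, 2, 3 } first loads the
   constant vector { 2, 1, 2, 3 } (lane 0 is padded with a neighbouring
   constant so that the constant load stays cheap) and then overwrites
   lane 0 with x using INS.  */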
16214
16215 /* Emit RTL corresponding to:
16216 insr TARGET, ELEM. */
16217
16218 static void
16219 emit_insr (rtx target, rtx elem)
16220 {
16221 machine_mode mode = GET_MODE (target);
16222 scalar_mode elem_mode = GET_MODE_INNER (mode);
16223 elem = force_reg (elem_mode, elem);
16224
16225 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16226 gcc_assert (icode != CODE_FOR_nothing);
16227 emit_insn (GEN_FCN (icode) (target, target, elem));
16228 }
16229
16230 /* Subroutine of aarch64_sve_expand_vector_init for handling
16231 trailing constants.
16232 This function works as follows:
16233 (a) Create a new vector consisting of trailing constants.
16234 (b) Initialize TARGET with the constant vector using emit_move_insn.
16235 (c) Insert remaining elements in TARGET using insr.
16236    NELTS is the total number of elements in the original vector, while
16237    NELTS_REQD is the number of elements that are actually
16238 significant.
16239
16240    ??? The heuristic used is to do the above only if the number of constants
16241    is at least half the total number of elements.  May need fine tuning.  */
16242
16243 static bool
16244 aarch64_sve_expand_vector_init_handle_trailing_constants
16245 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16246 {
16247 machine_mode mode = GET_MODE (target);
16248 scalar_mode elem_mode = GET_MODE_INNER (mode);
16249 int n_trailing_constants = 0;
16250
16251 for (int i = nelts_reqd - 1;
16252 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16253 i--)
16254 n_trailing_constants++;
16255
16256 if (n_trailing_constants >= nelts_reqd / 2)
16257 {
16258 rtx_vector_builder v (mode, 1, nelts);
16259 for (int i = 0; i < nelts; i++)
16260 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16261 rtx const_vec = v.build ();
16262 emit_move_insn (target, const_vec);
16263
16264 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16265 emit_insr (target, builder.elt (i));
16266
16267 return true;
16268 }
16269
16270 return false;
16271 }
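/* Worked example of the heuristic above (purely illustrative): for
   BUILDER = { a, b, 1, 2, 3, 4, 5, 6 } with NELTS == NELTS_REQD == 8,
   six of the eight significant elements are trailing constants, so we:

     (1) move a constant vector whose first six lanes are
	 { 1, 2, 3, 4, 5, 6 } into TARGET (the remaining lanes are
	 don't-care), then
     (2) emit "insr TARGET, b" followed by "insr TARGET, a",

   leaving the significant lanes of TARGET equal to
   { a, b, 1, 2, 3, 4, 5, 6 }.  */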
16272
16273 /* Subroutine of aarch64_sve_expand_vector_init.
16274 Works as follows:
16275 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16276 (b) Skip trailing elements from BUILDER, which are the same as
16277 element NELTS_REQD - 1.
16278 (c) Insert earlier elements in reverse order in TARGET using insr. */
16279
16280 static void
16281 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16282 const rtx_vector_builder &builder,
16283 int nelts_reqd)
16284 {
16285 machine_mode mode = GET_MODE (target);
16286 scalar_mode elem_mode = GET_MODE_INNER (mode);
16287
16288 struct expand_operand ops[2];
16289 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16290 gcc_assert (icode != CODE_FOR_nothing);
16291
16292 create_output_operand (&ops[0], target, mode);
16293 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16294 expand_insn (icode, 2, ops);
16295
16296 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16297 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16298 emit_insr (target, builder.elt (i));
16299 }
16300
16301 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16302    when all trailing elements of BUILDER are the same.
16303 This works as follows:
16304 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16305 (b) Insert remaining elements in TARGET using insr.
16306
16307    ??? The heuristic used is to do the above if the number of identical
16308    trailing elements is at least 3/4 of the total number of elements,
16309    loosely based on the heuristic in mostly_zeros_p.  May need fine-tuning.  */
16310
16311 static bool
16312 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16313 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16314 {
16315 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16316 if (ndups >= (3 * nelts_reqd) / 4)
16317 {
16318 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16319 nelts_reqd - ndups + 1);
16320 return true;
16321 }
16322
16323 return false;
16324 }
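/* Worked example (purely illustrative): for
   BUILDER = { a, b, c, c, c, c, c, c } with NELTS_REQD == 8, the six
   trailing copies of c meet the 3/4 threshold, so we broadcast c with DUP
   and then emit "insr TARGET, b" followed by "insr TARGET, a", giving
   { a, b, c, c, c, c, c, c } in the significant lanes.  */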
16325
16326 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16327 of elements in BUILDER.
16328
16329 The function tries to initialize TARGET from BUILDER if it fits one
16330 of the special cases outlined below.
16331
16332 Failing that, the function divides BUILDER into two sub-vectors:
16333 v_even = even elements of BUILDER;
16334 v_odd = odd elements of BUILDER;
16335
16336 and recursively calls itself with v_even and v_odd.
16337
16338 if (recursive call succeeded for v_even or v_odd)
16339 TARGET = zip (v_even, v_odd)
16340
16341 The function returns true if it managed to build TARGET from BUILDER
16342 with one of the special cases, false otherwise.
16343
16344 Example: {a, 1, b, 2, c, 3, d, 4}
16345
16346 The vector gets divided into:
16347 v_even = {a, b, c, d}
16348 v_odd = {1, 2, 3, 4}
16349
16350    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16351    initializes tmp2 from the constant vector v_odd using emit_move_insn.
16352
16353    aarch64_sve_expand_vector_init(v_even) fails since all 4 elements
16354    of v_even are variable, so we construct tmp1 from v_even using insr:
16355 tmp1 = dup(d)
16356 insr tmp1, c
16357 insr tmp1, b
16358 insr tmp1, a
16359
16360 And finally:
16361 TARGET = zip (tmp1, tmp2)
16362 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16363
16364 static bool
16365 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16366 int nelts, int nelts_reqd)
16367 {
16368 machine_mode mode = GET_MODE (target);
16369
16370 /* Case 1: Vector contains trailing constants. */
16371
16372 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16373 (target, builder, nelts, nelts_reqd))
16374 return true;
16375
16376 /* Case 2: Vector contains leading constants. */
16377
16378 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16379 for (int i = 0; i < nelts_reqd; i++)
16380 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16381 rev_builder.finalize ();
16382
16383 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16384 (target, rev_builder, nelts, nelts_reqd))
16385 {
16386 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16387 return true;
16388 }
16389
16390 /* Case 3: Vector contains trailing same element. */
16391
16392 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16393 (target, builder, nelts_reqd))
16394 return true;
16395
16396 /* Case 4: Vector contains leading same element. */
16397
16398   if (nelts_reqd == nelts
16399       && aarch64_sve_expand_vector_init_handle_trailing_same_elem
16399 	 (target, rev_builder, nelts_reqd))
16400 {
16401 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16402 return true;
16403 }
16404
16405   /* Avoid recursing below 4 elements.
16406 ??? The threshold 4 may need fine-tuning. */
16407
16408 if (nelts_reqd <= 4)
16409 return false;
16410
16411 rtx_vector_builder v_even (mode, 1, nelts);
16412 rtx_vector_builder v_odd (mode, 1, nelts);
16413
16414 for (int i = 0; i < nelts * 2; i += 2)
16415 {
16416 v_even.quick_push (builder.elt (i));
16417 v_odd.quick_push (builder.elt (i + 1));
16418 }
16419
16420 v_even.finalize ();
16421 v_odd.finalize ();
16422
16423 rtx tmp1 = gen_reg_rtx (mode);
16424 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16425 nelts, nelts_reqd / 2);
16426
16427 rtx tmp2 = gen_reg_rtx (mode);
16428 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16429 nelts, nelts_reqd / 2);
16430
16431 if (!did_even_p && !did_odd_p)
16432 return false;
16433
16434   /* Use INSR to initialize whichever of v_even and v_odd did not match
16435      one of the special cases above, then zip the two halves together.  */
16436
16437 if (!did_even_p)
16438 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16439
16440 if (!did_odd_p)
16441 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16442
16443 rtvec v = gen_rtvec (2, tmp1, tmp2);
16444 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16445 return true;
16446 }
16447
16448 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16449
16450 void
16451 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16452 {
16453 machine_mode mode = GET_MODE (target);
16454 int nelts = XVECLEN (vals, 0);
16455
16456 rtx_vector_builder v (mode, 1, nelts);
16457 for (int i = 0; i < nelts; i++)
16458 v.quick_push (XVECEXP (vals, 0, i));
16459 v.finalize ();
16460
16461   /* If neither sub-vector of v could be initialized specially,
16462      then use INSR to insert all elements from v into TARGET.
16463      ??? This might not be optimal for vectors with large
16464      initializers, such as 16 elements or more.
16465      For nelts < 4, it probably isn't useful to handle specially.  */
16466
16467 if (nelts < 4
16468 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16469 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16470 }
16471
16472 static unsigned HOST_WIDE_INT
16473 aarch64_shift_truncation_mask (machine_mode mode)
16474 {
16475 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16476 return 0;
16477 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16478 }
16479
16480 /* Select a format to encode pointers in exception handling data. */
16481 int
16482 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16483 {
16484 int type;
16485 switch (aarch64_cmodel)
16486 {
16487 case AARCH64_CMODEL_TINY:
16488 case AARCH64_CMODEL_TINY_PIC:
16489 case AARCH64_CMODEL_SMALL:
16490 case AARCH64_CMODEL_SMALL_PIC:
16491 case AARCH64_CMODEL_SMALL_SPIC:
16492 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16493 for everything. */
16494 type = DW_EH_PE_sdata4;
16495 break;
16496 default:
16497 /* No assumptions here. 8-byte relocs required. */
16498 type = DW_EH_PE_sdata8;
16499 break;
16500 }
16501 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16502 }
16503
16504 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16505
16506 static void
16507 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16508 {
16509 if (aarch64_simd_decl_p (decl))
16510 {
16511 fprintf (stream, "\t.variant_pcs\t");
16512 assemble_name (stream, name);
16513 fprintf (stream, "\n");
16514 }
16515 }
16516
16517 /* The last .arch and .tune assembly strings that we printed. */
16518 static std::string aarch64_last_printed_arch_string;
16519 static std::string aarch64_last_printed_tune_string;
16520
16521 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16522 by the function fndecl. */
16523
16524 void
16525 aarch64_declare_function_name (FILE *stream, const char* name,
16526 tree fndecl)
16527 {
16528 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16529
16530 struct cl_target_option *targ_options;
16531 if (target_parts)
16532 targ_options = TREE_TARGET_OPTION (target_parts);
16533 else
16534 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16535 gcc_assert (targ_options);
16536
16537 const struct processor *this_arch
16538 = aarch64_get_arch (targ_options->x_explicit_arch);
16539
16540 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16541 std::string extension
16542 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16543 this_arch->flags);
16544 /* Only update the assembler .arch string if it is distinct from the last
16545 such string we printed. */
16546 std::string to_print = this_arch->name + extension;
16547 if (to_print != aarch64_last_printed_arch_string)
16548 {
16549 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16550 aarch64_last_printed_arch_string = to_print;
16551 }
16552
16553   /* Print the cpu name we're tuning for in the comments; it might be
16554      useful to readers of the generated asm.  Do it only when it changes
16555      from function to function and verbose assembly is requested.  */
16556 const struct processor *this_tune
16557 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16558
16559 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16560 {
16561 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16562 this_tune->name);
16563 aarch64_last_printed_tune_string = this_tune->name;
16564 }
16565
16566 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16567
16568 /* Don't forget the type directive for ELF. */
16569 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16570 ASM_OUTPUT_LABEL (stream, name);
16571 }
16572
16573 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16574
16575 void
16576 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16577 {
16578 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16579 const char *value = IDENTIFIER_POINTER (target);
16580 aarch64_asm_output_variant_pcs (stream, decl, name);
16581 ASM_OUTPUT_DEF (stream, name, value);
16582 }
16583
16584 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16585 function symbol references. */
16586
16587 void
16588 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16589 {
16590 default_elf_asm_output_external (stream, decl, name);
16591 aarch64_asm_output_variant_pcs (stream, decl, name);
16592 }
16593
16594 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16595 Used to output the .cfi_b_key_frame directive when signing the current
16596 function with the B key. */
16597
16598 void
16599 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16600 {
16601 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16602 && aarch64_ra_sign_key == AARCH64_KEY_B)
16603 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16604 }
16605
16606 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16607
16608 static void
16609 aarch64_start_file (void)
16610 {
16611 struct cl_target_option *default_options
16612 = TREE_TARGET_OPTION (target_option_default_node);
16613
16614 const struct processor *default_arch
16615 = aarch64_get_arch (default_options->x_explicit_arch);
16616 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16617 std::string extension
16618 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16619 default_arch->flags);
16620
16621 aarch64_last_printed_arch_string = default_arch->name + extension;
16622 aarch64_last_printed_tune_string = "";
16623 asm_fprintf (asm_out_file, "\t.arch %s\n",
16624 aarch64_last_printed_arch_string.c_str ());
16625
16626 default_file_start ();
16627 }
16628
16629 /* Emit load exclusive. */
16630
16631 static void
16632 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16633 rtx mem, rtx model_rtx)
16634 {
16635 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16636 }
16637
16638 /* Emit store exclusive. */
16639
16640 static void
16641 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16642 rtx rval, rtx mem, rtx model_rtx)
16643 {
16644 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
16645 }
16646
16647 /* Emit INSN as a jump and mark it as unlikely to be taken.  */
16648
16649 static void
16650 aarch64_emit_unlikely_jump (rtx insn)
16651 {
16652 rtx_insn *jump = emit_jump_insn (insn);
16653 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16654 }
16655
16656 /* Expand a compare and swap pattern. */
16657
16658 void
16659 aarch64_expand_compare_and_swap (rtx operands[])
16660 {
16661 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16662 machine_mode mode, r_mode;
16663
16664 bval = operands[0];
16665 rval = operands[1];
16666 mem = operands[2];
16667 oldval = operands[3];
16668 newval = operands[4];
16669 is_weak = operands[5];
16670 mod_s = operands[6];
16671 mod_f = operands[7];
16672 mode = GET_MODE (mem);
16673
16674 /* Normally the succ memory model must be stronger than fail, but in the
16675 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16676 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16677 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16678 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16679 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16680
16681 r_mode = mode;
16682 if (mode == QImode || mode == HImode)
16683 {
16684 r_mode = SImode;
16685 rval = gen_reg_rtx (r_mode);
16686 }
16687
16688 if (TARGET_LSE)
16689 {
16690 /* The CAS insn requires oldval and rval overlap, but we need to
16691 have a copy of oldval saved across the operation to tell if
16692 the operation is successful. */
16693 if (reg_overlap_mentioned_p (rval, oldval))
16694 rval = copy_to_mode_reg (r_mode, oldval);
16695 else
16696 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16697
16698 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16699 newval, mod_s));
16700 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16701 }
16702 else
16703 {
16704 /* The oldval predicate varies by mode. Test it and force to reg. */
16705 insn_code code = code_for_aarch64_compare_and_swap (mode);
16706 if (!insn_data[code].operand[2].predicate (oldval, mode))
16707 oldval = force_reg (mode, oldval);
16708
16709 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16710 is_weak, mod_s, mod_f));
16711 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16712 }
16713
16714 if (r_mode != mode)
16715 rval = gen_lowpart (mode, rval);
16716 emit_move_insn (operands[1], rval);
16717
16718 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16719 emit_insn (gen_rtx_SET (bval, x));
16720 }
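/* The ACQUIRE/RELEASE promotion above matters for user code such as the
   following C11 fragment (shown only as an illustration of where that
   combination of memory models can come from):

     #include <stdatomic.h>

     _Bool
     try_update (atomic_int *p, int *expected, int desired)
     {
       return atomic_compare_exchange_strong_explicit
	 (p, expected, desired, memory_order_release, memory_order_acquire);
     }

   Here the success order is RELEASE and the failure order is ACQUIRE, so the
   success order used for the generated sequence is promoted to ACQ_REL to
   preserve the acquire semantics of the failure path.  */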
16721
16722 /* Emit a barrier appropriate for memory model MODEL at the end of a
16723    sequence implementing an atomic operation.  */
16724
16725 static void
16726 aarch64_emit_post_barrier (enum memmodel model)
16727 {
16728 const enum memmodel base_model = memmodel_base (model);
16729
16730 if (is_mm_sync (model)
16731 && (base_model == MEMMODEL_ACQUIRE
16732 || base_model == MEMMODEL_ACQ_REL
16733 || base_model == MEMMODEL_SEQ_CST))
16734 {
16735 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16736 }
16737 }
16738
16739 /* Split a compare and swap pattern. */
16740
16741 void
16742 aarch64_split_compare_and_swap (rtx operands[])
16743 {
16744 rtx rval, mem, oldval, newval, scratch;
16745 machine_mode mode;
16746 bool is_weak;
16747 rtx_code_label *label1, *label2;
16748 rtx x, cond;
16749 enum memmodel model;
16750 rtx model_rtx;
16751
16752 rval = operands[0];
16753 mem = operands[1];
16754 oldval = operands[2];
16755 newval = operands[3];
16756 is_weak = (operands[4] != const0_rtx);
16757 model_rtx = operands[5];
16758 scratch = operands[7];
16759 mode = GET_MODE (mem);
16760 model = memmodel_from_int (INTVAL (model_rtx));
16761
16762 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16763 loop:
16764 .label1:
16765 LD[A]XR rval, [mem]
16766 CBNZ rval, .label2
16767 ST[L]XR scratch, newval, [mem]
16768 CBNZ scratch, .label1
16769 .label2:
16770 CMP rval, 0. */
16771 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16772
16773 label1 = NULL;
16774 if (!is_weak)
16775 {
16776 label1 = gen_label_rtx ();
16777 emit_label (label1);
16778 }
16779 label2 = gen_label_rtx ();
16780
16781 /* The initial load can be relaxed for a __sync operation since a final
16782 barrier will be emitted to stop code hoisting. */
16783 if (is_mm_sync (model))
16784 aarch64_emit_load_exclusive (mode, rval, mem,
16785 GEN_INT (MEMMODEL_RELAXED));
16786 else
16787 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16788
16789 if (strong_zero_p)
16790 {
16791 if (aarch64_track_speculation)
16792 {
16793 /* Emit an explicit compare instruction, so that we can correctly
16794 track the condition codes. */
16795 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16796 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16797 }
16798 else
16799 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16800
16801 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16802 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16803 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16804 }
16805 else
16806 {
16807 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16808 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16809 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16810 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16811 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16812 }
16813
16814 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16815
16816 if (!is_weak)
16817 {
16818 if (aarch64_track_speculation)
16819 {
16820 /* Emit an explicit compare instruction, so that we can correctly
16821 track the condition codes. */
16822 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16823 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16824 }
16825 else
16826 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16827
16828 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16829 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16830 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16831 }
16832 else
16833 {
16834 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16835 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16836 emit_insn (gen_rtx_SET (cond, x));
16837 }
16838
16839 emit_label (label2);
16840   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16841      to set the condition flags.  If this is not used it will be removed by
16842      later passes.  */
16843 if (strong_zero_p)
16844 {
16845 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16846 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16847 emit_insn (gen_rtx_SET (cond, x));
16848 }
16849 /* Emit any final barrier needed for a __sync operation. */
16850 if (is_mm_sync (model))
16851 aarch64_emit_post_barrier (model);
16852 }
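/* For reference, when OLDVAL is not known to be zero the strong form of the
   loop built above looks roughly like (illustrative only):
	.label1:
	LD[A]XR	rval, [mem]
	CMP	rval, oldval
	B.NE	.label2
	ST[L]XR	scratch, newval, [mem]
	CBNZ	scratch, .label1
	.label2:
   with the result of the comparison left in the condition flags on exit.  */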
16853
16854 /* Split an atomic operation. */
16855
16856 void
16857 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16858 rtx value, rtx model_rtx, rtx cond)
16859 {
16860 machine_mode mode = GET_MODE (mem);
16861 machine_mode wmode = (mode == DImode ? DImode : SImode);
16862 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16863 const bool is_sync = is_mm_sync (model);
16864 rtx_code_label *label;
16865 rtx x;
16866
16867 /* Split the atomic operation into a sequence. */
16868 label = gen_label_rtx ();
16869 emit_label (label);
16870
16871 if (new_out)
16872 new_out = gen_lowpart (wmode, new_out);
16873 if (old_out)
16874 old_out = gen_lowpart (wmode, old_out);
16875 else
16876 old_out = new_out;
16877 value = simplify_gen_subreg (wmode, value, mode, 0);
16878
16879 /* The initial load can be relaxed for a __sync operation since a final
16880 barrier will be emitted to stop code hoisting. */
16881 if (is_sync)
16882 aarch64_emit_load_exclusive (mode, old_out, mem,
16883 GEN_INT (MEMMODEL_RELAXED));
16884 else
16885 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16886
16887 switch (code)
16888 {
16889 case SET:
16890 new_out = value;
16891 break;
16892
16893 case NOT:
16894 x = gen_rtx_AND (wmode, old_out, value);
16895 emit_insn (gen_rtx_SET (new_out, x));
16896 x = gen_rtx_NOT (wmode, new_out);
16897 emit_insn (gen_rtx_SET (new_out, x));
16898 break;
16899
16900 case MINUS:
16901 if (CONST_INT_P (value))
16902 {
16903 value = GEN_INT (-INTVAL (value));
16904 code = PLUS;
16905 }
16906 /* Fall through. */
16907
16908 default:
16909 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16910 emit_insn (gen_rtx_SET (new_out, x));
16911 break;
16912 }
16913
16914 aarch64_emit_store_exclusive (mode, cond, mem,
16915 gen_lowpart (mode, new_out), model_rtx);
16916
16917 if (aarch64_track_speculation)
16918 {
16919 /* Emit an explicit compare instruction, so that we can correctly
16920 track the condition codes. */
16921 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16922 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16923 }
16924 else
16925 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16926
16927 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16928 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16929 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16930
16931 /* Emit any final barrier needed for a __sync operation. */
16932 if (is_sync)
16933 aarch64_emit_post_barrier (model);
16934 }
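/* For example, splitting an atomic fetch-and-add on an SImode location
   yields a loop of roughly this shape (register choice is illustrative):
	.loop:
	LD[A]XR	w0, [x1]
	ADD	w2, w0, w3
	ST[L]XR	w4, w2, [x1]
	CBNZ	w4, .loop
   with acquire/release variants of the exclusive accesses selected according
   to the memory model, and a trailing full barrier added only for
   __sync-style operations.  */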
16935
16936 static void
16937 aarch64_init_libfuncs (void)
16938 {
16939 /* Half-precision float operations. The compiler handles all operations
16940 with NULL libfuncs by converting to SFmode. */
16941
16942 /* Conversions. */
16943 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16944 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16945
16946 /* Arithmetic. */
16947 set_optab_libfunc (add_optab, HFmode, NULL);
16948 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16949 set_optab_libfunc (smul_optab, HFmode, NULL);
16950 set_optab_libfunc (neg_optab, HFmode, NULL);
16951 set_optab_libfunc (sub_optab, HFmode, NULL);
16952
16953 /* Comparisons. */
16954 set_optab_libfunc (eq_optab, HFmode, NULL);
16955 set_optab_libfunc (ne_optab, HFmode, NULL);
16956 set_optab_libfunc (lt_optab, HFmode, NULL);
16957 set_optab_libfunc (le_optab, HFmode, NULL);
16958 set_optab_libfunc (ge_optab, HFmode, NULL);
16959 set_optab_libfunc (gt_optab, HFmode, NULL);
16960 set_optab_libfunc (unord_optab, HFmode, NULL);
16961 }
16962
16963 /* Target hook for c_mode_for_suffix. */
16964 static machine_mode
16965 aarch64_c_mode_for_suffix (char suffix)
16966 {
16967 if (suffix == 'q')
16968 return TFmode;
16969
16970 return VOIDmode;
16971 }
16972
16973 /* We can only represent floating point constants which will fit in
16974 "quarter-precision" values. These values are characterised by
16975    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16976    by:
16977
16978 (-1)^s * (n/16) * 2^r
16979
16980 Where:
16981 's' is the sign bit.
16982 'n' is an integer in the range 16 <= n <= 31.
16983 'r' is an integer in the range -3 <= r <= 4. */
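/* For example, 2.5 is representable as 20/16 * 2^1 (s = 0, n = 20, r = 1)
   and -0.40625 as -(26/16) * 2^-2, whereas 0.0625 would need r = -4 and
   33.0 would need either n = 33 or r = 5, so neither can be encoded in
   this form.  */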
16984
16985 /* Return true iff X can be represented as a quarter-precision
16986    floating point immediate operand.  Note, we cannot represent 0.0.  */
16987 bool
16988 aarch64_float_const_representable_p (rtx x)
16989 {
16990 /* This represents our current view of how many bits
16991 make up the mantissa. */
16992 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16993 int exponent;
16994 unsigned HOST_WIDE_INT mantissa, mask;
16995 REAL_VALUE_TYPE r, m;
16996 bool fail;
16997
16998 x = unwrap_const_vec_duplicate (x);
16999 if (!CONST_DOUBLE_P (x))
17000 return false;
17001
17002 if (GET_MODE (x) == VOIDmode
17003 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17004 return false;
17005
17006 r = *CONST_DOUBLE_REAL_VALUE (x);
17007
17008 /* We cannot represent infinities, NaNs or +/-zero. We won't
17009 know if we have +zero until we analyse the mantissa, but we
17010 can reject the other invalid values. */
17011 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17012 || REAL_VALUE_MINUS_ZERO (r))
17013 return false;
17014
17015 /* Extract exponent. */
17016 r = real_value_abs (&r);
17017 exponent = REAL_EXP (&r);
17018
17019 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17020 highest (sign) bit, with a fixed binary point at bit point_pos.
17021      The low half of W holds the low part of the mantissa, the high half the high part.
17022 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17023 bits for the mantissa, this can fail (low bits will be lost). */
17024 real_ldexp (&m, &r, point_pos - exponent);
17025 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17026
17027 /* If the low part of the mantissa has bits set we cannot represent
17028 the value. */
17029 if (w.ulow () != 0)
17030 return false;
17031 /* We have rejected the lower HOST_WIDE_INT, so update our
17032 understanding of how many bits lie in the mantissa and
17033 look only at the high HOST_WIDE_INT. */
17034 mantissa = w.elt (1);
17035 point_pos -= HOST_BITS_PER_WIDE_INT;
17036
17037 /* We can only represent values with a mantissa of the form 1.xxxx. */
17038 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17039 if ((mantissa & mask) != 0)
17040 return false;
17041
17042 /* Having filtered unrepresentable values, we may now remove all
17043 but the highest 5 bits. */
17044 mantissa >>= point_pos - 5;
17045
17046 /* We cannot represent the value 0.0, so reject it. This is handled
17047 elsewhere. */
17048 if (mantissa == 0)
17049 return false;
17050
17051 /* Then, as bit 4 is always set, we can mask it off, leaving
17052 the mantissa in the range [0, 15]. */
17053 mantissa &= ~(1 << 4);
17054 gcc_assert (mantissa <= 15);
17055
17056 /* GCC internally does not use IEEE754-like encoding (where normalized
17057 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
17058 Our mantissa values are shifted 4 places to the left relative to
17059 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17060 by 5 places to correct for GCC's representation. */
17061 exponent = 5 - exponent;
17062
17063 return (exponent >= 0 && exponent <= 7);
17064 }
17065
17066 /* Return the assembly string for moving CONST_VECTOR, an AdvSIMD vector
17067    constant of WIDTH bits, using a MOVI, MVNI, ORR or BIC immediate.
17068    WHICH selects whether to output the MOVI/MVNI, ORR or BIC form.  */
17069 char*
17070 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17071 enum simd_immediate_check which)
17072 {
17073 bool is_valid;
17074 static char templ[40];
17075 const char *mnemonic;
17076 const char *shift_op;
17077 unsigned int lane_count = 0;
17078 char element_char;
17079
17080 struct simd_immediate_info info;
17081
17082 /* This will return true to show const_vector is legal for use as either
17083      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17084 It will also update INFO to show how the immediate should be generated.
17085 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17086 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17087 gcc_assert (is_valid);
17088
17089 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17090 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17091
17092 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17093 {
17094 gcc_assert (info.insn == simd_immediate_info::MOV
17095 && info.u.mov.shift == 0);
17096 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17097 move immediate path. */
17098 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17099 info.u.mov.value = GEN_INT (0);
17100 else
17101 {
17102 const unsigned int buf_size = 20;
17103 char float_buf[buf_size] = {'\0'};
17104 real_to_decimal_for_mode (float_buf,
17105 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17106 buf_size, buf_size, 1, info.elt_mode);
17107
17108 if (lane_count == 1)
17109 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17110 else
17111 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17112 lane_count, element_char, float_buf);
17113 return templ;
17114 }
17115 }
17116
17117 gcc_assert (CONST_INT_P (info.u.mov.value));
17118
17119 if (which == AARCH64_CHECK_MOV)
17120 {
17121 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17122 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17123 ? "msl" : "lsl");
17124 if (lane_count == 1)
17125 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17126 mnemonic, UINTVAL (info.u.mov.value));
17127 else if (info.u.mov.shift)
17128 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17129 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17130 element_char, UINTVAL (info.u.mov.value), shift_op,
17131 info.u.mov.shift);
17132 else
17133 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17134 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17135 element_char, UINTVAL (info.u.mov.value));
17136 }
17137 else
17138 {
17139 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17140 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17141 if (info.u.mov.shift)
17142 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17143 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17144 element_char, UINTVAL (info.u.mov.value), "lsl",
17145 info.u.mov.shift);
17146 else
17147 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17148 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17149 element_char, UINTVAL (info.u.mov.value));
17150 }
17151 return templ;
17152 }
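/* As a concrete illustration, the templates returned above look like
   (immediate values chosen arbitrarily):
     "movi\t%d0, 0xff"			single 64-bit lane, no shift
     "movi\t%0.4s, 0xab, lsl 8"		shifted MOVI
     "mvni\t%0.8h, 0x2, msl 8"		inverted move using MSL
     "orr\t%0.4s, #16, lsl #8"		AARCH64_CHECK_ORR form
     "fmov\t%0.4s, 1.0e+0"		floating-point immediate
   with the operand modifiers expanded later by output_asm_insn.  */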
17153
17154 char*
17155 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17156 {
17157
17158 /* If a floating point number was passed and we desire to use it in an
17159      integer mode, do the conversion to integer.  */
17160 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17161 {
17162 unsigned HOST_WIDE_INT ival;
17163 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17164 gcc_unreachable ();
17165 immediate = gen_int_mode (ival, mode);
17166 }
17167
17168 machine_mode vmode;
17169   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
17170      a 128-bit vector mode.  */
17171 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17172
17173 vmode = aarch64_simd_container_mode (mode, width);
17174 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17175 return aarch64_output_simd_mov_immediate (v_op, width);
17176 }
17177
17178 /* Return the output string to use for moving immediate CONST_VECTOR
17179 into an SVE register. */
17180
17181 char *
17182 aarch64_output_sve_mov_immediate (rtx const_vector)
17183 {
17184 static char templ[40];
17185 struct simd_immediate_info info;
17186 char element_char;
17187
17188 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17189 gcc_assert (is_valid);
17190
17191 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17192
17193 machine_mode vec_mode = GET_MODE (const_vector);
17194 if (aarch64_sve_pred_mode_p (vec_mode))
17195 {
17196 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17197 if (info.insn == simd_immediate_info::MOV)
17198 {
17199 gcc_assert (info.u.mov.value == const0_rtx);
17200 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17201 }
17202 else
17203 {
17204 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17205 unsigned int total_bytes;
17206 if (info.u.pattern == AARCH64_SV_ALL
17207 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17208 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17209 total_bytes / GET_MODE_SIZE (info.elt_mode));
17210 else
17211 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17212 svpattern_token (info.u.pattern));
17213 }
17214 return buf;
17215 }
17216
17217 if (info.insn == simd_immediate_info::INDEX)
17218 {
17219 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17220 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17221 element_char, INTVAL (info.u.index.base),
17222 INTVAL (info.u.index.step));
17223 return templ;
17224 }
17225
17226 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17227 {
17228 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17229 info.u.mov.value = GEN_INT (0);
17230 else
17231 {
17232 const int buf_size = 20;
17233 char float_buf[buf_size] = {};
17234 real_to_decimal_for_mode (float_buf,
17235 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17236 buf_size, buf_size, 1, info.elt_mode);
17237
17238 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17239 element_char, float_buf);
17240 return templ;
17241 }
17242 }
17243
17244 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17245 element_char, INTVAL (info.u.mov.value));
17246 return templ;
17247 }
17248
17249 /* Split operands into moves from op[1] + op[2] into op[0]. */
17250
17251 void
17252 aarch64_split_combinev16qi (rtx operands[3])
17253 {
17254 unsigned int dest = REGNO (operands[0]);
17255 unsigned int src1 = REGNO (operands[1]);
17256 unsigned int src2 = REGNO (operands[2]);
17257 machine_mode halfmode = GET_MODE (operands[1]);
17258 unsigned int halfregs = REG_NREGS (operands[1]);
17259 rtx destlo, desthi;
17260
17261 gcc_assert (halfmode == V16QImode);
17262
17263 if (src1 == dest && src2 == dest + halfregs)
17264 {
17265 /* No-op move. Can't split to nothing; emit something. */
17266 emit_note (NOTE_INSN_DELETED);
17267 return;
17268 }
17269
17270 /* Preserve register attributes for variable tracking. */
17271 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17272 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17273 GET_MODE_SIZE (halfmode));
17274
17275 /* Special case of reversed high/low parts. */
17276 if (reg_overlap_mentioned_p (operands[2], destlo)
17277 && reg_overlap_mentioned_p (operands[1], desthi))
17278 {
17279 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17280 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17281 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17282 }
17283 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17284 {
17285 /* Try to avoid unnecessary moves if part of the result
17286 is in the right place already. */
17287 if (src1 != dest)
17288 emit_move_insn (destlo, operands[1]);
17289 if (src2 != dest + halfregs)
17290 emit_move_insn (desthi, operands[2]);
17291 }
17292 else
17293 {
17294 if (src2 != dest + halfregs)
17295 emit_move_insn (desthi, operands[2]);
17296 if (src1 != dest)
17297 emit_move_insn (destlo, operands[1]);
17298 }
17299 }
17300
17301 /* vec_perm support. */
17302
17303 struct expand_vec_perm_d
17304 {
17305 rtx target, op0, op1;
17306 vec_perm_indices perm;
17307 machine_mode vmode;
17308 unsigned int vec_flags;
17309 bool one_vector_p;
17310 bool testing_p;
17311 };
17312
17313 /* Generate a variable permutation. */
17314
17315 static void
17316 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17317 {
17318 machine_mode vmode = GET_MODE (target);
17319 bool one_vector_p = rtx_equal_p (op0, op1);
17320
17321 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17322 gcc_checking_assert (GET_MODE (op0) == vmode);
17323 gcc_checking_assert (GET_MODE (op1) == vmode);
17324 gcc_checking_assert (GET_MODE (sel) == vmode);
17325 gcc_checking_assert (TARGET_SIMD);
17326
17327 if (one_vector_p)
17328 {
17329 if (vmode == V8QImode)
17330 {
17331 /* Expand the argument to a V16QI mode by duplicating it. */
17332 rtx pair = gen_reg_rtx (V16QImode);
17333 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17334 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17335 }
17336 else
17337 {
17338 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17339 }
17340 }
17341 else
17342 {
17343 rtx pair;
17344
17345 if (vmode == V8QImode)
17346 {
17347 pair = gen_reg_rtx (V16QImode);
17348 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17349 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17350 }
17351 else
17352 {
17353 pair = gen_reg_rtx (OImode);
17354 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17355 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17356 }
17357 }
17358 }
17359
17360 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17361 NELT is the number of elements in the vector. */
17362
17363 void
17364 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17365 unsigned int nelt)
17366 {
17367 machine_mode vmode = GET_MODE (target);
17368 bool one_vector_p = rtx_equal_p (op0, op1);
17369 rtx mask;
17370
17371 /* The TBL instruction does not use a modulo index, so we must take care
17372 of that ourselves. */
17373 mask = aarch64_simd_gen_const_vector_dup (vmode,
17374 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17375 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17376
17377 /* For big-endian, we also need to reverse the index within the vector
17378 (but not which vector). */
17379 if (BYTES_BIG_ENDIAN)
17380 {
17381 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17382 if (!one_vector_p)
17383 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17384 sel = expand_simple_binop (vmode, XOR, sel, mask,
17385 NULL, 0, OPTAB_LIB_WIDEN);
17386 }
17387 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17388 }
17389
17390 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17391
17392 static void
17393 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17394 {
17395 emit_insn (gen_rtx_SET (target,
17396 gen_rtx_UNSPEC (GET_MODE (target),
17397 gen_rtvec (2, op0, op1), code)));
17398 }
17399
17400 /* Expand an SVE vec_perm with the given operands. */
17401
17402 void
17403 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17404 {
17405 machine_mode data_mode = GET_MODE (target);
17406 machine_mode sel_mode = GET_MODE (sel);
17407 /* Enforced by the pattern condition. */
17408 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17409
17410 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17411 size of the two value vectors, i.e. the upper bits of the indices
17412 are effectively ignored. SVE TBL instead produces 0 for any
17413 out-of-range indices, so we need to modulo all the vec_perm indices
17414 to ensure they are all in range. */
17415 rtx sel_reg = force_reg (sel_mode, sel);
17416
17417 /* Check if the sel only references the first values vector. */
17418 if (GET_CODE (sel) == CONST_VECTOR
17419 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17420 {
17421 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17422 return;
17423 }
17424
17425 /* Check if the two values vectors are the same. */
17426 if (rtx_equal_p (op0, op1))
17427 {
17428 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17429 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17430 NULL, 0, OPTAB_DIRECT);
17431 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17432 return;
17433 }
17434
17435   /* Run TBL on each value vector and combine the results.  */
17436
17437 rtx res0 = gen_reg_rtx (data_mode);
17438 rtx res1 = gen_reg_rtx (data_mode);
17439 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17440 if (GET_CODE (sel) != CONST_VECTOR
17441 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17442 {
17443 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17444 2 * nunits - 1);
17445 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17446 NULL, 0, OPTAB_DIRECT);
17447 }
17448 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17449 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17450 NULL, 0, OPTAB_DIRECT);
17451 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17452 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17453 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17454 else
17455 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17456 }
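/* In the general two-input case this therefore emits a sequence along the
   lines of (register names are illustrative):
	and	z4.s, z4.s, #(2 * nunits - 1)	// only if SEL may be out of range
	tbl	z2.s, { z0.s }, z4.s		// lanes selecting op1 give 0
	sub	z4.s, z4.s, #nunits
	tbl	z3.s, { z1.s }, z4.s		// lanes selecting op0 give 0
	orr	z5.d, z2.d, z3.d		// combine the two halves
   with the final bitwise OR expressed through UNSPEC_IORF for
   floating-point data modes.  */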
17457
17458 /* Recognize patterns suitable for the TRN instructions. */
17459 static bool
17460 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17461 {
17462 HOST_WIDE_INT odd;
17463 poly_uint64 nelt = d->perm.length ();
17464 rtx out, in0, in1, x;
17465 machine_mode vmode = d->vmode;
17466
17467 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17468 return false;
17469
17470 /* Note that these are little-endian tests.
17471 We correct for big-endian later. */
17472 if (!d->perm[0].is_constant (&odd)
17473 || (odd != 0 && odd != 1)
17474 || !d->perm.series_p (0, 2, odd, 2)
17475 || !d->perm.series_p (1, 2, nelt + odd, 2))
17476 return false;
17477
17478 /* Success! */
17479 if (d->testing_p)
17480 return true;
17481
17482 in0 = d->op0;
17483 in1 = d->op1;
17484 /* We don't need a big-endian lane correction for SVE; see the comment
17485 at the head of aarch64-sve.md for details. */
17486 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17487 {
17488 x = in0, in0 = in1, in1 = x;
17489 odd = !odd;
17490 }
17491 out = d->target;
17492
17493 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17494 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17495 return true;
17496 }
17497
17498 /* Recognize patterns suitable for the UZP instructions. */
17499 static bool
17500 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17501 {
17502 HOST_WIDE_INT odd;
17503 rtx out, in0, in1, x;
17504 machine_mode vmode = d->vmode;
17505
17506 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17507 return false;
17508
17509 /* Note that these are little-endian tests.
17510 We correct for big-endian later. */
17511 if (!d->perm[0].is_constant (&odd)
17512 || (odd != 0 && odd != 1)
17513 || !d->perm.series_p (0, 1, odd, 2))
17514 return false;
17515
17516 /* Success! */
17517 if (d->testing_p)
17518 return true;
17519
17520 in0 = d->op0;
17521 in1 = d->op1;
17522 /* We don't need a big-endian lane correction for SVE; see the comment
17523 at the head of aarch64-sve.md for details. */
17524 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17525 {
17526 x = in0, in0 = in1, in1 = x;
17527 odd = !odd;
17528 }
17529 out = d->target;
17530
17531 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17532 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17533 return true;
17534 }
17535
17536 /* Recognize patterns suitable for the ZIP instructions. */
17537 static bool
17538 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17539 {
17540 unsigned int high;
17541 poly_uint64 nelt = d->perm.length ();
17542 rtx out, in0, in1, x;
17543 machine_mode vmode = d->vmode;
17544
17545 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17546 return false;
17547
17548 /* Note that these are little-endian tests.
17549 We correct for big-endian later. */
17550 poly_uint64 first = d->perm[0];
17551 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17552 || !d->perm.series_p (0, 2, first, 1)
17553 || !d->perm.series_p (1, 2, first + nelt, 1))
17554 return false;
17555 high = maybe_ne (first, 0U);
17556
17557 /* Success! */
17558 if (d->testing_p)
17559 return true;
17560
17561 in0 = d->op0;
17562 in1 = d->op1;
17563 /* We don't need a big-endian lane correction for SVE; see the comment
17564 at the head of aarch64-sve.md for details. */
17565 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17566 {
17567 x = in0, in0 = in1, in1 = x;
17568 high = !high;
17569 }
17570 out = d->target;
17571
17572 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17573 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17574 return true;
17575 }
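/* To make the three recognisers above concrete, for 4-element input vectors
   { a0 a1 a2 a3 } and { b0 b1 b2 b3 } the (little-endian) index patterns
   accepted are:
     TRN1: { 0, 4, 2, 6 }   TRN2: { 1, 5, 3, 7 }
     UZP1: { 0, 2, 4, 6 }   UZP2: { 1, 3, 5, 7 }
     ZIP1: { 0, 4, 1, 5 }   ZIP2: { 2, 6, 3, 7 }
   i.e. TRN interleaves the matching even (or odd) lanes of both inputs,
   UZP concatenates the even (or odd) lanes of both inputs, and ZIP
   interleaves the low (or high) halves.  */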
17576
17577 /* Recognize patterns for the EXT insn. */
17578
17579 static bool
17580 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17581 {
17582 HOST_WIDE_INT location;
17583 rtx offset;
17584
17585 /* The first element always refers to the first vector.
17586 Check if the extracted indices are increasing by one. */
17587 if (d->vec_flags == VEC_SVE_PRED
17588 || !d->perm[0].is_constant (&location)
17589 || !d->perm.series_p (0, 1, location, 1))
17590 return false;
17591
17592 /* Success! */
17593 if (d->testing_p)
17594 return true;
17595
17596 /* The case where (location == 0) is a no-op for both big- and little-endian,
17597 and is removed by the mid-end at optimization levels -O1 and higher.
17598
17599 We don't need a big-endian lane correction for SVE; see the comment
17600 at the head of aarch64-sve.md for details. */
17601 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17602 {
17603 /* After setup, we want the high elements of the first vector (stored
17604 at the LSB end of the register), and the low elements of the second
17605 vector (stored at the MSB end of the register). So swap. */
17606 std::swap (d->op0, d->op1);
17607 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17608 to_constant () is safe since this is restricted to Advanced SIMD
17609 vectors. */
17610 location = d->perm.length ().to_constant () - location;
17611 }
17612
17613 offset = GEN_INT (location);
17614 emit_set_insn (d->target,
17615 gen_rtx_UNSPEC (d->vmode,
17616 gen_rtvec (3, d->op0, d->op1, offset),
17617 UNSPEC_EXT));
17618 return true;
17619 }
17620
17621 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17622 within each 64-bit, 32-bit or 16-bit granule. */
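/* Illustrative example (not from the original sources): a V8QI selector of
   { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the bytes within each 32-bit granule
   and is matched here as REV32.  */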
17623
17624 static bool
17625 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17626 {
17627 HOST_WIDE_INT diff;
17628 unsigned int i, size, unspec;
17629 machine_mode pred_mode;
17630
17631 if (d->vec_flags == VEC_SVE_PRED
17632 || !d->one_vector_p
17633 || !d->perm[0].is_constant (&diff))
17634 return false;
17635
17636 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17637 if (size == 8)
17638 {
17639 unspec = UNSPEC_REV64;
17640 pred_mode = VNx2BImode;
17641 }
17642 else if (size == 4)
17643 {
17644 unspec = UNSPEC_REV32;
17645 pred_mode = VNx4BImode;
17646 }
17647 else if (size == 2)
17648 {
17649 unspec = UNSPEC_REV16;
17650 pred_mode = VNx8BImode;
17651 }
17652 else
17653 return false;
17654
17655 unsigned int step = diff + 1;
17656 for (i = 0; i < step; ++i)
17657 if (!d->perm.series_p (i, step, diff - i, step))
17658 return false;
17659
17660 /* Success! */
17661 if (d->testing_p)
17662 return true;
17663
17664 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17665 if (d->vec_flags == VEC_SVE_DATA)
17666 {
17667 rtx pred = aarch64_ptrue_reg (pred_mode);
17668 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17669 UNSPEC_PRED_X);
17670 }
17671 emit_set_insn (d->target, src);
17672 return true;
17673 }
17674
17675 /* Recognize patterns for the REV insn, which reverses elements within
17676 a full vector. */
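/* Illustrative example (not from the original sources): for a vector of N
   elements, the selector { N-1, N-2, ..., 1, 0 } is matched here and is
   emitted as a single REV of the whole vector.  */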
17677
17678 static bool
17679 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17680 {
17681 poly_uint64 nelt = d->perm.length ();
17682
17683 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17684 return false;
17685
17686 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17687 return false;
17688
17689 /* Success! */
17690 if (d->testing_p)
17691 return true;
17692
17693 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17694 emit_set_insn (d->target, src);
17695 return true;
17696 }
17697
17698 static bool
17699 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17700 {
17701 rtx out = d->target;
17702 rtx in0;
17703 HOST_WIDE_INT elt;
17704 machine_mode vmode = d->vmode;
17705 rtx lane;
17706
17707 if (d->vec_flags == VEC_SVE_PRED
17708 || d->perm.encoding ().encoded_nelts () != 1
17709 || !d->perm[0].is_constant (&elt))
17710 return false;
17711
17712 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17713 return false;
17714
17715 /* Success! */
17716 if (d->testing_p)
17717 return true;
17718
17719 /* The generic preparation in aarch64_expand_vec_perm_const_1
17720 swaps the operand order and the permute indices if it finds
17721 d->perm[0] to be in the second operand. Thus, we can always
17722 use d->op0 and need not do any extra arithmetic to get the
17723 correct lane number. */
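/* Illustrative example (not from the original sources): a V4SI selector of
   { 2, 2, 2, 2 } is matched here and becomes a duplicate of lane 2 of the
   first operand, i.e. DUP Vd.4S, Vn.S[2] on little-endian.  */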
17724 in0 = d->op0;
17725 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17726
17727 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17728 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17729 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17730 return true;
17731 }
17732
17733 static bool
17734 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17735 {
17736 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17737 machine_mode vmode = d->vmode;
17738
17739 /* Make sure that the indices are constant. */
17740 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17741 for (unsigned int i = 0; i < encoded_nelts; ++i)
17742 if (!d->perm[i].is_constant ())
17743 return false;
17744
17745 if (d->testing_p)
17746 return true;
17747
17748 /* Generic code will try constant permutation twice. Once with the
17749 original mode and again with the elements lowered to QImode.
17750 So wait and don't do the selector expansion ourselves. */
17751 if (vmode != V8QImode && vmode != V16QImode)
17752 return false;
17753
17754 /* to_constant is safe since this routine is specific to Advanced SIMD
17755 vectors. */
17756 unsigned int nelt = d->perm.length ().to_constant ();
17757 for (unsigned int i = 0; i < nelt; ++i)
17758 /* If big-endian and two vectors, we end up with a weird mixed-endian
17759 mode on NEON. Reverse the index within each word but not the word
17760 itself. to_constant is safe because we checked is_constant above. */
17761 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17762 ? d->perm[i].to_constant () ^ (nelt - 1)
17763 : d->perm[i].to_constant ());
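/* For illustration (not from the original sources): with V8QImode we have
   nelt == 8, so a big-endian index of 3 becomes 3 ^ 7 == 4 here.  */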
17764
17765 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17766 sel = force_reg (vmode, sel);
17767
17768 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17769 return true;
17770 }
17771
17772 /* Try to implement D using an SVE TBL instruction. */
17773
17774 static bool
17775 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17776 {
17777 unsigned HOST_WIDE_INT nelt;
17778
17779 /* Permuting two variable-length vectors could overflow the
17780 index range. */
17781 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17782 return false;
17783
17784 if (d->testing_p)
17785 return true;
17786
17787 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17788 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17789 if (d->one_vector_p)
17790 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17791 else
17792 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17793 return true;
17794 }
17795
17796 static bool
17797 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17798 {
17799 /* The pattern matching functions above are written to look for a small
17800 number to begin the sequence (0, 1, N/2). If we begin with an index
17801 from the second operand, we can swap the operands. */
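/* For example (illustration only): a V4SI selector of { 4, 0, 5, 1 }
   becomes { 0, 4, 1, 5 } with the two operands swapped, which the ZIP1
   case in aarch64_evpc_zip then matches.  */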
17802 poly_int64 nelt = d->perm.length ();
17803 if (known_ge (d->perm[0], nelt))
17804 {
17805 d->perm.rotate_inputs (1);
17806 std::swap (d->op0, d->op1);
17807 }
17808
17809 if ((d->vec_flags == VEC_ADVSIMD
17810 || d->vec_flags == VEC_SVE_DATA
17811 || d->vec_flags == VEC_SVE_PRED)
17812 && known_gt (nelt, 1))
17813 {
17814 if (aarch64_evpc_rev_local (d))
17815 return true;
17816 else if (aarch64_evpc_rev_global (d))
17817 return true;
17818 else if (aarch64_evpc_ext (d))
17819 return true;
17820 else if (aarch64_evpc_dup (d))
17821 return true;
17822 else if (aarch64_evpc_zip (d))
17823 return true;
17824 else if (aarch64_evpc_uzp (d))
17825 return true;
17826 else if (aarch64_evpc_trn (d))
17827 return true;
17828 if (d->vec_flags == VEC_SVE_DATA)
17829 return aarch64_evpc_sve_tbl (d);
17830 else if (d->vec_flags == VEC_ADVSIMD)
17831 return aarch64_evpc_tbl (d);
17832 }
17833 return false;
17834 }
17835
17836 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17837
17838 static bool
17839 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17840 rtx op1, const vec_perm_indices &sel)
17841 {
17842 struct expand_vec_perm_d d;
17843
17844 /* Check whether the mask can be applied to a single vector. */
17845 if (sel.ninputs () == 1
17846 || (op0 && rtx_equal_p (op0, op1)))
17847 d.one_vector_p = true;
17848 else if (sel.all_from_input_p (0))
17849 {
17850 d.one_vector_p = true;
17851 op1 = op0;
17852 }
17853 else if (sel.all_from_input_p (1))
17854 {
17855 d.one_vector_p = true;
17856 op0 = op1;
17857 }
17858 else
17859 d.one_vector_p = false;
17860
17861 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17862 sel.nelts_per_input ());
17863 d.vmode = vmode;
17864 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17865 d.target = target;
17866 d.op0 = op0;
17867 d.op1 = op1;
17868 d.testing_p = !target;
17869
17870 if (!d.testing_p)
17871 return aarch64_expand_vec_perm_const_1 (&d);
17872
17873 rtx_insn *last = get_last_insn ();
17874 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17875 gcc_assert (last == get_last_insn ());
17876
17877 return ret;
17878 }
17879
17880 /* Generate a byte permute mask for a register of mode MODE,
17881 which has NUNITS units. */
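/* For example (illustration only): for V4SImode the mask built below is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. a byte
   reversal within each 32-bit element.  */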
17882
17883 rtx
17884 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17885 {
17886 /* We have to reverse each vector because we don't have
17887 a permuted load that can reverse-load according to ABI rules. */
17888 rtx mask;
17889 rtvec v = rtvec_alloc (16);
17890 unsigned int i, j;
17891 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17892
17893 gcc_assert (BYTES_BIG_ENDIAN);
17894 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17895
17896 for (i = 0; i < nunits; i++)
17897 for (j = 0; j < usize; j++)
17898 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17899 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17900 return force_reg (V16QImode, mask);
17901 }
17902
17903 /* Expand an SVE integer comparison using the SVE equivalent of:
17904
17905 (set TARGET (CODE OP0 OP1)). */
17906
17907 void
17908 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17909 {
17910 machine_mode pred_mode = GET_MODE (target);
17911 machine_mode data_mode = GET_MODE (op0);
17912 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
17913 op0, op1);
17914 if (!rtx_equal_p (target, res))
17915 emit_move_insn (target, res);
17916 }
17917
17918 /* Return the UNSPEC_COND_* code for comparison CODE. */
17919
17920 static unsigned int
17921 aarch64_unspec_cond_code (rtx_code code)
17922 {
17923 switch (code)
17924 {
17925 case NE:
17926 return UNSPEC_COND_FCMNE;
17927 case EQ:
17928 return UNSPEC_COND_FCMEQ;
17929 case LT:
17930 return UNSPEC_COND_FCMLT;
17931 case GT:
17932 return UNSPEC_COND_FCMGT;
17933 case LE:
17934 return UNSPEC_COND_FCMLE;
17935 case GE:
17936 return UNSPEC_COND_FCMGE;
17937 case UNORDERED:
17938 return UNSPEC_COND_FCMUO;
17939 default:
17940 gcc_unreachable ();
17941 }
17942 }
17943
17944 /* Emit:
17945
17946 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17947
17948 where <X> is the operation associated with comparison CODE.
17949 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17950
17951 static void
17952 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
17953 bool known_ptrue_p, rtx op0, rtx op1)
17954 {
17955 rtx flag = gen_int_mode (known_ptrue_p, SImode);
17956 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17957 gen_rtvec (4, pred, flag, op0, op1),
17958 aarch64_unspec_cond_code (code));
17959 emit_set_insn (target, unspec);
17960 }
17961
17962 /* Emit the SVE equivalent of:
17963
17964 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
17965 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
17966 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17967
17968 where <Xi> is the operation associated with comparison CODEi.
17969 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17970
17971 static void
17972 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
17973 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
17974 {
17975 machine_mode pred_mode = GET_MODE (pred);
17976 rtx tmp1 = gen_reg_rtx (pred_mode);
17977 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
17978 rtx tmp2 = gen_reg_rtx (pred_mode);
17979 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
17980 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17981 }
17982
17983 /* Emit the SVE equivalent of:
17984
17985 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
17986 (set TARGET (not TMP))
17987
17988 where <X> is the operation associated with comparison CODE.
17989 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
17990
17991 static void
17992 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
17993 bool known_ptrue_p, rtx op0, rtx op1)
17994 {
17995 machine_mode pred_mode = GET_MODE (pred);
17996 rtx tmp = gen_reg_rtx (pred_mode);
17997 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
17998 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17999 }
18000
18001 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18002
18003 (set TARGET (CODE OP0 OP1))
18004
18005 If CAN_INVERT_P is true, the caller can also handle inverted results;
18006 return true if the result is in fact inverted. */
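/* For example (illustration only): GE is handled directly with a single
   predicated FCMGE, while UNLT with -fno-trapping-math is handled by
   emitting FCMGE instead and reporting the result as inverted when
   CAN_INVERT_P, or by negating the predicate otherwise.  */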
18007
18008 bool
18009 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18010 rtx op0, rtx op1, bool can_invert_p)
18011 {
18012 machine_mode pred_mode = GET_MODE (target);
18013 machine_mode data_mode = GET_MODE (op0);
18014
18015 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18016 switch (code)
18017 {
18018 case UNORDERED:
18019 /* UNORDERED has no immediate form. */
18020 op1 = force_reg (data_mode, op1);
18021 /* fall through */
18022 case LT:
18023 case LE:
18024 case GT:
18025 case GE:
18026 case EQ:
18027 case NE:
18028 {
18029 /* There is native support for the comparison. */
18030 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18031 return false;
18032 }
18033
18034 case LTGT:
18035 /* This is a trapping operation (LT or GT). */
18036 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18037 return false;
18038
18039 case UNEQ:
18040 if (!flag_trapping_math)
18041 {
18042 /* This would trap for signaling NaNs. */
18043 op1 = force_reg (data_mode, op1);
18044 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18045 ptrue, true, op0, op1);
18046 return false;
18047 }
18048 /* fall through */
18049 case UNLT:
18050 case UNLE:
18051 case UNGT:
18052 case UNGE:
18053 if (flag_trapping_math)
18054 {
18055 /* Work out which elements are ordered. */
18056 rtx ordered = gen_reg_rtx (pred_mode);
18057 op1 = force_reg (data_mode, op1);
18058 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18059 ptrue, true, op0, op1);
18060
18061 /* Test the opposite condition for the ordered elements,
18062 then invert the result. */
18063 if (code == UNEQ)
18064 code = NE;
18065 else
18066 code = reverse_condition_maybe_unordered (code);
18067 if (can_invert_p)
18068 {
18069 aarch64_emit_sve_fp_cond (target, code,
18070 ordered, false, op0, op1);
18071 return true;
18072 }
18073 aarch64_emit_sve_invert_fp_cond (target, code,
18074 ordered, false, op0, op1);
18075 return false;
18076 }
18077 break;
18078
18079 case ORDERED:
18080 /* ORDERED has no immediate form. */
18081 op1 = force_reg (data_mode, op1);
18082 break;
18083
18084 default:
18085 gcc_unreachable ();
18086 }
18087
18088 /* There is native support for the inverse comparison. */
18089 code = reverse_condition_maybe_unordered (code);
18090 if (can_invert_p)
18091 {
18092 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18093 return true;
18094 }
18095 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18096 return false;
18097 }
18098
18099 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18100 of the data being selected and CMP_MODE is the mode of the values being
18101 compared. */
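/* For example (illustration only): a vcond such as (a > b ? c : d) on SVE
   integer vectors is expanded to a predicated compare followed by a select,
   roughly:

       cmpgt   p0.s, p1/z, z0.s, z1.s
       sel     z2.s, p0, z3.s, z4.s  */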
18102
18103 void
18104 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18105 rtx *ops)
18106 {
18107 machine_mode pred_mode
18108 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18109 GET_MODE_SIZE (cmp_mode)).require ();
18110 rtx pred = gen_reg_rtx (pred_mode);
18111 if (FLOAT_MODE_P (cmp_mode))
18112 {
18113 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18114 ops[4], ops[5], true))
18115 std::swap (ops[1], ops[2]);
18116 }
18117 else
18118 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18119
18120 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18121 ops[1] = force_reg (data_mode, ops[1]);
18122 /* The "false" value can only be zero if the "true" value is a constant. */
18123 if (register_operand (ops[1], data_mode)
18124 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18125 ops[2] = force_reg (data_mode, ops[2]);
18126
18127 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18128 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18129 }
18130
18131 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18132 true. However, due to issues with register allocation it is preferable
18133 to avoid tying integer scalar and FP scalar modes. Executing integer
18134 operations in general registers is better than treating them as scalar
18135 vector operations. This reduces latency and avoids redundant int<->FP
18136 moves. So tie modes if they are either the same class, or vector modes
18137 with other vector modes, vector structs or any scalar mode. */
18138
18139 static bool
18140 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18141 {
18142 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18143 return true;
18144
18145 /* We specifically want to allow elements of "structure" modes to
18146 be tieable to the structure. This more general condition allows
18147 other rarer situations too. The reason we don't extend this to
18148 predicate modes is that there are no predicate structure modes
18149 nor any specific instructions for extracting part of a predicate
18150 register. */
18151 if (aarch64_vector_data_mode_p (mode1)
18152 && aarch64_vector_data_mode_p (mode2))
18153 return true;
18154
18155 /* Also allow any scalar modes with vectors. */
18156 if (aarch64_vector_mode_supported_p (mode1)
18157 || aarch64_vector_mode_supported_p (mode2))
18158 return true;
18159
18160 return false;
18161 }
18162
18163 /* Return a new RTX holding the result of moving POINTER forward by
18164 AMOUNT bytes. */
18165
18166 static rtx
18167 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18168 {
18169 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18170
18171 return adjust_automodify_address (pointer, GET_MODE (pointer),
18172 next, amount);
18173 }
18174
18175 /* Return a new RTX holding the result of moving POINTER forward by the
18176 size of the mode it points to. */
18177
18178 static rtx
18179 aarch64_progress_pointer (rtx pointer)
18180 {
18181 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18182 }
18183
18184 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18185 MODE bytes. */
18186
18187 static void
18188 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18189 machine_mode mode)
18190 {
18191 rtx reg = gen_reg_rtx (mode);
18192
18193 /* "Cast" the pointers to the correct mode. */
18194 *src = adjust_address (*src, mode, 0);
18195 *dst = adjust_address (*dst, mode, 0);
18196 /* Emit the memcpy. */
18197 emit_move_insn (reg, *src);
18198 emit_move_insn (*dst, reg);
18199 /* Move the pointers forward. */
18200 *src = aarch64_progress_pointer (*src);
18201 *dst = aarch64_progress_pointer (*dst);
18202 }
18203
18204 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18205 we succeed, otherwise return false. */
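/* For example (illustration only): a constant 32-byte copy is expanded
   below as two TImode block copies (roughly two 16-byte loads and two
   16-byte stores), and a 15-byte copy as two overlapping 8-byte (DImode)
   block copies.  */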
18206
18207 bool
18208 aarch64_expand_cpymem (rtx *operands)
18209 {
18210 int n, mode_bits;
18211 rtx dst = operands[0];
18212 rtx src = operands[1];
18213 rtx base;
18214 machine_mode cur_mode = BLKmode, next_mode;
18215 bool speed_p = !optimize_function_for_size_p (cfun);
18216
18217 /* When optimizing for size, give a better estimate of the length of a
18218 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18219 will always require an even number of instructions to perform. And each
18220 operation requires both a load and a store, so divide the max number by 2. */
18221 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18222
18223 /* We can't do anything smart if the amount to copy is not constant. */
18224 if (!CONST_INT_P (operands[2]))
18225 return false;
18226
18227 n = INTVAL (operands[2]);
18228
18229 /* Try to keep the number of instructions low. For all cases we will do at
18230 most two moves for the residual amount, since we'll always overlap the
18231 remainder. */
18232 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18233 return false;
18234
18235 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18236 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18237
18238 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18239 src = adjust_automodify_address (src, VOIDmode, base, 0);
18240
18241 /* Convert n to bits to make the rest of the code simpler. */
18242 n = n * BITS_PER_UNIT;
18243
18244 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18245 larger than TImode, but we should not use them for loads/stores here. */
18246 const int copy_limit = GET_MODE_BITSIZE (TImode);
18247
18248 while (n > 0)
18249 {
18250 /* Find the largest mode in which to do the copy without over-reading
18251 or over-writing. */
18252 opt_scalar_int_mode mode_iter;
18253 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18254 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18255 cur_mode = mode_iter.require ();
18256
18257 gcc_assert (cur_mode != BLKmode);
18258
18259 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18260 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18261
18262 n -= mode_bits;
18263
18264 /* Do certain trailing copies as overlapping if it's going to be
18265 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy it's
18266 more efficient to do two overlapping 8-byte copies than copies of
18267 8 + 4 + 2 + 1 bytes. */
18268 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18269 {
18270 next_mode = smallest_mode_for_size (n, MODE_INT);
18271 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18272 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18273 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18274 n = n_bits;
18275 }
18276 }
18277
18278 return true;
18279 }
18280
18281 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18282 SImode stores. Handle the case when the constant has identical
18283 bottom and top halves. This is beneficial when the two stores can be
18284 merged into an STP and we avoid synthesising potentially expensive
18285 immediates twice. Return true if such a split is possible. */
18286
18287 bool
18288 aarch64_split_dimode_const_store (rtx dst, rtx src)
18289 {
18290 rtx lo = gen_lowpart (SImode, src);
18291 rtx hi = gen_highpart_mode (SImode, DImode, src);
18292
18293 bool size_p = optimize_function_for_size_p (cfun);
18294
18295 if (!rtx_equal_p (lo, hi))
18296 return false;
18297
18298 unsigned int orig_cost
18299 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18300 unsigned int lo_cost
18301 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18302
18303 /* We want to transform:
18304 MOV x1, 49370
18305 MOVK x1, 0x140, lsl 16
18306 MOVK x1, 0xc0da, lsl 32
18307 MOVK x1, 0x140, lsl 48
18308 STR x1, [x0]
18309 into:
18310 MOV w1, 49370
18311 MOVK w1, 0x140, lsl 16
18312 STP w1, w1, [x0]
18313 So we want to perform this only when we save two instructions
18314 or more. When optimizing for size, however, accept any code size
18315 savings we can. */
18316 if (size_p && orig_cost <= lo_cost)
18317 return false;
18318
18319 if (!size_p
18320 && (orig_cost <= lo_cost + 1))
18321 return false;
18322
18323 rtx mem_lo = adjust_address (dst, SImode, 0);
18324 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18325 return false;
18326
18327 rtx tmp_reg = gen_reg_rtx (SImode);
18328 aarch64_expand_mov_immediate (tmp_reg, lo);
18329 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18330 /* Don't emit an explicit store pair as this may not always be profitable.
18331 Let the sched-fusion logic decide whether to merge them. */
18332 emit_move_insn (mem_lo, tmp_reg);
18333 emit_move_insn (mem_hi, tmp_reg);
18334
18335 return true;
18336 }
18337
18338 /* Generate RTL for a conditional branch with rtx comparison CODE in
18339 mode CC_MODE. The destination of the unlikely conditional branch
18340 is LABEL_REF. */
18341
18342 void
18343 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18344 rtx label_ref)
18345 {
18346 rtx x;
18347 x = gen_rtx_fmt_ee (code, VOIDmode,
18348 gen_rtx_REG (cc_mode, CC_REGNUM),
18349 const0_rtx);
18350
18351 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18352 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18353 pc_rtx);
18354 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18355 }
18356
18357 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18358
18359 OP1 represents the TImode destination operand 1
18360 OP2 represents the TImode destination operand 2
18361 LOW_DEST represents the low half (DImode) of TImode operand 0
18362 LOW_IN1 represents the low half (DImode) of TImode operand 1
18363 LOW_IN2 represents the low half (DImode) of TImode operand 2
18364 HIGH_DEST represents the high half (DImode) of TImode operand 0
18365 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18366 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18367
18368 void
18369 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18370 rtx *low_in1, rtx *low_in2,
18371 rtx *high_dest, rtx *high_in1,
18372 rtx *high_in2)
18373 {
18374 *low_dest = gen_reg_rtx (DImode);
18375 *low_in1 = gen_lowpart (DImode, op1);
18376 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18377 subreg_lowpart_offset (DImode, TImode));
18378 *high_dest = gen_reg_rtx (DImode);
18379 *high_in1 = gen_highpart (DImode, op1);
18380 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18381 subreg_highpart_offset (DImode, TImode));
18382 }
18383
18384 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18385
18386 This function differs from 'aarch64_addti_scratch_regs' in that
18387 OP1 can be an immediate constant (zero). We must call
18388 subreg_highpart_offset with DImode and TImode arguments, otherwise
18389 VOIDmode will be used for the const_int, which generates an internal
18390 error from subreg_size_highpart_offset, which does not expect a size of zero.
18391
18392 OP1 represents the TImode destination operand 1
18393 OP2 represents the TImode destination operand 2
18394 LOW_DEST represents the low half (DImode) of TImode operand 0
18395 LOW_IN1 represents the low half (DImode) of TImode operand 1
18396 LOW_IN2 represents the low half (DImode) of TImode operand 2
18397 HIGH_DEST represents the high half (DImode) of TImode operand 0
18398 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18399 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18400
18401
18402 void
18403 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18404 rtx *low_in1, rtx *low_in2,
18405 rtx *high_dest, rtx *high_in1,
18406 rtx *high_in2)
18407 {
18408 *low_dest = gen_reg_rtx (DImode);
18409 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18410 subreg_lowpart_offset (DImode, TImode));
18411
18412 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18413 subreg_lowpart_offset (DImode, TImode));
18414 *high_dest = gen_reg_rtx (DImode);
18415
18416 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18417 subreg_highpart_offset (DImode, TImode));
18418 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18419 subreg_highpart_offset (DImode, TImode));
18420 }
18421
18422 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18423
18424 OP0 represents the TImode destination operand 0
18425 LOW_DEST represents the low half (DImode) of TImode operand 0
18426 LOW_IN1 represents the low half (DImode) of TImode operand 1
18427 LOW_IN2 represents the low half (DImode) of TImode operand 2
18428 HIGH_DEST represents the high half (DImode) of TImode operand 0
18429 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18430 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18431 UNSIGNED_P is true if the operation is being performed on unsigned
18432 values. */
18433 void
18434 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18435 rtx low_in2, rtx high_dest, rtx high_in1,
18436 rtx high_in2, bool unsigned_p)
18437 {
18438 if (low_in2 == const0_rtx)
18439 {
18440 low_dest = low_in1;
18441 high_in2 = force_reg (DImode, high_in2);
18442 if (unsigned_p)
18443 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18444 else
18445 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18446 }
18447 else
18448 {
18449 if (CONST_INT_P (low_in2))
18450 {
18451 high_in2 = force_reg (DImode, high_in2);
18452 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18453 GEN_INT (-INTVAL (low_in2))));
18454 }
18455 else
18456 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18457
18458 if (unsigned_p)
18459 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18460 else
18461 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18462 }
18463
18464 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18465 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18466
18467 }
18468
18469 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
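/* For reference: the values below are 1 << 29 == 0x20000000 under
   TARGET_ILP32 and 1 << 36 == 0x1000000000 otherwise.  */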
18470
18471 static unsigned HOST_WIDE_INT
18472 aarch64_asan_shadow_offset (void)
18473 {
18474 if (TARGET_ILP32)
18475 return (HOST_WIDE_INT_1 << 29);
18476 else
18477 return (HOST_WIDE_INT_1 << 36);
18478 }
18479
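/* Illustrative only: together with aarch64_gen_ccmp_next below, this lets a
   condition such as (a == 0 && b == 42) be expanded as a compare followed
   by a conditional compare, roughly:

       cmp    w0, #0
       ccmp   w1, #42, #<nzcv>, eq
       b.eq   <target>

   where <nzcv> encodes the flag state to use when the first comparison
   fails.  */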
18480 static rtx
18481 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18482 int code, tree treeop0, tree treeop1)
18483 {
18484 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18485 rtx op0, op1;
18486 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18487 insn_code icode;
18488 struct expand_operand ops[4];
18489
18490 start_sequence ();
18491 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18492
18493 op_mode = GET_MODE (op0);
18494 if (op_mode == VOIDmode)
18495 op_mode = GET_MODE (op1);
18496
18497 switch (op_mode)
18498 {
18499 case E_QImode:
18500 case E_HImode:
18501 case E_SImode:
18502 cmp_mode = SImode;
18503 icode = CODE_FOR_cmpsi;
18504 break;
18505
18506 case E_DImode:
18507 cmp_mode = DImode;
18508 icode = CODE_FOR_cmpdi;
18509 break;
18510
18511 case E_SFmode:
18512 cmp_mode = SFmode;
18513 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18514 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18515 break;
18516
18517 case E_DFmode:
18518 cmp_mode = DFmode;
18519 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18520 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18521 break;
18522
18523 default:
18524 end_sequence ();
18525 return NULL_RTX;
18526 }
18527
18528 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18529 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18530 if (!op0 || !op1)
18531 {
18532 end_sequence ();
18533 return NULL_RTX;
18534 }
18535 *prep_seq = get_insns ();
18536 end_sequence ();
18537
18538 create_fixed_operand (&ops[0], op0);
18539 create_fixed_operand (&ops[1], op1);
18540
18541 start_sequence ();
18542 if (!maybe_expand_insn (icode, 2, ops))
18543 {
18544 end_sequence ();
18545 return NULL_RTX;
18546 }
18547 *gen_seq = get_insns ();
18548 end_sequence ();
18549
18550 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18551 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18552 }
18553
18554 static rtx
18555 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18556 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18557 {
18558 rtx op0, op1, target;
18559 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18560 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18561 insn_code icode;
18562 struct expand_operand ops[6];
18563 int aarch64_cond;
18564
18565 push_to_sequence (*prep_seq);
18566 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18567
18568 op_mode = GET_MODE (op0);
18569 if (op_mode == VOIDmode)
18570 op_mode = GET_MODE (op1);
18571
18572 switch (op_mode)
18573 {
18574 case E_QImode:
18575 case E_HImode:
18576 case E_SImode:
18577 cmp_mode = SImode;
18578 icode = CODE_FOR_ccmpsi;
18579 break;
18580
18581 case E_DImode:
18582 cmp_mode = DImode;
18583 icode = CODE_FOR_ccmpdi;
18584 break;
18585
18586 case E_SFmode:
18587 cmp_mode = SFmode;
18588 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18589 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18590 break;
18591
18592 case E_DFmode:
18593 cmp_mode = DFmode;
18594 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18595 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18596 break;
18597
18598 default:
18599 end_sequence ();
18600 return NULL_RTX;
18601 }
18602
18603 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18604 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18605 if (!op0 || !op1)
18606 {
18607 end_sequence ();
18608 return NULL_RTX;
18609 }
18610 *prep_seq = get_insns ();
18611 end_sequence ();
18612
18613 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18614 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18615
18616 if (bit_code != AND)
18617 {
18618 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18619 GET_MODE (XEXP (prev, 0))),
18620 VOIDmode, XEXP (prev, 0), const0_rtx);
18621 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18622 }
18623
18624 create_fixed_operand (&ops[0], XEXP (prev, 0));
18625 create_fixed_operand (&ops[1], target);
18626 create_fixed_operand (&ops[2], op0);
18627 create_fixed_operand (&ops[3], op1);
18628 create_fixed_operand (&ops[4], prev);
18629 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18630
18631 push_to_sequence (*gen_seq);
18632 if (!maybe_expand_insn (icode, 6, ops))
18633 {
18634 end_sequence ();
18635 return NULL_RTX;
18636 }
18637
18638 *gen_seq = get_insns ();
18639 end_sequence ();
18640
18641 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18642 }
18643
18644 #undef TARGET_GEN_CCMP_FIRST
18645 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18646
18647 #undef TARGET_GEN_CCMP_NEXT
18648 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18649
18650 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18651 instruction fusion of some sort. */
18652
18653 static bool
18654 aarch64_macro_fusion_p (void)
18655 {
18656 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18657 }
18658
18659
18660 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18661 should be kept together during scheduling. */
18662
18663 static bool
18664 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18665 {
18666 rtx set_dest;
18667 rtx prev_set = single_set (prev);
18668 rtx curr_set = single_set (curr);
18669 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18670 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18671
18672 if (!aarch64_macro_fusion_p ())
18673 return false;
18674
18675 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18676 {
18677 /* We are trying to match:
18678 prev (mov) == (set (reg r0) (const_int imm16))
18679 curr (movk) == (set (zero_extract (reg r0)
18680 (const_int 16)
18681 (const_int 16))
18682 (const_int imm16_1)) */
18683
18684 set_dest = SET_DEST (curr_set);
18685
18686 if (GET_CODE (set_dest) == ZERO_EXTRACT
18687 && CONST_INT_P (SET_SRC (curr_set))
18688 && CONST_INT_P (SET_SRC (prev_set))
18689 && CONST_INT_P (XEXP (set_dest, 2))
18690 && INTVAL (XEXP (set_dest, 2)) == 16
18691 && REG_P (XEXP (set_dest, 0))
18692 && REG_P (SET_DEST (prev_set))
18693 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18694 {
18695 return true;
18696 }
18697 }
18698
18699 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18700 {
18701
18702 /* We're trying to match:
18703 prev (adrp) == (set (reg r1)
18704 (high (symbol_ref ("SYM"))))
18705 curr (add) == (set (reg r0)
18706 (lo_sum (reg r1)
18707 (symbol_ref ("SYM"))))
18708 Note that r0 need not necessarily be the same as r1, especially
18709 during pre-regalloc scheduling. */
18710
18711 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18712 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18713 {
18714 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18715 && REG_P (XEXP (SET_SRC (curr_set), 0))
18716 && REGNO (XEXP (SET_SRC (curr_set), 0))
18717 == REGNO (SET_DEST (prev_set))
18718 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18719 XEXP (SET_SRC (curr_set), 1)))
18720 return true;
18721 }
18722 }
18723
18724 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18725 {
18726
18727 /* We're trying to match:
18728 prev (movk) == (set (zero_extract (reg r0)
18729 (const_int 16)
18730 (const_int 32))
18731 (const_int imm16_1))
18732 curr (movk) == (set (zero_extract (reg r0)
18733 (const_int 16)
18734 (const_int 48))
18735 (const_int imm16_2)) */
18736
18737 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18738 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18739 && REG_P (XEXP (SET_DEST (prev_set), 0))
18740 && REG_P (XEXP (SET_DEST (curr_set), 0))
18741 && REGNO (XEXP (SET_DEST (prev_set), 0))
18742 == REGNO (XEXP (SET_DEST (curr_set), 0))
18743 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18744 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18745 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18746 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18747 && CONST_INT_P (SET_SRC (prev_set))
18748 && CONST_INT_P (SET_SRC (curr_set)))
18749 return true;
18750
18751 }
18752 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18753 {
18754 /* We're trying to match:
18755 prev (adrp) == (set (reg r0)
18756 (high (symbol_ref ("SYM"))))
18757 curr (ldr) == (set (reg r1)
18758 (mem (lo_sum (reg r0)
18759 (symbol_ref ("SYM")))))
18760 or
18761 curr (ldr) == (set (reg r1)
18762 (zero_extend (mem
18763 (lo_sum (reg r0)
18764 (symbol_ref ("SYM")))))) */
18765 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18766 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18767 {
18768 rtx curr_src = SET_SRC (curr_set);
18769
18770 if (GET_CODE (curr_src) == ZERO_EXTEND)
18771 curr_src = XEXP (curr_src, 0);
18772
18773 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18774 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18775 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18776 == REGNO (SET_DEST (prev_set))
18777 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18778 XEXP (SET_SRC (prev_set), 0)))
18779 return true;
18780 }
18781 }
18782
18783 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18784 && any_condjump_p (curr))
18785 {
18786 unsigned int condreg1, condreg2;
18787 rtx cc_reg_1;
18788 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18789 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18790
18791 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18792 && prev
18793 && modified_in_p (cc_reg_1, prev))
18794 {
18795 enum attr_type prev_type = get_attr_type (prev);
18796
18797 /* FIXME: this misses some cases that are considered simple arithmetic
18798 instructions for ThunderX. Simple shifts are missed here. */
18799 if (prev_type == TYPE_ALUS_SREG
18800 || prev_type == TYPE_ALUS_IMM
18801 || prev_type == TYPE_LOGICS_REG
18802 || prev_type == TYPE_LOGICS_IMM)
18803 return true;
18804 }
18805 }
18806
18807 if (prev_set
18808 && curr_set
18809 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18810 && any_condjump_p (curr))
18811 {
18812 /* We're trying to match:
18813 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18814 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18815 (const_int 0))
18816 (label_ref ("SYM"))
18817 (pc)) */
18818 if (SET_DEST (curr_set) == (pc_rtx)
18819 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18820 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18821 && REG_P (SET_DEST (prev_set))
18822 && REGNO (SET_DEST (prev_set))
18823 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18824 {
18825 /* Fuse ALU operations followed by conditional branch instruction. */
18826 switch (get_attr_type (prev))
18827 {
18828 case TYPE_ALU_IMM:
18829 case TYPE_ALU_SREG:
18830 case TYPE_ADC_REG:
18831 case TYPE_ADC_IMM:
18832 case TYPE_ADCS_REG:
18833 case TYPE_ADCS_IMM:
18834 case TYPE_LOGIC_REG:
18835 case TYPE_LOGIC_IMM:
18836 case TYPE_CSEL:
18837 case TYPE_ADR:
18838 case TYPE_MOV_IMM:
18839 case TYPE_SHIFT_REG:
18840 case TYPE_SHIFT_IMM:
18841 case TYPE_BFM:
18842 case TYPE_RBIT:
18843 case TYPE_REV:
18844 case TYPE_EXTEND:
18845 return true;
18846
18847 default:;
18848 }
18849 }
18850 }
18851
18852 return false;
18853 }
18854
18855 /* Return true iff the instruction fusion described by OP is enabled. */
18856
18857 bool
18858 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18859 {
18860 return (aarch64_tune_params.fusible_ops & op) != 0;
18861 }
18862
18863 /* If MEM is in the form of [base+offset], extract the two parts
18864 of the address into BASE and OFFSET, otherwise return false
18865 after clearing BASE and OFFSET. */
18866
18867 bool
18868 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18869 {
18870 rtx addr;
18871
18872 gcc_assert (MEM_P (mem));
18873
18874 addr = XEXP (mem, 0);
18875
18876 if (REG_P (addr))
18877 {
18878 *base = addr;
18879 *offset = const0_rtx;
18880 return true;
18881 }
18882
18883 if (GET_CODE (addr) == PLUS
18884 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18885 {
18886 *base = XEXP (addr, 0);
18887 *offset = XEXP (addr, 1);
18888 return true;
18889 }
18890
18891 *base = NULL_RTX;
18892 *offset = NULL_RTX;
18893
18894 return false;
18895 }
18896
18897 /* Types for scheduling fusion. */
18898 enum sched_fusion_type
18899 {
18900 SCHED_FUSION_NONE = 0,
18901 SCHED_FUSION_LD_SIGN_EXTEND,
18902 SCHED_FUSION_LD_ZERO_EXTEND,
18903 SCHED_FUSION_LD,
18904 SCHED_FUSION_ST,
18905 SCHED_FUSION_NUM
18906 };
18907
18908 /* If INSN is a load or store of an address in the form of [base+offset],
18909 extract the two parts into BASE and OFFSET. Return the scheduling
18910 fusion type of this INSN. */
18911
18912 static enum sched_fusion_type
18913 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18914 {
18915 rtx x, dest, src;
18916 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18917
18918 gcc_assert (INSN_P (insn));
18919 x = PATTERN (insn);
18920 if (GET_CODE (x) != SET)
18921 return SCHED_FUSION_NONE;
18922
18923 src = SET_SRC (x);
18924 dest = SET_DEST (x);
18925
18926 machine_mode dest_mode = GET_MODE (dest);
18927
18928 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18929 return SCHED_FUSION_NONE;
18930
18931 if (GET_CODE (src) == SIGN_EXTEND)
18932 {
18933 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18934 src = XEXP (src, 0);
18935 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18936 return SCHED_FUSION_NONE;
18937 }
18938 else if (GET_CODE (src) == ZERO_EXTEND)
18939 {
18940 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18941 src = XEXP (src, 0);
18942 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18943 return SCHED_FUSION_NONE;
18944 }
18945
18946 if (GET_CODE (src) == MEM && REG_P (dest))
18947 extract_base_offset_in_addr (src, base, offset);
18948 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18949 {
18950 fusion = SCHED_FUSION_ST;
18951 extract_base_offset_in_addr (dest, base, offset);
18952 }
18953 else
18954 return SCHED_FUSION_NONE;
18955
18956 if (*base == NULL_RTX || *offset == NULL_RTX)
18957 fusion = SCHED_FUSION_NONE;
18958
18959 return fusion;
18960 }
18961
18962 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18963
18964 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18965 and PRI are only calculated for these instructions. For other instructions,
18966 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18967 types of instruction fusion can be added by returning different priorities.
18968
18969 It's important that irrelevant instructions get the largest FUSION_PRI. */
18970
18971 static void
18972 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18973 int *fusion_pri, int *pri)
18974 {
18975 int tmp, off_val;
18976 rtx base, offset;
18977 enum sched_fusion_type fusion;
18978
18979 gcc_assert (INSN_P (insn));
18980
18981 tmp = max_pri - 1;
18982 fusion = fusion_load_store (insn, &base, &offset);
18983 if (fusion == SCHED_FUSION_NONE)
18984 {
18985 *pri = tmp;
18986 *fusion_pri = tmp;
18987 return;
18988 }
18989
18990 /* Set FUSION_PRI according to fusion type and base register. */
18991 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18992
18993 /* Calculate PRI. */
18994 tmp /= 2;
18995
18996 /* INSN with smaller offset goes first. */
18997 off_val = (int)(INTVAL (offset));
18998 if (off_val >= 0)
18999 tmp -= (off_val & 0xfffff);
19000 else
19001 tmp += ((- off_val) & 0xfffff);
19002
19003 *pri = tmp;
19004 return;
19005 }
19006
19007 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19008 Adjust priority of sha1h instructions so they are scheduled before
19009 other SHA1 instructions. */
19010
19011 static int
19012 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19013 {
19014 rtx x = PATTERN (insn);
19015
19016 if (GET_CODE (x) == SET)
19017 {
19018 x = SET_SRC (x);
19019
19020 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19021 return priority + 10;
19022 }
19023
19024 return priority;
19025 }
19026
19027 /* Given OPERANDS of consecutive load/store, check if we can merge
19028 them into ldp/stp. LOAD is true if they are load instructions.
19029 MODE is the mode of memory operands. */
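/* For example (illustration only):

       ldr  w0, [x2]
       ldr  w1, [x2, 4]

   can be merged by the ldp/stp peepholes into "ldp w0, w1, [x2]" when this
   function accepts them.  */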
19030
19031 bool
19032 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19033 machine_mode mode)
19034 {
19035 HOST_WIDE_INT offval_1, offval_2, msize;
19036 enum reg_class rclass_1, rclass_2;
19037 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19038
19039 if (load)
19040 {
19041 mem_1 = operands[1];
19042 mem_2 = operands[3];
19043 reg_1 = operands[0];
19044 reg_2 = operands[2];
19045 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19046 if (REGNO (reg_1) == REGNO (reg_2))
19047 return false;
19048 }
19049 else
19050 {
19051 mem_1 = operands[0];
19052 mem_2 = operands[2];
19053 reg_1 = operands[1];
19054 reg_2 = operands[3];
19055 }
19056
19057 /* The mems cannot be volatile. */
19058 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19059 return false;
19060
19061 /* If we have SImode and slow unaligned ldp,
19062 check that the alignment is at least 8 bytes. */
19063 if (mode == SImode
19064 && (aarch64_tune_params.extra_tuning_flags
19065 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19066 && !optimize_size
19067 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19068 return false;
19069
19070 /* Check if the addresses are in the form of [base+offset]. */
19071 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19072 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19073 return false;
19074 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19075 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19076 return false;
19077
19078 /* Check if the bases are the same. */
19079 if (!rtx_equal_p (base_1, base_2))
19080 return false;
19081
19082 /* The operands must be of the same size. */
19083 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19084 GET_MODE_SIZE (GET_MODE (mem_2))));
19085
19086 offval_1 = INTVAL (offset_1);
19087 offval_2 = INTVAL (offset_2);
19088 /* We should only be trying this for fixed-sized modes. There is no
19089 SVE LDP/STP instruction. */
19090 msize = GET_MODE_SIZE (mode).to_constant ();
19091 /* Check if the offsets are consecutive. */
19092 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19093 return false;
19094
19095 /* Check if the addresses are clobbered by load. */
19096 if (load)
19097 {
19098 if (reg_mentioned_p (reg_1, mem_1))
19099 return false;
19100
19101 /* In increasing order, the last load can clobber the address. */
19102 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19103 return false;
19104 }
19105
19106 /* One of the memory accesses must be a mempair operand.
19107 If it is not the first one, they need to be swapped by the
19108 peephole. */
19109 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19110 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19111 return false;
19112
19113 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19114 rclass_1 = FP_REGS;
19115 else
19116 rclass_1 = GENERAL_REGS;
19117
19118 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19119 rclass_2 = FP_REGS;
19120 else
19121 rclass_2 = GENERAL_REGS;
19122
19123 /* Check if the registers are of the same class. */
19124 if (rclass_1 != rclass_2)
19125 return false;
19126
19127 return true;
19128 }
19129
19130 /* Given OPERANDS of consecutive load/store that can be merged,
19131 swap them if they are not in ascending order. */
19132 void
19133 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19134 {
19135 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19136 HOST_WIDE_INT offval_1, offval_2;
19137
19138 if (load)
19139 {
19140 mem_1 = operands[1];
19141 mem_2 = operands[3];
19142 }
19143 else
19144 {
19145 mem_1 = operands[0];
19146 mem_2 = operands[2];
19147 }
19148
19149 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19150 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19151
19152 offval_1 = INTVAL (offset_1);
19153 offval_2 = INTVAL (offset_2);
19154
19155 if (offval_1 > offval_2)
19156 {
19157 /* Irrespective of whether this is a load or a store,
19158 we do the same swap. */
19159 std::swap (operands[0], operands[2]);
19160 std::swap (operands[1], operands[3]);
19161 }
19162 }
19163
19164 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19165 comparison between the two. */
19166 int
19167 aarch64_host_wide_int_compare (const void *x, const void *y)
19168 {
19169 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19170 * ((const HOST_WIDE_INT *) y));
19171 }
19172
19173 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19174 other pointing to a REG rtx containing an offset, compare the offsets
19175 of the two pairs.
19176
19177 Return:
19178
19179 1 iff offset (X) > offset (Y)
19180 0 iff offset (X) == offset (Y)
19181 -1 iff offset (X) < offset (Y) */
19182 int
19183 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19184 {
19185 const rtx * operands_1 = (const rtx *) x;
19186 const rtx * operands_2 = (const rtx *) y;
19187 rtx mem_1, mem_2, base, offset_1, offset_2;
19188
19189 if (MEM_P (operands_1[0]))
19190 mem_1 = operands_1[0];
19191 else
19192 mem_1 = operands_1[1];
19193
19194 if (MEM_P (operands_2[0]))
19195 mem_2 = operands_2[0];
19196 else
19197 mem_2 = operands_2[1];
19198
19199 /* Extract the offsets. */
19200 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19201 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19202
19203 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19204
19205 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19206 }
19207
19208 /* Given OPERANDS of consecutive load/store, check if we can merge
19209 them into ldp/stp by adjusting the offset. LOAD is true if they
19210 are load instructions. MODE is the mode of memory operands.
19211
19212 Given below consecutive stores:
19213
19214 str w1, [xb, 0x100]
19215 str w1, [xb, 0x104]
19216 str w1, [xb, 0x108]
19217 str w1, [xb, 0x10c]
19218
19219 Though the offsets are out of the range supported by stp, we can
19220 still pair them after adjusting the offset, like:
19221
19222 add scratch, xb, 0x100
19223 stp w1, w1, [scratch]
19224 stp w1, w1, [scratch, 0x8]
19225
19226 The peephole patterns detecting this opportunity should guarantee
19227 the scratch register is available. */
19228
19229 bool
19230 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19231 scalar_mode mode)
19232 {
19233 const int num_insns = 4;
19234 enum reg_class rclass;
19235 HOST_WIDE_INT offvals[num_insns], msize;
19236 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19237
19238 if (load)
19239 {
19240 for (int i = 0; i < num_insns; i++)
19241 {
19242 reg[i] = operands[2 * i];
19243 mem[i] = operands[2 * i + 1];
19244
19245 gcc_assert (REG_P (reg[i]));
19246 }
19247
19248 /* Do not attempt to merge the loads if the loads clobber each other. */
19249 for (int i = 0; i < 8; i += 2)
19250 for (int j = i + 2; j < 8; j += 2)
19251 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19252 return false;
19253 }
19254 else
19255 for (int i = 0; i < num_insns; i++)
19256 {
19257 mem[i] = operands[2 * i];
19258 reg[i] = operands[2 * i + 1];
19259 }
19260
19261 /* Skip if memory operand is by itself valid for ldp/stp. */
19262 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19263 return false;
19264
19265 for (int i = 0; i < num_insns; i++)
19266 {
19267 /* The mems cannot be volatile. */
19268 if (MEM_VOLATILE_P (mem[i]))
19269 return false;
19270
19271 /* Check if the addresses are in the form of [base+offset]. */
19272 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19273 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19274 return false;
19275 }
19276
19277 /* Check if the registers are of the same class. */
19278 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19279 ? FP_REGS : GENERAL_REGS;
19280
19281 for (int i = 1; i < num_insns; i++)
19282 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19283 {
19284 if (rclass != FP_REGS)
19285 return false;
19286 }
19287 else
19288 {
19289 if (rclass != GENERAL_REGS)
19290 return false;
19291 }
19292
19293 /* Only the last register in the order in which they occur
19294 may be clobbered by the load. */
19295 if (rclass == GENERAL_REGS && load)
19296 for (int i = 0; i < num_insns - 1; i++)
19297 if (reg_mentioned_p (reg[i], mem[i]))
19298 return false;
19299
19300 /* Check if the bases are the same. */
19301 for (int i = 0; i < num_insns - 1; i++)
19302 if (!rtx_equal_p (base[i], base[i + 1]))
19303 return false;
19304
19305 for (int i = 0; i < num_insns; i++)
19306 offvals[i] = INTVAL (offset[i]);
19307
19308 msize = GET_MODE_SIZE (mode);
19309
19310 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19311 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19312 aarch64_host_wide_int_compare);
19313
19314 if (!(offvals[1] == offvals[0] + msize
19315 && offvals[3] == offvals[2] + msize))
19316 return false;
19317
19318 /* Check that offsets are within range of each other. The ldp/stp
19319 instructions have 7 bit immediate offsets, so use 0x80. */
19320 if (offvals[2] - offvals[0] >= msize * 0x80)
19321 return false;
19322
19323 /* The offsets must be aligned with respect to each other. */
19324 if (offvals[0] % msize != offvals[2] % msize)
19325 return false;
19326
19327 /* If we have SImode and slow unaligned ldp,
19328 check that the alignment is at least 8 bytes. */
19329 if (mode == SImode
19330 && (aarch64_tune_params.extra_tuning_flags
19331 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19332 && !optimize_size
19333 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19334 return false;
19335
19336 return true;
19337 }
19338
19339 /* Given OPERANDS of consecutive load/store, this function pairs them
19340 into LDP/STP after adjusting the offset. It depends on the fact
19341 that the operands can be sorted so the offsets are correct for STP.
19342 MODE is the mode of memory operands. CODE is the rtl operator
19343 which should be applied to all memory operands; it is SIGN_EXTEND,
19344 ZERO_EXTEND or UNKNOWN. */
19345
19346 bool
19347 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19348 scalar_mode mode, RTX_CODE code)
19349 {
19350 rtx base, offset_1, offset_3, t1, t2;
19351 rtx mem_1, mem_2, mem_3, mem_4;
19352 rtx temp_operands[8];
19353 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19354 stp_off_upper_limit, stp_off_lower_limit, msize;
19355
19356 /* We make changes on a copy as we may still bail out. */
19357 for (int i = 0; i < 8; i ++)
19358 temp_operands[i] = operands[i];
19359
19360 /* Sort the operands. */
19361 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19362
19363 /* Copy the memory operands so that if we have to bail for some
19364 reason the original addresses are unchanged. */
19365 if (load)
19366 {
19367 mem_1 = copy_rtx (temp_operands[1]);
19368 mem_2 = copy_rtx (temp_operands[3]);
19369 mem_3 = copy_rtx (temp_operands[5]);
19370 mem_4 = copy_rtx (temp_operands[7]);
19371 }
19372 else
19373 {
19374 mem_1 = copy_rtx (temp_operands[0]);
19375 mem_2 = copy_rtx (temp_operands[2]);
19376 mem_3 = copy_rtx (temp_operands[4]);
19377 mem_4 = copy_rtx (temp_operands[6]);
19378 gcc_assert (code == UNKNOWN);
19379 }
19380
19381 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19382 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19383 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19384 && offset_3 != NULL_RTX);
19385
19386 /* Adjust offset so it can fit in LDP/STP instruction. */
19387 msize = GET_MODE_SIZE (mode);
19388 stp_off_upper_limit = msize * (0x40 - 1);
19389 stp_off_lower_limit = - msize * 0x40;
19390
19391 off_val_1 = INTVAL (offset_1);
19392 off_val_3 = INTVAL (offset_3);
19393
19394 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19395 if (msize <= 4)
19396 base_off = (off_val_1 + off_val_3) / 2;
19397 else
19398 /* However, due to issues with negative LDP/STP offset generation for
19399 larger modes (DF, DI and vector modes), we must not use negative
19400 addresses smaller than 9 signed unadjusted bits can store. Using
19401 the first offset provides the most range in this case. */
19402 base_off = off_val_1;
19403
19404 /* Adjust the base so that it is aligned with the addresses but still
19405 optimal. */
19406 if (base_off % msize != off_val_1 % msize)
19407 /* Fix the offset, bearing in mind that we want to make it bigger,
19408 not smaller. */
19409 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19410 else if (msize <= 4)
19411 /* The negative range of LDP/STP is one larger than the positive range. */
19412 base_off += msize;
19413
19414 /* Check if base offset is too big or too small. We can attempt to resolve
19415 this issue by setting it to the maximum value and seeing if the offsets
19416 still fit. */
19417 if (base_off >= 0x1000)
19418 {
19419 base_off = 0x1000 - 1;
19420 /* We must still make sure that the base offset is aligned with respect
19421 to the address, but it may not be made any bigger. */
19422 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19423 }
19424
19425 /* Likewise for the case where the base is too small. */
19426 if (base_off <= -0x1000)
19427 {
19428 base_off = -0x1000 + 1;
19429 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19430 }
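/* Continuing the hypothetical DImode example from the suitability
   check earlier in this file: with off_val_1 == 0x1000 and
   off_val_3 == 0x1010, msize == 8 selects base_off = 0x1000, which
   is clamped to 0xfff and then realigned down to 0xff8. That gives
   new_off_1 = 8 and new_off_3 = 0x18, both inside
   [-msize * 0x40, msize * (0x40 - 1)] = [-512, 504]. */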
19431
19432 /* Offset of the first STP/LDP. */
19433 new_off_1 = off_val_1 - base_off;
19434
19435 /* Offset of the second STP/LDP. */
19436 new_off_3 = off_val_3 - base_off;
19437
19438 /* The offsets must be within the range of the LDP/STP instructions. */
19439 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19440 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19441 return false;
19442
19443 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19444 new_off_1), true);
19445 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19446 new_off_1 + msize), true);
19447 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19448 new_off_3), true);
19449 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19450 new_off_3 + msize), true);
19451
19452 if (!aarch64_mem_pair_operand (mem_1, mode)
19453 || !aarch64_mem_pair_operand (mem_3, mode))
19454 return false;
19455
19456 if (code == ZERO_EXTEND)
19457 {
19458 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19459 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19460 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19461 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19462 }
19463 else if (code == SIGN_EXTEND)
19464 {
19465 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19466 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19467 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19468 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19469 }
19470
19471 if (load)
19472 {
19473 operands[0] = temp_operands[0];
19474 operands[1] = mem_1;
19475 operands[2] = temp_operands[2];
19476 operands[3] = mem_2;
19477 operands[4] = temp_operands[4];
19478 operands[5] = mem_3;
19479 operands[6] = temp_operands[6];
19480 operands[7] = mem_4;
19481 }
19482 else
19483 {
19484 operands[0] = mem_1;
19485 operands[1] = temp_operands[1];
19486 operands[2] = mem_2;
19487 operands[3] = temp_operands[3];
19488 operands[4] = mem_3;
19489 operands[5] = temp_operands[5];
19490 operands[6] = mem_4;
19491 operands[7] = temp_operands[7];
19492 }
19493
19494 /* Emit adjusting instruction. */
19495 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19496 /* Emit ldp/stp instructions. */
19497 t1 = gen_rtx_SET (operands[0], operands[1]);
19498 t2 = gen_rtx_SET (operands[2], operands[3]);
19499 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19500 t1 = gen_rtx_SET (operands[4], operands[5]);
19501 t2 = gen_rtx_SET (operands[6], operands[7]);
19502 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19503 return true;
19504 }
19505
19506 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19507 it isn't worth branching around empty masked ops (including masked
19508 stores). */
19509
19510 static bool
19511 aarch64_empty_mask_is_expensive (unsigned)
19512 {
19513 return false;
19514 }
19515
19516 /* Return true if a pseudo register should be created and used to hold
19517 the GOT address for PIC code. */
19518
19519 bool
19520 aarch64_use_pseudo_pic_reg (void)
19521 {
19522 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19523 }
19524
19525 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19526
19527 static int
19528 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19529 {
19530 switch (XINT (x, 1))
19531 {
19532 case UNSPEC_GOTSMALLPIC:
19533 case UNSPEC_GOTSMALLPIC28K:
19534 case UNSPEC_GOTTINYPIC:
19535 return 0;
19536 default:
19537 break;
19538 }
19539
19540 return default_unspec_may_trap_p (x, flags);
19541 }
19542
19543
19544 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19545 return the log2 of that value. Otherwise return -1. */
19546
19547 int
19548 aarch64_fpconst_pow_of_2 (rtx x)
19549 {
19550 const REAL_VALUE_TYPE *r;
19551
19552 if (!CONST_DOUBLE_P (x))
19553 return -1;
19554
19555 r = CONST_DOUBLE_REAL_VALUE (x);
19556
19557 if (REAL_VALUE_NEGATIVE (*r)
19558 || REAL_VALUE_ISNAN (*r)
19559 || REAL_VALUE_ISINF (*r)
19560 || !real_isinteger (r, DFmode))
19561 return -1;
19562
19563 return exact_log2 (real_to_integer (r));
19564 }
19565
19566 /* If X is a vector of equal CONST_DOUBLE values and that value is
19567 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19568
19569 int
19570 aarch64_vec_fpconst_pow_of_2 (rtx x)
19571 {
19572 int nelts;
19573 if (GET_CODE (x) != CONST_VECTOR
19574 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19575 return -1;
19576
19577 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19578 return -1;
19579
19580 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19581 if (firstval <= 0)
19582 return -1;
19583
19584 for (int i = 1; i < nelts; i++)
19585 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19586 return -1;
19587
19588 return firstval;
19589 }
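/* Illustrative examples (hypothetical constants): a CONST_DOUBLE of
   4.0 gives aarch64_fpconst_pow_of_2 == 2, while 3.0, -2.0 or NaN
   give -1. For the vector variant, a V2DF constant {4.0, 4.0}
   returns 2, whereas {4.0, 2.0} returns -1 because the elements
   disagree. */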
19590
19591 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19592 to float.
19593
19594 __fp16 always promotes through this hook.
19595 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19596 through the generic excess precision logic rather than here. */
19597
19598 static tree
19599 aarch64_promoted_type (const_tree t)
19600 {
19601 if (SCALAR_FLOAT_TYPE_P (t)
19602 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19603 return float_type_node;
19604
19605 return NULL_TREE;
19606 }
19607
19608 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19609
19610 static bool
19611 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19612 optimization_type opt_type)
19613 {
19614 switch (op)
19615 {
19616 case rsqrt_optab:
19617 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19618
19619 default:
19620 return true;
19621 }
19622 }
19623
19624 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19625
19626 static unsigned int
19627 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19628 int *offset)
19629 {
19630 /* Polynomial invariant 1 == (VG / 2) - 1. */
19631 gcc_assert (i == 1);
19632 *factor = 2;
19633 *offset = 1;
19634 return AARCH64_DWARF_VG;
19635 }
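/* Illustrative sketch (hypothetical vector length): the hook above
   describes indeterminate 1 as (AARCH64_DWARF_VG / factor) - offset,
   i.e. (VG / 2) - 1. On a 256-bit SVE implementation VG is 4
   (64-bit granules per vector), so the indeterminate is 1 and a
   poly_int such as 16 + 16x, the byte size of an SVE vector,
   evaluates to 32. */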
19636
19637 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19638 if MODE is HFmode, and punt to the generic implementation otherwise. */
19639
19640 static bool
19641 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19642 {
19643 return (mode == HFmode
19644 ? true
19645 : default_libgcc_floating_mode_supported_p (mode));
19646 }
19647
19648 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19649 if MODE is HFmode, and punt to the generic implementation otherwise. */
19650
19651 static bool
19652 aarch64_scalar_mode_supported_p (scalar_mode mode)
19653 {
19654 return (mode == HFmode
19655 ? true
19656 : default_scalar_mode_supported_p (mode));
19657 }
19658
19659 /* Set the value of FLT_EVAL_METHOD.
19660 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19661
19662 0: evaluate all operations and constants, whose semantic type has at
19663 most the range and precision of type float, to the range and
19664 precision of float; evaluate all other operations and constants to
19665 the range and precision of the semantic type;
19666
19667 N, where _FloatN is a supported interchange floating type:
19668 evaluate all operations and constants, whose semantic type has at
19669 most the range and precision of _FloatN type, to the range and
19670 precision of the _FloatN type; evaluate all other operations and
19671 constants to the range and precision of the semantic type;
19672
19673 If we have the ARMv8.2-A extensions then we support _Float16 in native
19674 precision, so we should set this to 16. Otherwise, we support the type,
19675 but want to evaluate expressions in float precision, so set this to
19676 0. */
19677
19678 static enum flt_eval_method
19679 aarch64_excess_precision (enum excess_precision_type type)
19680 {
19681 switch (type)
19682 {
19683 case EXCESS_PRECISION_TYPE_FAST:
19684 case EXCESS_PRECISION_TYPE_STANDARD:
19685 /* We can calculate either in 16-bit range and precision or
19686 32-bit range and precision. Make that decision based on whether
19687 we have native support for the ARMv8.2-A 16-bit floating-point
19688 instructions or not. */
19689 return (TARGET_FP_F16INST
19690 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19691 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19692 case EXCESS_PRECISION_TYPE_IMPLICIT:
19693 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19694 default:
19695 gcc_unreachable ();
19696 }
19697 return FLT_EVAL_METHOD_UNPREDICTABLE;
19698 }
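/* Illustrative sketch of the effect (hypothetical source): for
   "_Float16 a, b, c; c = a * b;", a target with the ARMv8.2-A
   half-precision instructions (TARGET_FP_F16INST) reports
   FLT_EVAL_METHOD 16 and may evaluate the multiplication directly
   in _Float16, whereas without them the operands are promoted and
   the multiplication is carried out in float, with only the final
   assignment narrowing back to _Float16. */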
19699
19700 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19701 scheduled for speculative execution. Reject the long-running division
19702 and square-root instructions. */
19703
19704 static bool
19705 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19706 {
19707 switch (get_attr_type (insn))
19708 {
19709 case TYPE_SDIV:
19710 case TYPE_UDIV:
19711 case TYPE_FDIVS:
19712 case TYPE_FDIVD:
19713 case TYPE_FSQRTS:
19714 case TYPE_FSQRTD:
19715 case TYPE_NEON_FP_SQRT_S:
19716 case TYPE_NEON_FP_SQRT_D:
19717 case TYPE_NEON_FP_SQRT_S_Q:
19718 case TYPE_NEON_FP_SQRT_D_Q:
19719 case TYPE_NEON_FP_DIV_S:
19720 case TYPE_NEON_FP_DIV_D:
19721 case TYPE_NEON_FP_DIV_S_Q:
19722 case TYPE_NEON_FP_DIV_D_Q:
19723 return false;
19724 default:
19725 return true;
19726 }
19727 }
19728
19729 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19730
19731 static int
19732 aarch64_compute_pressure_classes (reg_class *classes)
19733 {
19734 int i = 0;
19735 classes[i++] = GENERAL_REGS;
19736 classes[i++] = FP_REGS;
19737 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19738 registers need to go in PR_LO_REGS at some point during their
19739 lifetime. Splitting it into two halves has the effect of making
19740 all predicates count against PR_LO_REGS, so that we try whenever
19741 possible to restrict the number of live predicates to 8. This
19742 greatly reduces the amount of spilling in certain loops. */
19743 classes[i++] = PR_LO_REGS;
19744 classes[i++] = PR_HI_REGS;
19745 return i;
19746 }
19747
19748 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19749
19750 static bool
19751 aarch64_can_change_mode_class (machine_mode from,
19752 machine_mode to, reg_class_t)
19753 {
19754 if (BYTES_BIG_ENDIAN)
19755 {
19756 bool from_sve_p = aarch64_sve_data_mode_p (from);
19757 bool to_sve_p = aarch64_sve_data_mode_p (to);
19758
19759 /* Don't allow changes between SVE data modes and non-SVE modes.
19760 See the comment at the head of aarch64-sve.md for details. */
19761 if (from_sve_p != to_sve_p)
19762 return false;
19763
19764 /* Don't allow changes in element size: lane 0 of the new vector
19765 would not then be lane 0 of the old vector. See the comment
19766 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19767 description.
19768
19769 In the worst case, this forces a register to be spilled in
19770 one mode and reloaded in the other, which handles the
19771 endianness correctly. */
19772 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19773 return false;
19774 }
19775 return true;
19776 }
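/* Illustrative example (hypothetical big-endian subreg): reinterpreting
   a VNx4SI value as VNx2DI changes the element size from 4 to 8
   bytes, so the hook above rejects the mode change; in the worst
   case the register is spilled in one mode and reloaded in the
   other, which handles the endianness correctly. */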
19777
19778 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
19779
19780 static void
19781 aarch64_select_early_remat_modes (sbitmap modes)
19782 {
19783 /* SVE values are not normally live across a call, so it should be
19784 worth doing early rematerialization even in VL-specific mode. */
19785 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19786 {
19787 machine_mode mode = (machine_mode) i;
19788 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19789 if (vec_flags & VEC_ANY_SVE)
19790 bitmap_set_bit (modes, i);
19791 }
19792 }
19793
19794 /* Override the default target speculation_safe_value. */
19795 static rtx
19796 aarch64_speculation_safe_value (machine_mode mode,
19797 rtx result, rtx val, rtx failval)
19798 {
19799 /* Maybe we should warn if falling back to hard barriers. They are
19800 likely to be noticeably more expensive than the alternative below. */
19801 if (!aarch64_track_speculation)
19802 return default_speculation_safe_value (mode, result, val, failval);
19803
19804 if (!REG_P (val))
19805 val = copy_to_mode_reg (mode, val);
19806
19807 if (!aarch64_reg_or_zero (failval, mode))
19808 failval = copy_to_mode_reg (mode, failval);
19809
19810 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19811 return result;
19812 }
19813
19814 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19815 Look into the tuning structure for an estimate.
19816 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19817 Advanced SIMD 128 bits. */
19818
19819 static HOST_WIDE_INT
19820 aarch64_estimated_poly_value (poly_int64 val)
19821 {
19822 enum aarch64_sve_vector_bits_enum width_source
19823 = aarch64_tune_params.sve_width;
19824
19825 /* If we still don't have an estimate, use the default. */
19826 if (width_source == SVE_SCALABLE)
19827 return default_estimated_poly_value (val);
19828
19829 HOST_WIDE_INT over_128 = width_source - 128;
19830 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19831 }
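/* Illustrative sketch (hypothetical tuning): with sve_width set to
   SVE_256, over_128 is 128, so a poly_int64 of {16, 16} (the number
   of bytes in an SVE vector) is estimated as 16 + 16 * 128 / 128
   = 32. With SVE_SCALABLE the generic default estimate is used
   instead. */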
19832
19833
19834 /* Return true for types that could be supported as SIMD return or
19835 argument types. */
19836
19837 static bool
19838 supported_simd_type (tree t)
19839 {
19840 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19841 {
19842 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19843 return s == 1 || s == 2 || s == 4 || s == 8;
19844 }
19845 return false;
19846 }
19847
19848 /* Return true for types that are currently supported as SIMD return
19849 or argument types. */
19850
19851 static bool
19852 currently_supported_simd_type (tree t, tree b)
19853 {
19854 if (COMPLEX_FLOAT_TYPE_P (t))
19855 return false;
19856
19857 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19858 return false;
19859
19860 return supported_simd_type (t);
19861 }
19862
19863 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19864
19865 static int
19866 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19867 struct cgraph_simd_clone *clonei,
19868 tree base_type, int num)
19869 {
19870 tree t, ret_type, arg_type;
19871 unsigned int elt_bits, vec_bits, count;
19872
19873 if (!TARGET_SIMD)
19874 return 0;
19875
19876 if (clonei->simdlen
19877 && (clonei->simdlen < 2
19878 || clonei->simdlen > 1024
19879 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19880 {
19881 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19882 "unsupported simdlen %d", clonei->simdlen);
19883 return 0;
19884 }
19885
19886 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19887 if (TREE_CODE (ret_type) != VOID_TYPE
19888 && !currently_supported_simd_type (ret_type, base_type))
19889 {
19890 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19891 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19892 "GCC does not currently support mixed size types "
19893 "for %<simd%> functions");
19894 else if (supported_simd_type (ret_type))
19895 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19896 "GCC does not currently support return type %qT "
19897 "for %<simd%> functions", ret_type);
19898 else
19899 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19900 "unsupported return type %qT for %<simd%> functions",
19901 ret_type);
19902 return 0;
19903 }
19904
19905 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19906 {
19907 arg_type = TREE_TYPE (t);
19908
19909 if (!currently_supported_simd_type (arg_type, base_type))
19910 {
19911 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19912 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19913 "GCC does not currently support mixed size types "
19914 "for %<simd%> functions");
19915 else
19916 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19917 "GCC does not currently support argument type %qT "
19918 "for %<simd%> functions", arg_type);
19919 return 0;
19920 }
19921 }
19922
19923 clonei->vecsize_mangle = 'n';
19924 clonei->mask_mode = VOIDmode;
19925 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19926 if (clonei->simdlen == 0)
19927 {
19928 count = 2;
19929 vec_bits = (num == 0 ? 64 : 128);
19930 clonei->simdlen = vec_bits / elt_bits;
19931 }
19932 else
19933 {
19934 count = 1;
19935 vec_bits = clonei->simdlen * elt_bits;
19936 if (vec_bits != 64 && vec_bits != 128)
19937 {
19938 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19939 "GCC does not currently support simdlen %d for type %qT",
19940 clonei->simdlen, base_type);
19941 return 0;
19942 }
19943 }
19944 clonei->vecsize_int = vec_bits;
19945 clonei->vecsize_float = vec_bits;
19946 return count;
19947 }
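/* Illustrative sketch (hypothetical declaration): for a
   "#pragma omp declare simd" function whose base type is float
   (elt_bits == 32) and which has no explicit simdlen, the code
   above creates two clones, one with simdlen 2 (64-bit vectors)
   and one with simdlen 4 (128-bit vectors). An explicit simdlen (8)
   for the same base type would need 256-bit vectors and is
   therefore diagnosed and rejected. */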
19948
19949 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19950
19951 static void
19952 aarch64_simd_clone_adjust (struct cgraph_node *node)
19953 {
19954 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19955 use the correct ABI. */
19956
19957 tree t = TREE_TYPE (node->decl);
19958 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19959 TYPE_ATTRIBUTES (t));
19960 }
19961
19962 /* Implement TARGET_SIMD_CLONE_USABLE. */
19963
19964 static int
19965 aarch64_simd_clone_usable (struct cgraph_node *node)
19966 {
19967 switch (node->simdclone->vecsize_mangle)
19968 {
19969 case 'n':
19970 if (!TARGET_SIMD)
19971 return -1;
19972 return 0;
19973 default:
19974 gcc_unreachable ();
19975 }
19976 }
19977
19978 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19979
19980 static int
19981 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19982 {
19983 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19984 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19985 return 0;
19986 return 1;
19987 }
19988
19989 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19990
19991 static const char *
19992 aarch64_get_multilib_abi_name (void)
19993 {
19994 if (TARGET_BIG_END)
19995 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19996 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19997 }
19998
19999 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
20000 global variable based guard, use the default; otherwise
20001 return a null tree. */
20002 static tree
20003 aarch64_stack_protect_guard (void)
20004 {
20005 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20006 return default_stack_protect_guard ();
20007
20008 return NULL_TREE;
20009 }
20010
20011 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20012 section at the end if needed. */
20013 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20014 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20015 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20016 void
20017 aarch64_file_end_indicate_exec_stack ()
20018 {
20019 file_end_indicate_exec_stack ();
20020
20021 unsigned feature_1_and = 0;
20022 if (aarch64_bti_enabled ())
20023 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20024
20025 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20026 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20027
20028 if (feature_1_and)
20029 {
20030 /* Generate .note.gnu.property section. */
20031 switch_to_section (get_section (".note.gnu.property",
20032 SECTION_NOTYPE, NULL));
20033
20034 /* PT_NOTE header: namesz, descsz, type.
20035 namesz = 4 ("GNU\0")
20036 descsz = 16 (Size of the program property array)
20037 [(12 + padding) * Number of array elements]
20038 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20039 assemble_align (POINTER_SIZE);
20040 assemble_integer (GEN_INT (4), 4, 32, 1);
20041 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20042 assemble_integer (GEN_INT (5), 4, 32, 1);
20043
20044 /* PT_NOTE name. */
20045 assemble_string ("GNU", 4);
20046
20047 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20048 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20049 datasz = 4
20050 data = feature_1_and. */
20051 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20052 assemble_integer (GEN_INT (4), 4, 32, 1);
20053 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20054
20055 /* Pad the size of the note to the required alignment. */
20056 assemble_align (POINTER_SIZE);
20057 }
20058 }
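/* Illustrative sketch of the emitted note (hypothetical LP64 build
   with both BTI and return-address signing enabled): feature_1_and
   is 3 and the .note.gnu.property section contains namesz 4,
   descsz ROUND_UP (12, 8) == 16, type 5, the string "GNU", then
   the GNU_PROPERTY_AARCH64_FEATURE_1_AND property with datasz 4
   and data 3, padded back to 8-byte alignment. */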
20059 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20060 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20061 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
20062
20063 /* Target-specific selftests. */
20064
20065 #if CHECKING_P
20066
20067 namespace selftest {
20068
20069 /* Selftest for the RTL loader.
20070 Verify that the RTL loader copes with a dump from
20071 print_rtx_function. This is essentially just a test that class
20072 function_reader can handle a real dump, but it also verifies
20073 that lookup_reg_by_dump_name correctly handles hard regs.
20074 The presence of hard reg names in the dump means that the test is
20075 target-specific, hence it is in this file. */
20076
20077 static void
20078 aarch64_test_loading_full_dump ()
20079 {
20080 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20081
20082 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20083
20084 rtx_insn *insn_1 = get_insn_by_uid (1);
20085 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20086
20087 rtx_insn *insn_15 = get_insn_by_uid (15);
20088 ASSERT_EQ (INSN, GET_CODE (insn_15));
20089 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20090
20091 /* Verify crtl->return_rtx. */
20092 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20093 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20094 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20095 }
20096
20097 /* Run all target-specific selftests. */
20098
20099 static void
20100 aarch64_run_selftests (void)
20101 {
20102 aarch64_test_loading_full_dump ();
20103 }
20104
20105 } // namespace selftest
20106
20107 #endif /* #if CHECKING_P */
20108
20109 #undef TARGET_STACK_PROTECT_GUARD
20110 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20111
20112 #undef TARGET_ADDRESS_COST
20113 #define TARGET_ADDRESS_COST aarch64_address_cost
20114
20115 /* This hook determines whether unnamed bitfields affect the alignment
20116 of the containing structure. The hook returns true if the structure
20117 should inherit the alignment requirements of an unnamed bitfield's
20118 type. */
20119 #undef TARGET_ALIGN_ANON_BITFIELD
20120 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20121
20122 #undef TARGET_ASM_ALIGNED_DI_OP
20123 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20124
20125 #undef TARGET_ASM_ALIGNED_HI_OP
20126 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20127
20128 #undef TARGET_ASM_ALIGNED_SI_OP
20129 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20130
20131 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20132 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20133 hook_bool_const_tree_hwi_hwi_const_tree_true
20134
20135 #undef TARGET_ASM_FILE_START
20136 #define TARGET_ASM_FILE_START aarch64_start_file
20137
20138 #undef TARGET_ASM_OUTPUT_MI_THUNK
20139 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20140
20141 #undef TARGET_ASM_SELECT_RTX_SECTION
20142 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20143
20144 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20145 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20146
20147 #undef TARGET_BUILD_BUILTIN_VA_LIST
20148 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20149
20150 #undef TARGET_CALLEE_COPIES
20151 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20152
20153 #undef TARGET_CAN_ELIMINATE
20154 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20155
20156 #undef TARGET_CAN_INLINE_P
20157 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20158
20159 #undef TARGET_CANNOT_FORCE_CONST_MEM
20160 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20161
20162 #undef TARGET_CASE_VALUES_THRESHOLD
20163 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20164
20165 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20166 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20167
20168 /* Only the least significant bit is used for initialization guard
20169 variables. */
20170 #undef TARGET_CXX_GUARD_MASK_BIT
20171 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20172
20173 #undef TARGET_C_MODE_FOR_SUFFIX
20174 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20175
20176 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20177 #undef TARGET_DEFAULT_TARGET_FLAGS
20178 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20179 #endif
20180
20181 #undef TARGET_CLASS_MAX_NREGS
20182 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20183
20184 #undef TARGET_BUILTIN_DECL
20185 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20186
20187 #undef TARGET_BUILTIN_RECIPROCAL
20188 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20189
20190 #undef TARGET_C_EXCESS_PRECISION
20191 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20192
20193 #undef TARGET_EXPAND_BUILTIN
20194 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20195
20196 #undef TARGET_EXPAND_BUILTIN_VA_START
20197 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20198
20199 #undef TARGET_FOLD_BUILTIN
20200 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20201
20202 #undef TARGET_FUNCTION_ARG
20203 #define TARGET_FUNCTION_ARG aarch64_function_arg
20204
20205 #undef TARGET_FUNCTION_ARG_ADVANCE
20206 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20207
20208 #undef TARGET_FUNCTION_ARG_BOUNDARY
20209 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20210
20211 #undef TARGET_FUNCTION_ARG_PADDING
20212 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20213
20214 #undef TARGET_GET_RAW_RESULT_MODE
20215 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20216 #undef TARGET_GET_RAW_ARG_MODE
20217 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20218
20219 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20220 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20221
20222 #undef TARGET_FUNCTION_VALUE
20223 #define TARGET_FUNCTION_VALUE aarch64_function_value
20224
20225 #undef TARGET_FUNCTION_VALUE_REGNO_P
20226 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20227
20228 #undef TARGET_GIMPLE_FOLD_BUILTIN
20229 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20230
20231 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20232 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20233
20234 #undef TARGET_INIT_BUILTINS
20235 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20236
20237 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20238 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20239 aarch64_ira_change_pseudo_allocno_class
20240
20241 #undef TARGET_LEGITIMATE_ADDRESS_P
20242 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20243
20244 #undef TARGET_LEGITIMATE_CONSTANT_P
20245 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20246
20247 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20248 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20249 aarch64_legitimize_address_displacement
20250
20251 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20252 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20253
20254 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20255 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20256 aarch64_libgcc_floating_mode_supported_p
20257
20258 #undef TARGET_MANGLE_TYPE
20259 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20260
20261 #undef TARGET_MEMORY_MOVE_COST
20262 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20263
20264 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20265 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20266
20267 #undef TARGET_MUST_PASS_IN_STACK
20268 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20269
20270 /* This target hook should return true if accesses to volatile bitfields
20271 should use the narrowest mode possible. It should return false if these
20272 accesses should use the bitfield container type. */
20273 #undef TARGET_NARROW_VOLATILE_BITFIELD
20274 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20275
20276 #undef TARGET_OPTION_OVERRIDE
20277 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20278
20279 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20280 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20281 aarch64_override_options_after_change
20282
20283 #undef TARGET_OPTION_SAVE
20284 #define TARGET_OPTION_SAVE aarch64_option_save
20285
20286 #undef TARGET_OPTION_RESTORE
20287 #define TARGET_OPTION_RESTORE aarch64_option_restore
20288
20289 #undef TARGET_OPTION_PRINT
20290 #define TARGET_OPTION_PRINT aarch64_option_print
20291
20292 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20293 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20294
20295 #undef TARGET_SET_CURRENT_FUNCTION
20296 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20297
20298 #undef TARGET_PASS_BY_REFERENCE
20299 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20300
20301 #undef TARGET_PREFERRED_RELOAD_CLASS
20302 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20303
20304 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20305 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20306
20307 #undef TARGET_PROMOTED_TYPE
20308 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20309
20310 #undef TARGET_SECONDARY_RELOAD
20311 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20312
20313 #undef TARGET_SHIFT_TRUNCATION_MASK
20314 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20315
20316 #undef TARGET_SETUP_INCOMING_VARARGS
20317 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20318
20319 #undef TARGET_STRUCT_VALUE_RTX
20320 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20321
20322 #undef TARGET_REGISTER_MOVE_COST
20323 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20324
20325 #undef TARGET_RETURN_IN_MEMORY
20326 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20327
20328 #undef TARGET_RETURN_IN_MSB
20329 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20330
20331 #undef TARGET_RTX_COSTS
20332 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20333
20334 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20335 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20336
20337 #undef TARGET_SCHED_ISSUE_RATE
20338 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20339
20340 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20341 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20342 aarch64_sched_first_cycle_multipass_dfa_lookahead
20343
20344 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20345 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20346 aarch64_first_cycle_multipass_dfa_lookahead_guard
20347
20348 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20349 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20350 aarch64_get_separate_components
20351
20352 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20353 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20354 aarch64_components_for_bb
20355
20356 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20357 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20358 aarch64_disqualify_components
20359
20360 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20361 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20362 aarch64_emit_prologue_components
20363
20364 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20365 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20366 aarch64_emit_epilogue_components
20367
20368 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20369 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20370 aarch64_set_handled_components
20371
20372 #undef TARGET_TRAMPOLINE_INIT
20373 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20374
20375 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20376 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20377
20378 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20379 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20380
20381 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20382 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20383 aarch64_builtin_support_vector_misalignment
20384
20385 #undef TARGET_ARRAY_MODE
20386 #define TARGET_ARRAY_MODE aarch64_array_mode
20387
20388 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20389 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20390
20391 #undef TARGET_VECTORIZE_ADD_STMT_COST
20392 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20393
20394 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20395 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20396 aarch64_builtin_vectorization_cost
20397
20398 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20399 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20400
20401 #undef TARGET_VECTORIZE_BUILTINS
20402 #define TARGET_VECTORIZE_BUILTINS
20403
20404 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20405 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20406 aarch64_builtin_vectorized_function
20407
20408 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20409 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20410 aarch64_autovectorize_vector_sizes
20411
20412 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20413 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20414 aarch64_atomic_assign_expand_fenv
20415
20416 /* Section anchor support. */
20417
20418 #undef TARGET_MIN_ANCHOR_OFFSET
20419 #define TARGET_MIN_ANCHOR_OFFSET -256
20420
20421 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20422 byte offset; we can do much more for larger data types, but have no way
20423 to determine the size of the access. We assume accesses are aligned. */
20424 #undef TARGET_MAX_ANCHOR_OFFSET
20425 #define TARGET_MAX_ANCHOR_OFFSET 4095
20426
20427 #undef TARGET_VECTOR_ALIGNMENT
20428 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20429
20430 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20431 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20432 aarch64_vectorize_preferred_vector_alignment
20433 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20434 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20435 aarch64_simd_vector_alignment_reachable
20436
20437 /* vec_perm support. */
20438
20439 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20440 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20441 aarch64_vectorize_vec_perm_const
20442
20443 #undef TARGET_VECTORIZE_GET_MASK_MODE
20444 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20445 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20446 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20447 aarch64_empty_mask_is_expensive
20448 #undef TARGET_PREFERRED_ELSE_VALUE
20449 #define TARGET_PREFERRED_ELSE_VALUE \
20450 aarch64_preferred_else_value
20451
20452 #undef TARGET_INIT_LIBFUNCS
20453 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20454
20455 #undef TARGET_FIXED_CONDITION_CODE_REGS
20456 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20457
20458 #undef TARGET_FLAGS_REGNUM
20459 #define TARGET_FLAGS_REGNUM CC_REGNUM
20460
20461 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20462 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20463
20464 #undef TARGET_ASAN_SHADOW_OFFSET
20465 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20466
20467 #undef TARGET_LEGITIMIZE_ADDRESS
20468 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20469
20470 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20471 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20472
20473 #undef TARGET_CAN_USE_DOLOOP_P
20474 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20475
20476 #undef TARGET_SCHED_ADJUST_PRIORITY
20477 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20478
20479 #undef TARGET_SCHED_MACRO_FUSION_P
20480 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20481
20482 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20483 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20484
20485 #undef TARGET_SCHED_FUSION_PRIORITY
20486 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20487
20488 #undef TARGET_UNSPEC_MAY_TRAP_P
20489 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20490
20491 #undef TARGET_USE_PSEUDO_PIC_REG
20492 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20493
20494 #undef TARGET_PRINT_OPERAND
20495 #define TARGET_PRINT_OPERAND aarch64_print_operand
20496
20497 #undef TARGET_PRINT_OPERAND_ADDRESS
20498 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20499
20500 #undef TARGET_OPTAB_SUPPORTED_P
20501 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20502
20503 #undef TARGET_OMIT_STRUCT_RETURN_REG
20504 #define TARGET_OMIT_STRUCT_RETURN_REG true
20505
20506 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20507 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20508 aarch64_dwarf_poly_indeterminate_value
20509
20510 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20511 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20512 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20513
20514 #undef TARGET_HARD_REGNO_NREGS
20515 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20516 #undef TARGET_HARD_REGNO_MODE_OK
20517 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20518
20519 #undef TARGET_MODES_TIEABLE_P
20520 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20521
20522 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20523 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20524 aarch64_hard_regno_call_part_clobbered
20525
20526 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20527 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20528 aarch64_remove_extra_call_preserved_regs
20529
20530 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20531 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20532 aarch64_return_call_with_max_clobbers
20533
20534 #undef TARGET_CONSTANT_ALIGNMENT
20535 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20536
20537 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20538 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20539 aarch64_stack_clash_protection_alloca_probe_range
20540
20541 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20542 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20543
20544 #undef TARGET_CAN_CHANGE_MODE_CLASS
20545 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20546
20547 #undef TARGET_SELECT_EARLY_REMAT_MODES
20548 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20549
20550 #undef TARGET_SPECULATION_SAFE_VALUE
20551 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20552
20553 #undef TARGET_ESTIMATED_POLY_VALUE
20554 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20555
20556 #undef TARGET_ATTRIBUTE_TABLE
20557 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20558
20559 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20560 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20561 aarch64_simd_clone_compute_vecsize_and_simdlen
20562
20563 #undef TARGET_SIMD_CLONE_ADJUST
20564 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20565
20566 #undef TARGET_SIMD_CLONE_USABLE
20567 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20568
20569 #undef TARGET_COMP_TYPE_ATTRIBUTES
20570 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20571
20572 #undef TARGET_GET_MULTILIB_ABI_NAME
20573 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20574
20575 #if CHECKING_P
20576 #undef TARGET_RUN_TARGET_SELFTESTS
20577 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20578 #endif /* #if CHECKING_P */
20579
20580 #undef TARGET_ASM_POST_CFI_STARTPROC
20581 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20582
20583 struct gcc_target targetm = TARGET_INITIALIZER;
20584
20585 #include "gt-aarch64.h"