gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
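/* For example, POINTER_BYTES is 8 under LP64 and 4 under ILP32.  */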
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
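/* For example, base_in = 0 and step_in = 1 describe the vector
   { 0, 1, 2, ... }, which maps onto the SVE INDEX instruction.  */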
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387          their cost higher than memmov_cost (the actual costs are 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Generic costs for vector insn classes. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Generic costs for vector insn classes. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED,
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
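/* Note that the condition codes above are ordered so that each code and its
   inverse differ only in the low bit (AARCH64_EQ/AARCH64_NE,
   AARCH64_GE/AARCH64_LT, and so on), which is what makes the XOR with 1
   sufficient here.  */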
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251      * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1253 should print an error.
1254 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1255 own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
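/* For example, if both the allocno class and the best class are
   POINTER_AND_FP_REGS, a DFmode or vector pseudo ends up in FP_REGS while
   a DImode pseudo ends up in GENERAL_REGS.  */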
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
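/* For example, a DFmode PLUS_EXPR chain gets a width of 1, keeping the
   additions in a serial chain that FMA formation can consume, whereas a
   DFmode MULT_EXPR chain uses fp_reassoc_width.  */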
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
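/* For example, under the AAPCS64 DWARF numbering (R0 at 0, V0 at 64),
   x5 maps to DWARF register 5 and v3 to DWARF register 67.  */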
1484
1485 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1486 integer, otherwise return X unmodified. */
1487 static rtx
1488 aarch64_bit_representation (rtx x)
1489 {
1490 if (CONST_DOUBLE_P (x))
1491 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1492 return x;
1493 }
1494
1495 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1496 static bool
1497 aarch64_advsimd_struct_mode_p (machine_mode mode)
1498 {
1499 return (TARGET_SIMD
1500 && (mode == OImode || mode == CImode || mode == XImode));
1501 }
1502
1503 /* Return true if MODE is an SVE predicate mode. */
1504 static bool
1505 aarch64_sve_pred_mode_p (machine_mode mode)
1506 {
1507 return (TARGET_SVE
1508 && (mode == VNx16BImode
1509 || mode == VNx8BImode
1510 || mode == VNx4BImode
1511 || mode == VNx2BImode));
1512 }
1513
1514 /* Three mutually-exclusive flags describing a vector or predicate type. */
1515 const unsigned int VEC_ADVSIMD = 1;
1516 const unsigned int VEC_SVE_DATA = 2;
1517 const unsigned int VEC_SVE_PRED = 4;
1518 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1519 a structure of 2, 3 or 4 vectors. */
1520 const unsigned int VEC_STRUCT = 8;
1521 /* Useful combinations of the above. */
1522 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1523 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1524
1525 /* Return a set of flags describing the vector properties of mode MODE.
1526 Ignore modes that are not supported by the current target. */
1527 static unsigned int
1528 aarch64_classify_vector_mode (machine_mode mode)
1529 {
1530 if (aarch64_advsimd_struct_mode_p (mode))
1531 return VEC_ADVSIMD | VEC_STRUCT;
1532
1533 if (aarch64_sve_pred_mode_p (mode))
1534 return VEC_SVE_PRED;
1535
1536 /* Make the decision based on the mode's enum value rather than its
1537 properties, so that we keep the correct classification regardless
1538 of -msve-vector-bits. */
1539 switch (mode)
1540 {
1541 /* Single SVE vectors. */
1542 case E_VNx16QImode:
1543 case E_VNx8HImode:
1544 case E_VNx4SImode:
1545 case E_VNx2DImode:
1546 case E_VNx8HFmode:
1547 case E_VNx4SFmode:
1548 case E_VNx2DFmode:
1549 return TARGET_SVE ? VEC_SVE_DATA : 0;
1550
1551 /* x2 SVE vectors. */
1552 case E_VNx32QImode:
1553 case E_VNx16HImode:
1554 case E_VNx8SImode:
1555 case E_VNx4DImode:
1556 case E_VNx16HFmode:
1557 case E_VNx8SFmode:
1558 case E_VNx4DFmode:
1559 /* x3 SVE vectors. */
1560 case E_VNx48QImode:
1561 case E_VNx24HImode:
1562 case E_VNx12SImode:
1563 case E_VNx6DImode:
1564 case E_VNx24HFmode:
1565 case E_VNx12SFmode:
1566 case E_VNx6DFmode:
1567 /* x4 SVE vectors. */
1568 case E_VNx64QImode:
1569 case E_VNx32HImode:
1570 case E_VNx16SImode:
1571 case E_VNx8DImode:
1572 case E_VNx32HFmode:
1573 case E_VNx16SFmode:
1574 case E_VNx8DFmode:
1575 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1576
1577 /* 64-bit Advanced SIMD vectors. */
1578 case E_V8QImode:
1579 case E_V4HImode:
1580 case E_V2SImode:
1581 /* ...E_V1DImode doesn't exist. */
1582 case E_V4HFmode:
1583 case E_V2SFmode:
1584 case E_V1DFmode:
1585 /* 128-bit Advanced SIMD vectors. */
1586 case E_V16QImode:
1587 case E_V8HImode:
1588 case E_V4SImode:
1589 case E_V2DImode:
1590 case E_V8HFmode:
1591 case E_V4SFmode:
1592 case E_V2DFmode:
1593 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1594
1595 default:
1596 return 0;
1597 }
1598 }
1599
1600 /* Return true if MODE is any of the data vector modes, including
1601 structure modes. */
1602 static bool
1603 aarch64_vector_data_mode_p (machine_mode mode)
1604 {
1605 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1606 }
1607
1608 /* Return true if MODE is an SVE data vector mode; either a single vector
1609 or a structure of vectors. */
1610 static bool
1611 aarch64_sve_data_mode_p (machine_mode mode)
1612 {
1613 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1614 }
1615
1616 /* Implement target hook TARGET_ARRAY_MODE. */
1617 static opt_machine_mode
1618 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1619 {
1620 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1621 && IN_RANGE (nelems, 2, 4))
1622 return mode_for_vector (GET_MODE_INNER (mode),
1623 GET_MODE_NUNITS (mode) * nelems);
1624
1625 return opt_machine_mode ();
1626 }
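/* For example, asking for an array of three VNx4SImode vectors yields
   VNx12SImode, one of the x3 SVE structure modes handled in
   aarch64_classify_vector_mode above.  */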
1627
1628 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1629 static bool
1630 aarch64_array_mode_supported_p (machine_mode mode,
1631 unsigned HOST_WIDE_INT nelems)
1632 {
1633 if (TARGET_SIMD
1634 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1635 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1636 && (nelems >= 2 && nelems <= 4))
1637 return true;
1638
1639 return false;
1640 }
1641
1642 /* Return the SVE predicate mode to use for elements that have
1643 ELEM_NBYTES bytes, if such a mode exists. */
1644
1645 opt_machine_mode
1646 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1647 {
1648 if (TARGET_SVE)
1649 {
1650 if (elem_nbytes == 1)
1651 return VNx16BImode;
1652 if (elem_nbytes == 2)
1653 return VNx8BImode;
1654 if (elem_nbytes == 4)
1655 return VNx4BImode;
1656 if (elem_nbytes == 8)
1657 return VNx2BImode;
1658 }
1659 return opt_machine_mode ();
1660 }
1661
1662 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1663
1664 static opt_machine_mode
1665 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1666 {
1667 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1668 {
1669 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1670 machine_mode pred_mode;
1671 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1672 return pred_mode;
1673 }
1674
1675 return default_get_mask_mode (nunits, nbytes);
1676 }
1677
1678 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1679
1680 static opt_machine_mode
1681 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1682 {
1683 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1684 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1685 machine_mode mode;
1686 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1687 if (inner_mode == GET_MODE_INNER (mode)
1688 && known_eq (nunits, GET_MODE_NUNITS (mode))
1689 && aarch64_sve_data_mode_p (mode))
1690 return mode;
1691 return opt_machine_mode ();
1692 }
1693
1694 /* Return the integer element mode associated with SVE mode MODE. */
1695
1696 static scalar_int_mode
1697 aarch64_sve_element_int_mode (machine_mode mode)
1698 {
1699 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1700 GET_MODE_NUNITS (mode));
1701 return int_mode_for_size (elt_bits, 0).require ();
1702 }
1703
1704 /* Return the integer vector mode associated with SVE mode MODE.
1705 Unlike mode_for_int_vector, this can handle the case in which
1706 MODE is a predicate (and thus has a different total size). */
1707
1708 static machine_mode
1709 aarch64_sve_int_mode (machine_mode mode)
1710 {
1711 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1712 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1713 }
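/* For example, VNx4SFmode has 32-bit elements, so its integer element mode
   is SImode and aarch64_sve_int_mode returns VNx4SImode.  */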
1714
1715 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1716 prefer to use the first arithmetic operand as the else value if
1717 the else value doesn't matter, since that exactly matches the SVE
1718 destructive merging form. For ternary operations we could either
1719 pick the first operand and use FMAD-like instructions or the last
1720 operand and use FMLA-like instructions; the latter seems more
1721 natural. */
1722
1723 static tree
1724 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1725 {
1726 return nops == 3 ? ops[2] : ops[0];
1727 }
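/* For example, a predicated addition with operands (a, b) prefers a as the
   else value, matching the destructive form ADD Zd, Pg/M, Zd, Zm in which
   the first source register is also the destination.  */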
1728
1729 /* Implement TARGET_HARD_REGNO_NREGS. */
1730
1731 static unsigned int
1732 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1733 {
1734 /* ??? Logically we should only need to provide a value when
1735 HARD_REGNO_MODE_OK says that the combination is valid,
1736 but at the moment we need to handle all modes. Just ignore
1737 any runtime parts for registers that can't store them. */
1738 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1739 switch (aarch64_regno_regclass (regno))
1740 {
1741 case FP_REGS:
1742 case FP_LO_REGS:
1743 case FP_LO8_REGS:
1744 if (aarch64_sve_data_mode_p (mode))
1745 return exact_div (GET_MODE_SIZE (mode),
1746 BYTES_PER_SVE_VECTOR).to_constant ();
1747 return CEIL (lowest_size, UNITS_PER_VREG);
1748 case PR_REGS:
1749 case PR_LO_REGS:
1750 case PR_HI_REGS:
1751 return 1;
1752 default:
1753 return CEIL (lowest_size, UNITS_PER_WORD);
1754 }
1755 gcc_unreachable ();
1756 }
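/* For example, a V4SImode value occupies a single FP/SIMD register, whereas
   a TImode value needs two general registers; SVE data modes always occupy
   a whole number of Z registers.  */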
1757
1758 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1759
1760 static bool
1761 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1762 {
1763 if (GET_MODE_CLASS (mode) == MODE_CC)
1764 return regno == CC_REGNUM;
1765
1766 if (regno == VG_REGNUM)
1767 /* This must have the same size as _Unwind_Word. */
1768 return mode == DImode;
1769
1770 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1771 if (vec_flags & VEC_SVE_PRED)
1772 return PR_REGNUM_P (regno);
1773
1774 if (PR_REGNUM_P (regno))
1775 return 0;
1776
1777 if (regno == SP_REGNUM)
1778 /* The purpose of comparing with ptr_mode is to support the
1779 global register variable associated with the stack pointer
1780 register via the syntax of asm ("wsp") in ILP32. */
1781 return mode == Pmode || mode == ptr_mode;
1782
1783 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1784 return mode == Pmode;
1785
1786 if (GP_REGNUM_P (regno))
1787 {
1788 if (known_le (GET_MODE_SIZE (mode), 8))
1789 return true;
1790 else if (known_le (GET_MODE_SIZE (mode), 16))
1791 return (regno & 1) == 0;
1792 }
1793 else if (FP_REGNUM_P (regno))
1794 {
1795 if (vec_flags & VEC_STRUCT)
1796 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1797 else
1798 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1799 }
1800
1801 return false;
1802 }
1803
1804 /* Return true if this is a definition of a vectorized simd function. */
1805
1806 static bool
1807 aarch64_simd_decl_p (tree fndecl)
1808 {
1809 tree fntype;
1810
1811 if (fndecl == NULL)
1812 return false;
1813 fntype = TREE_TYPE (fndecl);
1814 if (fntype == NULL)
1815 return false;
1816
1817 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1818 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1819 return true;
1820
1821 return false;
1822 }
1823
1824 /* Return the mode a register save/restore should use. DImode for integer
1825 registers, DFmode for FP registers in non-SIMD functions (they only save
1826 the bottom half of a 128 bit register), or TFmode for FP registers in
1827 SIMD functions. */
1828
1829 static machine_mode
1830 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1831 {
1832 return GP_REGNUM_P (regno)
1833 ? E_DImode
1834 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1835 }
1836
1837 /* Return true if the instruction is a call to a SIMD function, false
1838 if it is not a SIMD function or if we do not know anything about
1839 the function. */
1840
1841 static bool
1842 aarch64_simd_call_p (rtx_insn *insn)
1843 {
1844 rtx symbol;
1845 rtx call;
1846 tree fndecl;
1847
1848 gcc_assert (CALL_P (insn));
1849 call = get_call_rtx_from (insn);
1850 symbol = XEXP (XEXP (call, 0), 0);
1851 if (GET_CODE (symbol) != SYMBOL_REF)
1852 return false;
1853 fndecl = SYMBOL_REF_DECL (symbol);
1854 if (!fndecl)
1855 return false;
1856
1857 return aarch64_simd_decl_p (fndecl);
1858 }
1859
1860 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1861 a function that uses the SIMD ABI, take advantage of the extra
1862 call-preserved registers that the ABI provides. */
1863
1864 void
1865 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1866 HARD_REG_SET *return_set)
1867 {
1868 if (aarch64_simd_call_p (insn))
1869 {
1870 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1871 if (FP_SIMD_SAVED_REGNUM_P (regno))
1872 CLEAR_HARD_REG_BIT (*return_set, regno);
1873 }
1874 }
1875
1876 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1877 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1878 clobbers the top 64 bits when restoring the bottom 64 bits. */
1879
1880 static bool
1881 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1882 machine_mode mode)
1883 {
1884 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1885 return FP_REGNUM_P (regno)
1886 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1887 }
1888
1889 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1890
1891 rtx_insn *
1892 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1893 {
1894 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1895
1896 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1897 return call_1;
1898 else
1899 return call_2;
1900 }
1901
1902 /* Implement REGMODE_NATURAL_SIZE. */
1903 poly_uint64
1904 aarch64_regmode_natural_size (machine_mode mode)
1905 {
1906 /* The natural size for SVE data modes is one SVE data vector,
1907 and similarly for predicates. We can't independently modify
1908 anything smaller than that. */
1909 /* ??? For now, only do this for variable-width SVE registers.
1910 Doing it for constant-sized registers breaks lower-subreg.c. */
1911 /* ??? And once that's fixed, we should probably have similar
1912 code for Advanced SIMD. */
1913 if (!aarch64_sve_vg.is_constant ())
1914 {
1915 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1916 if (vec_flags & VEC_SVE_PRED)
1917 return BYTES_PER_SVE_PRED;
1918 if (vec_flags & VEC_SVE_DATA)
1919 return BYTES_PER_SVE_VECTOR;
1920 }
1921 return UNITS_PER_WORD;
1922 }
1923
1924 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1925 machine_mode
1926 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1927 machine_mode mode)
1928 {
1929 /* The predicate mode determines which bits are significant and
1930 which are "don't care". Decreasing the number of lanes would
1931 lose data while increasing the number of lanes would make bits
1932 unnecessarily significant. */
1933 if (PR_REGNUM_P (regno))
1934 return mode;
1935 if (known_ge (GET_MODE_SIZE (mode), 4))
1936 return mode;
1937 else
1938 return SImode;
1939 }
1940
1941 /* Return true if I's bits are consecutive ones from the MSB. */
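/* For example (illustrative): I == 0xffffffffffff0000 gives -I == 0x10000,
   whose exact_log2 is 16 rather than -1, so the function returns true;
   I == 0xff00 gives a negation that is not a power of two, so the
   function returns false.  */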
1942 bool
1943 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1944 {
1945 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1946 }
1947
1948 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1949 that strcpy from constants will be faster. */
1950
1951 static HOST_WIDE_INT
1952 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1953 {
1954 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1955 return MAX (align, BITS_PER_WORD);
1956 return align;
1957 }
1958
1959 /* Return true if calls to DECL should be treated as
1960 long-calls (i.e. called via a register). */
1961 static bool
1962 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1963 {
1964 return false;
1965 }
1966
1967 /* Return true if calls to symbol-ref SYM should be treated as
1968 long-calls (i.e. called via a register). */
1969 bool
1970 aarch64_is_long_call_p (rtx sym)
1971 {
1972 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1973 }
1974
1975 /* Return true if calls to symbol-ref SYM should not go through
1976 plt stubs. */
1977
1978 bool
1979 aarch64_is_noplt_call_p (rtx sym)
1980 {
1981 const_tree decl = SYMBOL_REF_DECL (sym);
1982
1983 if (flag_pic
1984 && decl
1985 && (!flag_plt
1986 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1987 && !targetm.binds_local_p (decl))
1988 return true;
1989
1990 return false;
1991 }
1992
1993 /* Return true if the offsets to a zero/sign-extract operation
1994 represent an expression that matches an extend operation. The
1995 operands represent the parameters from
1996
1997 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
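/* For example, MULT_IMM == 4 with EXTRACT_IMM == 34 satisfies the checks
   below: extracting the low 34 bits of (reg * 4) is equivalent to
   extending the low 32 bits of the register and then shifting the
   result left by 2, which is the extend-plus-shift form used in
   addressing modes.  */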
1998 bool
1999 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2000 rtx extract_imm)
2001 {
2002 HOST_WIDE_INT mult_val, extract_val;
2003
2004 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2005 return false;
2006
2007 mult_val = INTVAL (mult_imm);
2008 extract_val = INTVAL (extract_imm);
2009
2010 if (extract_val > 8
2011 && extract_val < GET_MODE_BITSIZE (mode)
2012 && exact_log2 (extract_val & ~7) > 0
2013 && (extract_val & 7) <= 4
2014 && mult_val == (1 << (extract_val & 7)))
2015 return true;
2016
2017 return false;
2018 }
2019
2020 /* Emit an insn that's a simple single-set. Both the operands must be
2021 known to be valid. */
2022 inline static rtx_insn *
2023 emit_set_insn (rtx x, rtx y)
2024 {
2025 return emit_insn (gen_rtx_SET (x, y));
2026 }
2027
2028 /* X and Y are two things to compare using CODE. Emit the compare insn and
2029 return the rtx for the CC register in the proper mode. */
2030 rtx
2031 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2032 {
2033 machine_mode mode = SELECT_CC_MODE (code, x, y);
2034 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
2035
2036 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2037 return cc_reg;
2038 }
2039
2040 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2041
2042 static rtx
2043 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2044 machine_mode y_mode)
2045 {
2046 if (y_mode == E_QImode || y_mode == E_HImode)
2047 {
2048 if (CONST_INT_P (y))
2049 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2050 else
2051 {
2052 rtx t, cc_reg;
2053 machine_mode cc_mode;
2054
2055 t = gen_rtx_ZERO_EXTEND (SImode, y);
2056 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2057 cc_mode = CC_SWPmode;
2058 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2059 emit_set_insn (cc_reg, t);
2060 return cc_reg;
2061 }
2062 }
2063
2064 return aarch64_gen_compare_reg (code, x, y);
2065 }
2066
2067 /* Build the SYMBOL_REF for __tls_get_addr. */
2068
2069 static GTY(()) rtx tls_get_addr_libfunc;
2070
2071 rtx
2072 aarch64_tls_get_addr (void)
2073 {
2074 if (!tls_get_addr_libfunc)
2075 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2076 return tls_get_addr_libfunc;
2077 }
2078
2079 /* Return the TLS model to use for ADDR. */
2080
2081 static enum tls_model
2082 tls_symbolic_operand_type (rtx addr)
2083 {
2084 enum tls_model tls_kind = TLS_MODEL_NONE;
2085 if (GET_CODE (addr) == CONST)
2086 {
2087 poly_int64 addend;
2088 rtx sym = strip_offset (addr, &addend);
2089 if (GET_CODE (sym) == SYMBOL_REF)
2090 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2091 }
2092 else if (GET_CODE (addr) == SYMBOL_REF)
2093 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2094
2095 return tls_kind;
2096 }
2097
2098 /* We'll allow lo_sums in addresses in our legitimate addresses,
2099 so that combine can take care of combining addresses where
2100 necessary, but for generation purposes we'll generate the address
2101 as:
2102 RTL Absolute
2103 tmp = hi (symbol_ref); adrp x1, foo
2104 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2105 nop
2106
2107 PIC TLS
2108 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2109 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2110 bl __tls_get_addr
2111 nop
2112
2113 Load TLS symbol, depending on TLS mechanism and TLS access model.
2114
2115 Global Dynamic - Traditional TLS:
2116 adrp tmp, :tlsgd:imm
2117 add dest, tmp, #:tlsgd_lo12:imm
2118 bl __tls_get_addr
2119
2120 Global Dynamic - TLS Descriptors:
2121 adrp dest, :tlsdesc:imm
2122 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2123 add dest, dest, #:tlsdesc_lo12:imm
2124 blr tmp
2125 mrs tp, tpidr_el0
2126 add dest, dest, tp
2127
2128 Initial Exec:
2129 mrs tp, tpidr_el0
2130 adrp tmp, :gottprel:imm
2131 ldr dest, [tmp, #:gottprel_lo12:imm]
2132 add dest, dest, tp
2133
2134 Local Exec:
2135 mrs tp, tpidr_el0
2136 add t0, tp, #:tprel_hi12:imm, lsl #12
2137 add t0, t0, #:tprel_lo12_nc:imm
2138 */
2139
2140 static void
2141 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2142 enum aarch64_symbol_type type)
2143 {
2144 switch (type)
2145 {
2146 case SYMBOL_SMALL_ABSOLUTE:
2147 {
2148 /* In ILP32, the mode of dest can be either SImode or DImode. */
2149 rtx tmp_reg = dest;
2150 machine_mode mode = GET_MODE (dest);
2151
2152 gcc_assert (mode == Pmode || mode == ptr_mode);
2153
2154 if (can_create_pseudo_p ())
2155 tmp_reg = gen_reg_rtx (mode);
2156
2157 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2158 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2159 return;
2160 }
2161
2162 case SYMBOL_TINY_ABSOLUTE:
2163 emit_insn (gen_rtx_SET (dest, imm));
2164 return;
2165
2166 case SYMBOL_SMALL_GOT_28K:
2167 {
2168 machine_mode mode = GET_MODE (dest);
2169 rtx gp_rtx = pic_offset_table_rtx;
2170 rtx insn;
2171 rtx mem;
2172
2173 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2174 here before RTL expansion. Tree IVOPTS will generate an RTL pattern
2175 to compute rtx costs, in which case pic_offset_table_rtx is not
2176 initialized. In that case there is no need to generate the first
2177 adrp instruction, as the final cost for global variable access is
2178 one instruction. */
2179 if (gp_rtx != NULL)
2180 {
2181 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2182 are using the page base as the GOT base, the first page may be wasted;
2183 in the worst case there is only 28K of space for the GOT).
2184
2185 The generated instruction sequence for accessing a global variable
2186 is:
2187
2188 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2189
2190 Only one instruction is needed. But we must initialize
2191 pic_offset_table_rtx properly. We generate an initialization insn for
2192 every global access, and allow CSE to remove all the redundant ones.
2193
2194 The final instruction sequence will look like the following
2195 when multiple global variables are accessed.
2196
2197 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2198
2199 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2200 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2201 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2202 ... */
2203
2204 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2205 crtl->uses_pic_offset_table = 1;
2206 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2207
2208 if (mode != GET_MODE (gp_rtx))
2209 gp_rtx = gen_lowpart (mode, gp_rtx);
2210
2211 }
2212
2213 if (mode == ptr_mode)
2214 {
2215 if (mode == DImode)
2216 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2217 else
2218 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2219
2220 mem = XVECEXP (SET_SRC (insn), 0, 0);
2221 }
2222 else
2223 {
2224 gcc_assert (mode == Pmode);
2225
2226 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2227 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2228 }
2229
2230 /* The operand is expected to be a MEM. Whenever the related insn
2231 pattern changes, the code above that calculates MEM should be
2232 updated. */
2233 gcc_assert (GET_CODE (mem) == MEM);
2234 MEM_READONLY_P (mem) = 1;
2235 MEM_NOTRAP_P (mem) = 1;
2236 emit_insn (insn);
2237 return;
2238 }
2239
2240 case SYMBOL_SMALL_GOT_4G:
2241 {
2242 /* In ILP32, the mode of dest can be either SImode or DImode,
2243 while the got entry is always of SImode size. The mode of
2244 dest depends on how dest is used: if dest is assigned to a
2245 pointer (e.g. in memory), it has SImode; it may have
2246 DImode if dest is dereferenced to access memory.
2247 This is why we have to handle three different ldr_got_small
2248 patterns here (two patterns for ILP32). */
2249
2250 rtx insn;
2251 rtx mem;
2252 rtx tmp_reg = dest;
2253 machine_mode mode = GET_MODE (dest);
2254
2255 if (can_create_pseudo_p ())
2256 tmp_reg = gen_reg_rtx (mode);
2257
2258 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2259 if (mode == ptr_mode)
2260 {
2261 if (mode == DImode)
2262 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2263 else
2264 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2265
2266 mem = XVECEXP (SET_SRC (insn), 0, 0);
2267 }
2268 else
2269 {
2270 gcc_assert (mode == Pmode);
2271
2272 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2273 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2274 }
2275
2276 gcc_assert (GET_CODE (mem) == MEM);
2277 MEM_READONLY_P (mem) = 1;
2278 MEM_NOTRAP_P (mem) = 1;
2279 emit_insn (insn);
2280 return;
2281 }
2282
2283 case SYMBOL_SMALL_TLSGD:
2284 {
2285 rtx_insn *insns;
2286 machine_mode mode = GET_MODE (dest);
2287 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2288
2289 start_sequence ();
2290 if (TARGET_ILP32)
2291 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2292 else
2293 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2294 insns = get_insns ();
2295 end_sequence ();
2296
2297 RTL_CONST_CALL_P (insns) = 1;
2298 emit_libcall_block (insns, dest, result, imm);
2299 return;
2300 }
2301
2302 case SYMBOL_SMALL_TLSDESC:
2303 {
2304 machine_mode mode = GET_MODE (dest);
2305 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2306 rtx tp;
2307
2308 gcc_assert (mode == Pmode || mode == ptr_mode);
2309
2310 /* In ILP32, the got entry is always of SImode size. Unlike
2311 small GOT, the dest is fixed at reg 0. */
2312 if (TARGET_ILP32)
2313 emit_insn (gen_tlsdesc_small_si (imm));
2314 else
2315 emit_insn (gen_tlsdesc_small_di (imm));
2316 tp = aarch64_load_tp (NULL);
2317
2318 if (mode != Pmode)
2319 tp = gen_lowpart (mode, tp);
2320
2321 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2322 if (REG_P (dest))
2323 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2324 return;
2325 }
2326
2327 case SYMBOL_SMALL_TLSIE:
2328 {
2329 /* In ILP32, the mode of dest can be either SImode or DImode,
2330 while the got entry is always of SImode size. The mode of
2331 dest depends on how dest is used: if dest is assigned to a
2332 pointer (e.g. in memory), it has SImode; it may have
2333 DImode if dest is dereferenced to access memory.
2334 This is why we have to handle three different tlsie_small
2335 patterns here (two patterns for ILP32). */
2336 machine_mode mode = GET_MODE (dest);
2337 rtx tmp_reg = gen_reg_rtx (mode);
2338 rtx tp = aarch64_load_tp (NULL);
2339
2340 if (mode == ptr_mode)
2341 {
2342 if (mode == DImode)
2343 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2344 else
2345 {
2346 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2347 tp = gen_lowpart (mode, tp);
2348 }
2349 }
2350 else
2351 {
2352 gcc_assert (mode == Pmode);
2353 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2354 }
2355
2356 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2357 if (REG_P (dest))
2358 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2359 return;
2360 }
2361
2362 case SYMBOL_TLSLE12:
2363 case SYMBOL_TLSLE24:
2364 case SYMBOL_TLSLE32:
2365 case SYMBOL_TLSLE48:
2366 {
2367 machine_mode mode = GET_MODE (dest);
2368 rtx tp = aarch64_load_tp (NULL);
2369
2370 if (mode != Pmode)
2371 tp = gen_lowpart (mode, tp);
2372
2373 switch (type)
2374 {
2375 case SYMBOL_TLSLE12:
2376 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2377 (dest, tp, imm));
2378 break;
2379 case SYMBOL_TLSLE24:
2380 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2381 (dest, tp, imm));
2382 break;
2383 case SYMBOL_TLSLE32:
2384 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2385 (dest, imm));
2386 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2387 (dest, dest, tp));
2388 break;
2389 case SYMBOL_TLSLE48:
2390 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2391 (dest, imm));
2392 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2393 (dest, dest, tp));
2394 break;
2395 default:
2396 gcc_unreachable ();
2397 }
2398
2399 if (REG_P (dest))
2400 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2401 return;
2402 }
2403
2404 case SYMBOL_TINY_GOT:
2405 emit_insn (gen_ldr_got_tiny (dest, imm));
2406 return;
2407
2408 case SYMBOL_TINY_TLSIE:
2409 {
2410 machine_mode mode = GET_MODE (dest);
2411 rtx tp = aarch64_load_tp (NULL);
2412
2413 if (mode == ptr_mode)
2414 {
2415 if (mode == DImode)
2416 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2417 else
2418 {
2419 tp = gen_lowpart (mode, tp);
2420 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2421 }
2422 }
2423 else
2424 {
2425 gcc_assert (mode == Pmode);
2426 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2427 }
2428
2429 if (REG_P (dest))
2430 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2431 return;
2432 }
2433
2434 default:
2435 gcc_unreachable ();
2436 }
2437 }
2438
2439 /* Emit a move from SRC to DEST. Assume that the move expanders can
2440 handle all moves if !can_create_pseudo_p (). The distinction is
2441 important because, unlike emit_move_insn, the move expanders know
2442 how to force Pmode objects into the constant pool even when the
2443 constant pool address is not itself legitimate. */
2444 static rtx
2445 aarch64_emit_move (rtx dest, rtx src)
2446 {
2447 return (can_create_pseudo_p ()
2448 ? emit_move_insn (dest, src)
2449 : emit_move_insn_1 (dest, src));
2450 }
2451
2452 /* Apply UNOPTAB to OP and store the result in DEST. */
2453
2454 static void
2455 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2456 {
2457 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2458 if (dest != tmp)
2459 emit_move_insn (dest, tmp);
2460 }
2461
2462 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2463
2464 static void
2465 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2466 {
2467 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2468 OPTAB_DIRECT);
2469 if (dest != tmp)
2470 emit_move_insn (dest, tmp);
2471 }
2472
2473 /* Split a 128-bit move operation into two 64-bit move operations,
2474 taking care to handle partial overlap of register to register
2475 copies. Special cases are needed when moving between GP regs and
2476 FP regs. SRC can be a register, constant or memory; DST a register
2477 or memory. If either operand is memory it must not have any side
2478 effects. */
2479 void
2480 aarch64_split_128bit_move (rtx dst, rtx src)
2481 {
2482 rtx dst_lo, dst_hi;
2483 rtx src_lo, src_hi;
2484
2485 machine_mode mode = GET_MODE (dst);
2486
2487 gcc_assert (mode == TImode || mode == TFmode);
2488 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2489 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2490
2491 if (REG_P (dst) && REG_P (src))
2492 {
2493 int src_regno = REGNO (src);
2494 int dst_regno = REGNO (dst);
2495
2496 /* Handle FP <-> GP regs. */
2497 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2498 {
2499 src_lo = gen_lowpart (word_mode, src);
2500 src_hi = gen_highpart (word_mode, src);
2501
2502 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2503 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2504 return;
2505 }
2506 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2507 {
2508 dst_lo = gen_lowpart (word_mode, dst);
2509 dst_hi = gen_highpart (word_mode, dst);
2510
2511 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2512 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2513 return;
2514 }
2515 }
2516
2517 dst_lo = gen_lowpart (word_mode, dst);
2518 dst_hi = gen_highpart (word_mode, dst);
2519 src_lo = gen_lowpart (word_mode, src);
2520 src_hi = gen_highpart_mode (word_mode, mode, src);
2521
2522 /* At most one pairing may overlap. */
2523 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2524 {
2525 aarch64_emit_move (dst_hi, src_hi);
2526 aarch64_emit_move (dst_lo, src_lo);
2527 }
2528 else
2529 {
2530 aarch64_emit_move (dst_lo, src_lo);
2531 aarch64_emit_move (dst_hi, src_hi);
2532 }
2533 }
2534
2535 bool
2536 aarch64_split_128bit_move_p (rtx dst, rtx src)
2537 {
2538 return (! REG_P (src)
2539 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2540 }
2541
2542 /* Split a complex SIMD combine. */
2543
2544 void
2545 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2546 {
2547 machine_mode src_mode = GET_MODE (src1);
2548 machine_mode dst_mode = GET_MODE (dst);
2549
2550 gcc_assert (VECTOR_MODE_P (dst_mode));
2551 gcc_assert (register_operand (dst, dst_mode)
2552 && register_operand (src1, src_mode)
2553 && register_operand (src2, src_mode));
2554
2555 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2556 return;
2557 }
2558
2559 /* Split a complex SIMD move. */
2560
2561 void
2562 aarch64_split_simd_move (rtx dst, rtx src)
2563 {
2564 machine_mode src_mode = GET_MODE (src);
2565 machine_mode dst_mode = GET_MODE (dst);
2566
2567 gcc_assert (VECTOR_MODE_P (dst_mode));
2568
2569 if (REG_P (dst) && REG_P (src))
2570 {
2571 gcc_assert (VECTOR_MODE_P (src_mode));
2572 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2573 }
2574 }
2575
2576 bool
2577 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2578 machine_mode ymode, rtx y)
2579 {
2580 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2581 gcc_assert (r != NULL);
2582 return rtx_equal_p (x, r);
2583 }
2584
2585
2586 /* Return TARGET if it is nonnull and a register of mode MODE.
2587 Otherwise, return a fresh register of mode MODE if we can,
2588 or TARGET reinterpreted as MODE if we can't. */
2589
2590 static rtx
2591 aarch64_target_reg (rtx target, machine_mode mode)
2592 {
2593 if (target && REG_P (target) && GET_MODE (target) == mode)
2594 return target;
2595 if (!can_create_pseudo_p ())
2596 {
2597 gcc_assert (target);
2598 return gen_lowpart (mode, target);
2599 }
2600 return gen_reg_rtx (mode);
2601 }
2602
2603 /* Return a register that contains the constant in BUILDER, given that
2604 the constant is a legitimate move operand. Use TARGET as the register
2605 if it is nonnull and convenient. */
2606
2607 static rtx
2608 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2609 {
2610 rtx src = builder.build ();
2611 target = aarch64_target_reg (target, GET_MODE (src));
2612 emit_insn (gen_rtx_SET (target, src));
2613 return target;
2614 }
2615
2616 static rtx
2617 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2618 {
2619 if (can_create_pseudo_p ())
2620 return force_reg (mode, value);
2621 else
2622 {
2623 gcc_assert (x);
2624 aarch64_emit_move (x, value);
2625 return x;
2626 }
2627 }
2628
2629 /* Return true if predicate value X is a constant in which every element
2630 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2631 value, i.e. as a predicate in which all bits are significant. */
2632
2633 static bool
2634 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2635 {
2636 if (GET_CODE (x) != CONST_VECTOR)
2637 return false;
2638
2639 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2640 GET_MODE_NUNITS (GET_MODE (x)));
2641 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2642 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2643 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2644
2645 unsigned int nelts = const_vector_encoded_nelts (x);
2646 for (unsigned int i = 0; i < nelts; ++i)
2647 {
2648 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2649 if (!CONST_INT_P (elt))
2650 return false;
2651
2652 builder.quick_push (elt);
2653 for (unsigned int j = 1; j < factor; ++j)
2654 builder.quick_push (const0_rtx);
2655 }
2656 builder.finalize ();
2657 return true;
2658 }
2659
2660 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2661 widest predicate element size it can have (that is, the largest size
2662 for which each element would still be 0 or 1). */
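/* For example, if the constant has four patterns and the only nonzero
   encoded elements sit at indices that are multiples of 4, each group
   of four bits behaves as a single 0-or-1 element, so the result is 4;
   a nonzero element at any odd index forces a result of 1.  */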
2663
2664 unsigned int
2665 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2666 {
2667 /* Start with the most optimistic assumption: that we only need
2668 one bit per pattern. This is what we will use if only the first
2669 bit in each pattern is ever set. */
2670 unsigned int mask = GET_MODE_SIZE (DImode);
2671 mask |= builder.npatterns ();
2672
2673 /* Look for set bits. */
2674 unsigned int nelts = builder.encoded_nelts ();
2675 for (unsigned int i = 1; i < nelts; ++i)
2676 if (INTVAL (builder.elt (i)) != 0)
2677 {
2678 if (i & 1)
2679 return 1;
2680 mask |= i;
2681 }
2682 return mask & -mask;
2683 }
2684
2685 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2686 that the constant would have with predicate element size ELT_SIZE
2687 (ignoring the upper bits in each element) and return:
2688
2689 * -1 if all bits are set
2690 * N if the predicate has N leading set bits followed by all clear bits
2691 * 0 if the predicate does not have any of these forms. */
2692
2693 int
2694 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2695 unsigned int elt_size)
2696 {
2697 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2698 followed by set bits. */
2699 if (builder.nelts_per_pattern () == 3)
2700 return 0;
2701
2702 /* Skip over leading set bits. */
2703 unsigned int nelts = builder.encoded_nelts ();
2704 unsigned int i = 0;
2705 for (; i < nelts; i += elt_size)
2706 if (INTVAL (builder.elt (i)) == 0)
2707 break;
2708 unsigned int vl = i / elt_size;
2709
2710 /* Check for the all-true case. */
2711 if (i == nelts)
2712 return -1;
2713
2714 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2715 repeating pattern of set bits followed by clear bits. */
2716 if (builder.nelts_per_pattern () != 2)
2717 return 0;
2718
2719 /* We have a "foreground" value and a duplicated "background" value.
2720 If the background might repeat and the last set bit belongs to it,
2721 we might have set bits followed by clear bits followed by set bits. */
2722 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2723 return 0;
2724
2725 /* Make sure that the rest are all clear. */
2726 for (; i < nelts; i += elt_size)
2727 if (INTVAL (builder.elt (i)) != 0)
2728 return 0;
2729
2730 return vl;
2731 }
2732
2733 /* See if there is an svpattern that encodes an SVE predicate of mode
2734 PRED_MODE in which the first VL bits are set and the rest are clear.
2735 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2736 A VL of -1 indicates an all-true vector. */
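/* For example, a VL of 6 maps to AARCH64_SV_VL6 and a VL of 64 maps to
   AARCH64_SV_VL64, whereas a VL such as 200 only matches if the mode has
   a constant number of elements and one of the MUL3, MUL4, POW2 or ALL
   patterns happens to describe it.  */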
2737
2738 aarch64_svpattern
2739 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2740 {
2741 if (vl < 0)
2742 return AARCH64_SV_ALL;
2743
2744 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2745 return AARCH64_NUM_SVPATTERNS;
2746
2747 if (vl >= 1 && vl <= 8)
2748 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2749
2750 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2751 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2752
2753 int max_vl;
2754 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2755 {
2756 if (vl == (max_vl / 3) * 3)
2757 return AARCH64_SV_MUL3;
2758 /* These would only trigger for non-power-of-2 lengths. */
2759 if (vl == (max_vl & -4))
2760 return AARCH64_SV_MUL4;
2761 if (vl == (1 << floor_log2 (max_vl)))
2762 return AARCH64_SV_POW2;
2763 if (vl == max_vl)
2764 return AARCH64_SV_ALL;
2765 }
2766 return AARCH64_NUM_SVPATTERNS;
2767 }
2768
2769 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2770 bits has the lowest bit set and the upper bits clear. This is the
2771 VNx16BImode equivalent of a PTRUE for controlling elements of
2772 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2773 all bits are significant, even the upper zeros. */
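/* For example, aarch64_ptrue_all (4) builds the repeating bit pattern
   1, 0, 0, 0, ..., which is the VNx16BI image of a PTRUE that controls
   32-bit elements.  */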
2774
2775 rtx
2776 aarch64_ptrue_all (unsigned int elt_size)
2777 {
2778 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2779 builder.quick_push (const1_rtx);
2780 for (unsigned int i = 1; i < elt_size; ++i)
2781 builder.quick_push (const0_rtx);
2782 return builder.build ();
2783 }
2784
2785 /* Return an all-true predicate register of mode MODE. */
2786
2787 rtx
2788 aarch64_ptrue_reg (machine_mode mode)
2789 {
2790 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2791 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2792 return gen_lowpart (mode, reg);
2793 }
2794
2795 /* Return an all-false predicate register of mode MODE. */
2796
2797 rtx
2798 aarch64_pfalse_reg (machine_mode mode)
2799 {
2800 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2801 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2802 return gen_lowpart (mode, reg);
2803 }
2804
2805 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2806 true, or alternatively if we know that the operation predicated by
2807 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2808 aarch64_sve_gp_strictness operand that describes the operation
2809 predicated by PRED1[0]. */
2810
2811 bool
2812 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2813 {
2814 machine_mode mode = GET_MODE (pred2);
2815 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2816 && mode == GET_MODE (pred1[0])
2817 && aarch64_sve_gp_strictness (pred1[1], SImode));
2818 return (pred1[0] == CONSTM1_RTX (mode)
2819 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2820 || rtx_equal_p (pred1[0], pred2));
2821 }
2822
2823 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
2824 for it. PRED2[0] is the predicate for the instruction whose result
2825 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
2826 for it. Return true if we can prove that the two predicates are
2827 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
2828 with PRED1[0] without changing behavior. */
2829
2830 bool
2831 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
2832 {
2833 machine_mode mode = GET_MODE (pred1[0]);
2834 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2835 && mode == GET_MODE (pred2[0])
2836 && aarch64_sve_ptrue_flag (pred1[1], SImode)
2837 && aarch64_sve_ptrue_flag (pred2[1], SImode));
2838
2839 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
2840 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
2841 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
2842 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
2843 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
2844 }
2845
2846 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
2847 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
2848 Use TARGET as the target register if nonnull and convenient. */
2849
2850 static rtx
2851 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
2852 machine_mode data_mode, rtx op1, rtx op2)
2853 {
2854 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
2855 expand_operand ops[5];
2856 create_output_operand (&ops[0], target, pred_mode);
2857 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
2858 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
2859 create_input_operand (&ops[3], op1, data_mode);
2860 create_input_operand (&ops[4], op2, data_mode);
2861 expand_insn (icode, 5, ops);
2862 return ops[0].value;
2863 }
2864
2865 /* Use a comparison to convert integer vector SRC into MODE, which is
2866 the corresponding SVE predicate mode. Use TARGET for the result
2867 if it's nonnull and convenient. */
2868
2869 static rtx
2870 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2871 {
2872 machine_mode src_mode = GET_MODE (src);
2873 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
2874 src, CONST0_RTX (src_mode));
2875 }
2876
2877 /* Return true if we can move VALUE into a register using a single
2878 CNT[BHWD] instruction. */
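/* For example, the poly_int64 (2, 2) -- two elements per 128-bit
   quadword -- can be loaded with a plain CNTD, and (6, 6) with a CNTD
   that uses MUL #3.  A coefficient of 34 would be rejected because it
   would need a multiplier greater than 16 for every element size.  */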
2879
2880 static bool
2881 aarch64_sve_cnt_immediate_p (poly_int64 value)
2882 {
2883 HOST_WIDE_INT factor = value.coeffs[0];
2884 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2885 return (value.coeffs[1] == factor
2886 && IN_RANGE (factor, 2, 16 * 16)
2887 && (factor & 1) == 0
2888 && factor <= 16 * (factor & -factor));
2889 }
2890
2891 /* Likewise for rtx X. */
2892
2893 bool
2894 aarch64_sve_cnt_immediate_p (rtx x)
2895 {
2896 poly_int64 value;
2897 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2898 }
2899
2900 /* Return the asm string for an instruction with a CNT-like vector size
2901 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2902 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2903 first part of the operands template (the part that comes before the
2904 vector size itself). FACTOR is the multiplier times the number of elements in each quadword.
2905 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2906 If it is zero, we can use any element size. */
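/* For instance, FACTOR == 2 with NELTS_PER_VQ == 0 is expected to give a
   plain "cntd" (multiplier 1), while FACTOR == 6 with NELTS_PER_VQ == 2
   is expected to give "cntd\t..., all, mul #3".  */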
2907
2908 static char *
2909 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2910 unsigned int factor,
2911 unsigned int nelts_per_vq)
2912 {
2913 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2914
2915 if (nelts_per_vq == 0)
2916 /* There is some overlap in the ranges of the four CNT instructions.
2917 Here we always use the smallest possible element size, so that the
2918 multiplier is 1 wherever possible. */
2919 nelts_per_vq = factor & -factor;
2920 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2921 gcc_assert (IN_RANGE (shift, 1, 4));
2922 char suffix = "dwhb"[shift - 1];
2923
2924 factor >>= shift;
2925 unsigned int written;
2926 if (factor == 1)
2927 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2928 prefix, suffix, operands);
2929 else
2930 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2931 prefix, suffix, operands, factor);
2932 gcc_assert (written < sizeof (buffer));
2933 return buffer;
2934 }
2935
2936 /* Return the asm string for an instruction with a CNT-like vector size
2937 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2938 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2939 first part of the operands template (the part that comes before the
2940 vector size itself). X is the value of the vector size operand,
2941 as a polynomial integer rtx. */
2942
2943 char *
2944 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2945 rtx x)
2946 {
2947 poly_int64 value = rtx_to_poly_int64 (x);
2948 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2949 return aarch64_output_sve_cnt_immediate (prefix, operands,
2950 value.coeffs[1], 0);
2951 }
2952
2953 /* Return true if we can add VALUE to a register using a single ADDVL
2954 or ADDPL instruction. */
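/* For example, one full vector of bytes is the poly_int64 (16, 16),
   which a single ADDVL can add, while one predicate width is (2, 2),
   which a single ADDPL can add.  */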
2955
2956 static bool
2957 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2958 {
2959 HOST_WIDE_INT factor = value.coeffs[0];
2960 if (factor == 0 || value.coeffs[1] != factor)
2961 return false;
2962 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2963 and a value of 16 is one vector width. */
2964 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2965 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2966 }
2967
2968 /* Likewise for rtx X. */
2969
2970 bool
2971 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2972 {
2973 poly_int64 value;
2974 return (poly_int_rtx_p (x, &value)
2975 && aarch64_sve_addvl_addpl_immediate_p (value));
2976 }
2977
2978 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2979 and storing the result in operand 0. */
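/* For example, an OFFSET of (32, 32) is expected to produce
   "addvl\t%x0, %x1, #2" and an OFFSET of (-2, -2) to produce
   "addpl\t%x0, %x1, #-1", unless the INC/DEC form below applies.  */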
2980
2981 char *
2982 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2983 {
2984 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2985 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2986 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2987
2988 /* Use INC or DEC if possible. */
2989 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2990 {
2991 if (aarch64_sve_cnt_immediate_p (offset_value))
2992 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2993 offset_value.coeffs[1], 0);
2994 if (aarch64_sve_cnt_immediate_p (-offset_value))
2995 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2996 -offset_value.coeffs[1], 0);
2997 }
2998
2999 int factor = offset_value.coeffs[1];
3000 if ((factor & 15) == 0)
3001 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3002 else
3003 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3004 return buffer;
3005 }
3006
3007 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3008 instruction. If it is, store the number of elements in each vector
3009 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3010 factor in *FACTOR_OUT (if nonnull). */
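/* For example, a VNx4SI constant in which every element is the
   poly_int64 (4, 4) has NELTS_PER_VQ == 4 and a factor of 4, which a
   single INCW can add; a factor of 68 would be rejected because it
   would need a multiplier greater than 16.  */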
3011
3012 bool
3013 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
3014 unsigned int *nelts_per_vq_out)
3015 {
3016 rtx elt;
3017 poly_int64 value;
3018
3019 if (!const_vec_duplicate_p (x, &elt)
3020 || !poly_int_rtx_p (elt, &value))
3021 return false;
3022
3023 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3024 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3025 /* There's no vector INCB. */
3026 return false;
3027
3028 HOST_WIDE_INT factor = value.coeffs[0];
3029 if (value.coeffs[1] != factor)
3030 return false;
3031
3032 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3033 if ((factor % nelts_per_vq) != 0
3034 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3035 return false;
3036
3037 if (factor_out)
3038 *factor_out = factor;
3039 if (nelts_per_vq_out)
3040 *nelts_per_vq_out = nelts_per_vq;
3041 return true;
3042 }
3043
3044 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3045 instruction. */
3046
3047 bool
3048 aarch64_sve_inc_dec_immediate_p (rtx x)
3049 {
3050 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
3051 }
3052
3053 /* Return the asm template for an SVE vector INC or DEC instruction.
3054 OPERANDS gives the operands before the vector count and X is the
3055 value of the vector count operand itself. */
3056
3057 char *
3058 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
3059 {
3060 int factor;
3061 unsigned int nelts_per_vq;
3062 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3063 gcc_unreachable ();
3064 if (factor < 0)
3065 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
3066 nelts_per_vq);
3067 else
3068 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
3069 nelts_per_vq);
3070 }
3071
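/* Return the number of instructions needed to move immediate IMM of mode
   MODE into DEST, emitting the instructions if GENERATE is true.  As an
   illustration, 0x1234000056780000 (assuming it is not a valid bitmask
   immediate) can be built as a MOV of 0x56780000 followed by a MOVK of
   0x1234 into bits [63:48], i.e. two instructions.  */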
3072 static int
3073 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3074 scalar_int_mode mode)
3075 {
3076 int i;
3077 unsigned HOST_WIDE_INT val, val2, mask;
3078 int one_match, zero_match;
3079 int num_insns;
3080
3081 val = INTVAL (imm);
3082
3083 if (aarch64_move_imm (val, mode))
3084 {
3085 if (generate)
3086 emit_insn (gen_rtx_SET (dest, imm));
3087 return 1;
3088 }
3089
3090 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3091 (with XXXX non-zero). In that case check to see if the move can be done in
3092 a smaller mode. */
3093 val2 = val & 0xffffffff;
3094 if (mode == DImode
3095 && aarch64_move_imm (val2, SImode)
3096 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3097 {
3098 if (generate)
3099 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3100
3101 /* Check if we have to emit a second instruction by checking to see
3102 if any of the upper 32 bits of the original DI mode value is set. */
3103 if (val == val2)
3104 return 1;
3105
3106 i = (val >> 48) ? 48 : 32;
3107
3108 if (generate)
3109 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3110 GEN_INT ((val >> i) & 0xffff)));
3111
3112 return 2;
3113 }
3114
3115 if ((val >> 32) == 0 || mode == SImode)
3116 {
3117 if (generate)
3118 {
3119 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3120 if (mode == SImode)
3121 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3122 GEN_INT ((val >> 16) & 0xffff)));
3123 else
3124 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3125 GEN_INT ((val >> 16) & 0xffff)));
3126 }
3127 return 2;
3128 }
3129
3130 /* Remaining cases are all for DImode. */
3131
3132 mask = 0xffff;
3133 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3134 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3135 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3136 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3137
3138 if (zero_match != 2 && one_match != 2)
3139 {
3140 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3141 For a 64-bit bitmask try whether changing 16 bits to all ones or
3142 zeroes creates a valid bitmask. To check any repeated bitmask,
3143 try using 16 bits from the other 32-bit half of val. */
3144
3145 for (i = 0; i < 64; i += 16, mask <<= 16)
3146 {
3147 val2 = val & ~mask;
3148 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3149 break;
3150 val2 = val | mask;
3151 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3152 break;
3153 val2 = val2 & ~mask;
3154 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3155 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3156 break;
3157 }
3158 if (i != 64)
3159 {
3160 if (generate)
3161 {
3162 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3163 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3164 GEN_INT ((val >> i) & 0xffff)));
3165 }
3166 return 2;
3167 }
3168 }
3169
3170 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3171 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3172 otherwise skip zero bits. */
3173
3174 num_insns = 1;
3175 mask = 0xffff;
3176 val2 = one_match > zero_match ? ~val : val;
3177 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3178
3179 if (generate)
3180 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3181 ? (val | ~(mask << i))
3182 : (val & (mask << i)))));
3183 for (i += 16; i < 64; i += 16)
3184 {
3185 if ((val2 & (mask << i)) == 0)
3186 continue;
3187 if (generate)
3188 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3189 GEN_INT ((val >> i) & 0xffff)));
3190 num_insns ++;
3191 }
3192
3193 return num_insns;
3194 }
3195
3196 /* Return whether imm is a 128-bit immediate which is simple enough to
3197 expand inline. */
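/* For a CONST_WIDE_INT, this amounts to checking that the two 64-bit
   halves can together be built in at most four instructions.  */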
3198 bool
3199 aarch64_mov128_immediate (rtx imm)
3200 {
3201 if (GET_CODE (imm) == CONST_INT)
3202 return true;
3203
3204 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3205
3206 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3207 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3208
3209 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3210 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3211 }
3212
3213
3214 /* Return the number of temporary registers that aarch64_add_offset_1
3215 would need to add OFFSET to a register. */
3216
3217 static unsigned int
3218 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3219 {
3220 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3221 }
3222
3223 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3224 a non-polynomial OFFSET. MODE is the mode of the addition.
3225 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3226 be set and CFA adjustments added to the generated instructions.
3227
3228 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3229 temporary if register allocation is already complete. This temporary
3230 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3231 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3232 the immediate again.
3233
3234 Since this function may be used to adjust the stack pointer, we must
3235 ensure that it cannot cause transient stack deallocation (for example
3236 by first incrementing SP and then decrementing when adjusting by a
3237 large immediate). */
3238
3239 static void
3240 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3241 rtx src, HOST_WIDE_INT offset, rtx temp1,
3242 bool frame_related_p, bool emit_move_imm)
3243 {
3244 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3245 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3246
3247 HOST_WIDE_INT moffset = abs_hwi (offset);
3248 rtx_insn *insn;
3249
3250 if (!moffset)
3251 {
3252 if (!rtx_equal_p (dest, src))
3253 {
3254 insn = emit_insn (gen_rtx_SET (dest, src));
3255 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3256 }
3257 return;
3258 }
3259
3260 /* Single instruction adjustment. */
3261 if (aarch64_uimm12_shift (moffset))
3262 {
3263 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3264 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3265 return;
3266 }
3267
3268 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3269 and either:
3270
3271 a) the offset cannot be loaded by a 16-bit move or
3272 b) there is no spare register into which we can move it. */
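/* For example, an offset of 0x123456 (assuming it is not a valid
   bitmask immediate) can be added as an addition of 0x456 followed by
   an addition of 0x123000 (0x123 shifted left by 12), neither of which
   needs a scratch register.  */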
3273 if (moffset < 0x1000000
3274 && ((!temp1 && !can_create_pseudo_p ())
3275 || !aarch64_move_imm (moffset, mode)))
3276 {
3277 HOST_WIDE_INT low_off = moffset & 0xfff;
3278
3279 low_off = offset < 0 ? -low_off : low_off;
3280 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3281 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3282 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3283 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3284 return;
3285 }
3286
3287 /* Emit a move immediate if required and an addition/subtraction. */
3288 if (emit_move_imm)
3289 {
3290 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3291 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3292 }
3293 insn = emit_insn (offset < 0
3294 ? gen_sub3_insn (dest, src, temp1)
3295 : gen_add3_insn (dest, src, temp1));
3296 if (frame_related_p)
3297 {
3298 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3299 rtx adj = plus_constant (mode, src, offset);
3300 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3301 }
3302 }
3303
3304 /* Return the number of temporary registers that aarch64_add_offset
3305 would need to move OFFSET into a register or add OFFSET to a register;
3306 ADD_P is true if we want the latter rather than the former. */
3307
3308 static unsigned int
3309 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3310 {
3311 /* This follows the same structure as aarch64_add_offset. */
3312 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3313 return 0;
3314
3315 unsigned int count = 0;
3316 HOST_WIDE_INT factor = offset.coeffs[1];
3317 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3318 poly_int64 poly_offset (factor, factor);
3319 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3320 /* Need one register for the ADDVL/ADDPL result. */
3321 count += 1;
3322 else if (factor != 0)
3323 {
3324 factor = abs (factor);
3325 if (factor > 16 * (factor & -factor))
3326 /* Need one register for the CNT result and one for the multiplication
3327 factor. If necessary, the second temporary can be reused for the
3328 constant part of the offset. */
3329 return 2;
3330 /* Need one register for the CNT result (which might then
3331 be shifted). */
3332 count += 1;
3333 }
3334 return count + aarch64_add_offset_1_temporaries (constant);
3335 }
3336
3337 /* If X can be represented as a poly_int64, return the number
3338 of temporaries that are required to add it to a register.
3339 Return -1 otherwise. */
3340
3341 int
3342 aarch64_add_offset_temporaries (rtx x)
3343 {
3344 poly_int64 offset;
3345 if (!poly_int_rtx_p (x, &offset))
3346 return -1;
3347 return aarch64_offset_temporaries (true, offset);
3348 }
3349
3350 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3351 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3352 be set and CFA adjustments added to the generated instructions.
3353
3354 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3355 temporary if register allocation is already complete. This temporary
3356 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3357 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3358 false to avoid emitting the immediate again.
3359
3360 TEMP2, if nonnull, is a second temporary register that doesn't
3361 overlap either DEST or SRC.
3362
3363 Since this function may be used to adjust the stack pointer, we must
3364 ensure that it cannot cause transient stack deallocation (for example
3365 by first incrementing SP and then decrementing when adjusting by a
3366 large immediate). */
3367
3368 static void
3369 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3370 poly_int64 offset, rtx temp1, rtx temp2,
3371 bool frame_related_p, bool emit_move_imm = true)
3372 {
3373 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3374 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3375 gcc_assert (temp1 == NULL_RTX
3376 || !frame_related_p
3377 || !reg_overlap_mentioned_p (temp1, dest));
3378 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3379
3380 /* Try using ADDVL or ADDPL to add the whole value. */
3381 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3382 {
3383 rtx offset_rtx = gen_int_mode (offset, mode);
3384 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3385 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3386 return;
3387 }
3388
3389 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3390 SVE vector register, over and above the minimum size of 128 bits.
3391 This is equivalent to half the value returned by CNTD with a
3392 vector shape of ALL. */
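/* For example, an OFFSET of (20, 16) splits into a VG-based part of
   (16, 16) -- one full vector of bytes -- and a constant part of 4.  */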
3393 HOST_WIDE_INT factor = offset.coeffs[1];
3394 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3395
3396 /* Try using ADDVL or ADDPL to add the VG-based part. */
3397 poly_int64 poly_offset (factor, factor);
3398 if (src != const0_rtx
3399 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3400 {
3401 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3402 if (frame_related_p)
3403 {
3404 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3405 RTX_FRAME_RELATED_P (insn) = true;
3406 src = dest;
3407 }
3408 else
3409 {
3410 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3411 src = aarch64_force_temporary (mode, temp1, addr);
3412 temp1 = temp2;
3413 temp2 = NULL_RTX;
3414 }
3415 }
3416 /* Otherwise use a CNT-based sequence. */
3417 else if (factor != 0)
3418 {
3419 /* Use a subtraction if we have a negative factor. */
3420 rtx_code code = PLUS;
3421 if (factor < 0)
3422 {
3423 factor = -factor;
3424 code = MINUS;
3425 }
3426
3427 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3428 into the multiplication. */
3429 rtx val;
3430 int shift = 0;
3431 if (factor & 1)
3432 /* Use a right shift by 1. */
3433 shift = -1;
3434 else
3435 factor /= 2;
3436 HOST_WIDE_INT low_bit = factor & -factor;
3437 if (factor <= 16 * low_bit)
3438 {
3439 if (factor > 16 * 8)
3440 {
3441 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3442 the value with the minimum multiplier and shift it into
3443 position. */
3444 int extra_shift = exact_log2 (low_bit);
3445 shift += extra_shift;
3446 factor >>= extra_shift;
3447 }
3448 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3449 }
3450 else
3451 {
3452 /* Use CNTD, then multiply it by FACTOR. */
3453 val = gen_int_mode (poly_int64 (2, 2), mode);
3454 val = aarch64_force_temporary (mode, temp1, val);
3455
3456 /* Go back to using a negative multiplication factor if we have
3457 no register from which to subtract. */
3458 if (code == MINUS && src == const0_rtx)
3459 {
3460 factor = -factor;
3461 code = PLUS;
3462 }
3463 rtx coeff1 = gen_int_mode (factor, mode);
3464 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3465 val = gen_rtx_MULT (mode, val, coeff1);
3466 }
3467
3468 if (shift > 0)
3469 {
3470 /* Multiply by 1 << SHIFT. */
3471 val = aarch64_force_temporary (mode, temp1, val);
3472 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3473 }
3474 else if (shift == -1)
3475 {
3476 /* Divide by 2. */
3477 val = aarch64_force_temporary (mode, temp1, val);
3478 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3479 }
3480
3481 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3482 if (src != const0_rtx)
3483 {
3484 val = aarch64_force_temporary (mode, temp1, val);
3485 val = gen_rtx_fmt_ee (code, mode, src, val);
3486 }
3487 else if (code == MINUS)
3488 {
3489 val = aarch64_force_temporary (mode, temp1, val);
3490 val = gen_rtx_NEG (mode, val);
3491 }
3492
3493 if (constant == 0 || frame_related_p)
3494 {
3495 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3496 if (frame_related_p)
3497 {
3498 RTX_FRAME_RELATED_P (insn) = true;
3499 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3500 gen_rtx_SET (dest, plus_constant (Pmode, src,
3501 poly_offset)));
3502 }
3503 src = dest;
3504 if (constant == 0)
3505 return;
3506 }
3507 else
3508 {
3509 src = aarch64_force_temporary (mode, temp1, val);
3510 temp1 = temp2;
3511 temp2 = NULL_RTX;
3512 }
3513
3514 emit_move_imm = true;
3515 }
3516
3517 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3518 frame_related_p, emit_move_imm);
3519 }
3520
3521 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3522 than a poly_int64. */
3523
3524 void
3525 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3526 rtx offset_rtx, rtx temp1, rtx temp2)
3527 {
3528 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3529 temp1, temp2, false);
3530 }
3531
3532 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3533 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3534 if TEMP1 already contains abs (DELTA). */
3535
3536 static inline void
3537 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3538 {
3539 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3540 temp1, temp2, true, emit_move_imm);
3541 }
3542
3543 /* Subtract DELTA from the stack pointer, marking the instructions
3544 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3545 if nonnull. */
3546
3547 static inline void
3548 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3549 bool emit_move_imm = true)
3550 {
3551 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3552 temp1, temp2, frame_related_p, emit_move_imm);
3553 }
3554
3555 /* Set DEST to (vec_series BASE STEP). */
3556
3557 static void
3558 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3559 {
3560 machine_mode mode = GET_MODE (dest);
3561 scalar_mode inner = GET_MODE_INNER (mode);
3562
3563 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3564 if (!aarch64_sve_index_immediate_p (base))
3565 base = force_reg (inner, base);
3566 if (!aarch64_sve_index_immediate_p (step))
3567 step = force_reg (inner, step);
3568
3569 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3570 }
3571
3572 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3573 register of mode MODE. Use TARGET for the result if it's nonnull
3574 and convenient.
3575
3576 The two vector modes must have the same element mode. The behavior
3577 is to duplicate architectural lane N of SRC into architectural lanes
3578 N + I * STEP of the result. On big-endian targets, architectural
3579 lane 0 of an Advanced SIMD vector is the last element of the vector
3580 in memory layout, so for big-endian targets this operation has the
3581 effect of reversing SRC before duplicating it. Callers need to
3582 account for this. */
3583
3584 rtx
3585 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3586 {
3587 machine_mode src_mode = GET_MODE (src);
3588 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3589 insn_code icode = (BYTES_BIG_ENDIAN
3590 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3591 : code_for_aarch64_vec_duplicate_vq_le (mode));
3592
3593 unsigned int i = 0;
3594 expand_operand ops[3];
3595 create_output_operand (&ops[i++], target, mode);
3596 create_input_operand (&ops[i++], src, src_mode);
3597 if (BYTES_BIG_ENDIAN)
3598 {
3599 /* Create a PARALLEL describing the reversal of SRC. */
3600 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3601 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3602 nelts_per_vq - 1, -1);
3603 create_fixed_operand (&ops[i++], sel);
3604 }
3605 expand_insn (icode, i, ops);
3606 return ops[0].value;
3607 }
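/* For illustration only (operands hypothetical): duplicating a V4SI value
   that is already in a SIMD register across a VNx4SI SVE register
   typically assembles to a quadword duplicate, roughly:

     dup	z0.q, z0.q[0]	// lane N -> lanes N, N + 4, N + 8, ...

   matching the lane mapping in the comment above; the _be pattern folds
   in the extra lane reversal described by SEL.  */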
3608
3609 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3610 the memory image into DEST. Return true on success. */
3611
3612 static bool
3613 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3614 {
3615 src = force_const_mem (GET_MODE (src), src);
3616 if (!src)
3617 return false;
3618
3619 /* Make sure that the address is legitimate. */
3620 if (!aarch64_sve_ld1rq_operand_p (src))
3621 {
3622 rtx addr = force_reg (Pmode, XEXP (src, 0));
3623 src = replace_equiv_address (src, addr);
3624 }
3625
3626 machine_mode mode = GET_MODE (dest);
3627 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3628 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3629 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3630 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3631 return true;
3632 }
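/* A hedged example (registers and addresses hypothetical): for a VNx4SI
   destination this loads the 16-byte constant-pool image and replicates
   it into every quadword of the SVE register, roughly:

     ld1rqw	z0.s, p0/z, [x0]	// x0 = address of the 128-bit image

   so memory lane N of the image ends up in architectural lanes
   N, N + 4, N + 8, ... as the caller expects.  */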
3633
3634 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3635 SVE data mode and isn't a legitimate constant. Use TARGET for the
3636 result if convenient.
3637
3638 The returned register can have whatever mode seems most natural
3639 given the contents of SRC. */
3640
3641 static rtx
3642 aarch64_expand_sve_const_vector (rtx target, rtx src)
3643 {
3644 machine_mode mode = GET_MODE (src);
3645 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3646 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3647 scalar_mode elt_mode = GET_MODE_INNER (mode);
3648 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3649 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3650
3651 if (nelts_per_pattern == 1 && encoded_bits == 128)
3652 {
3653 /* The constant is a duplicated quadword but can't be narrowed
3654 beyond a quadword. Get the memory image of the first quadword
3655 as a 128-bit vector and try using LD1RQ to load it from memory.
3656
3657 The effect for both endiannesses is to load memory lane N into
3658 architectural lanes N + I * STEP of the result. On big-endian
3659 targets, the layout of the 128-bit vector in an Advanced SIMD
3660 register would be different from its layout in an SVE register,
3661 but this 128-bit vector is a memory value only. */
3662 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3663 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3664 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3665 return target;
3666 }
3667
3668 if (nelts_per_pattern == 1 && encoded_bits < 128)
3669 {
3670 /* The vector is a repeating sequence of 64 bits or fewer.
3671 See if we can load them using an Advanced SIMD move and then
3672 duplicate it to fill a vector. This is better than using a GPR
3673 move because it keeps everything in the same register file. */
3674 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3675 rtx_vector_builder builder (vq_mode, npatterns, 1);
3676 for (unsigned int i = 0; i < npatterns; ++i)
3677 {
3678 /* We want memory lane N to go into architectural lane N,
3679 so reverse for big-endian targets. The DUP .Q pattern
3680 has a compensating reverse built-in. */
3681 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3682 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3683 }
3684 rtx vq_src = builder.build ();
3685 if (aarch64_simd_valid_immediate (vq_src, NULL))
3686 {
3687 vq_src = force_reg (vq_mode, vq_src);
3688 return aarch64_expand_sve_dupq (target, mode, vq_src);
3689 }
3690
3691 /* Get an integer representation of the repeating part of Advanced
3692 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3693 which for big-endian targets is lane-swapped wrt a normal
3694 Advanced SIMD vector. This means that for both endiannesses,
3695 memory lane N of SVE vector SRC corresponds to architectural
3696 lane N of a register holding VQ_SRC. This in turn means that
3697 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3698 as a single 128-bit value) and thus that memory lane 0 of SRC is
3699 in the lsb of the integer. Duplicating the integer therefore
3700 ensures that memory lane N of SRC goes into architectural lane
3701 N + I * STEP of the SVE register. */
3702 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3703 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3704 if (elt_value)
3705 {
3706 /* Pretend that we had a vector of INT_MODE to start with. */
3707 elt_mode = int_mode;
3708 mode = aarch64_full_sve_mode (int_mode).require ();
3709
3710 /* If the integer can be moved into a general register by a
3711 single instruction, do that and duplicate the result. */
3712 if (CONST_INT_P (elt_value)
3713 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3714 {
3715 elt_value = force_reg (elt_mode, elt_value);
3716 return expand_vector_broadcast (mode, elt_value);
3717 }
3718 }
3719 else if (npatterns == 1)
3720 /* We're duplicating a single value, but can't do better than
3721 force it to memory and load from there. This handles things
3722 like symbolic constants. */
3723 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3724
3725 if (elt_value)
3726 {
3727 /* Load the element from memory if we can, otherwise move it into
3728 a register and use a DUP. */
3729 rtx op = force_const_mem (elt_mode, elt_value);
3730 if (!op)
3731 op = force_reg (elt_mode, elt_value);
3732 return expand_vector_broadcast (mode, op);
3733 }
3734 }
3735
3736 /* Try using INDEX. */
3737 rtx base, step;
3738 if (const_vec_series_p (src, &base, &step))
3739 {
3740 aarch64_expand_vec_series (target, base, step);
3741 return target;
3742 }
3743
3744 /* From here on, it's better to force the whole constant to memory
3745 if we can. */
3746 if (GET_MODE_NUNITS (mode).is_constant ())
3747 return NULL_RTX;
3748
3749 /* Expand each pattern individually. */
3750 gcc_assert (npatterns > 1);
3751 rtx_vector_builder builder;
3752 auto_vec<rtx, 16> vectors (npatterns);
3753 for (unsigned int i = 0; i < npatterns; ++i)
3754 {
3755 builder.new_vector (mode, 1, nelts_per_pattern);
3756 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3757 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3758 vectors.quick_push (force_reg (mode, builder.build ()));
3759 }
3760
3761 /* Use permutes to interleave the separate vectors. */
3762 while (npatterns > 1)
3763 {
3764 npatterns /= 2;
3765 for (unsigned int i = 0; i < npatterns; ++i)
3766 {
3767 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3768 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3769 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3770 vectors[i] = tmp;
3771 }
3772 }
3773 gcc_assert (vectors[0] == target);
3774 return target;
3775 }
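/* A worked example of the final interleaving step (values hypothetical):
   with NPATTERNS == 2 and a variable-length constant { a, b, a, b, ... },
   the loop above builds the single-pattern vectors { a, a, ... } and
   { b, b, ... }, forces each into a register, and merges them with one
   ZIP1, whose result alternates lanes from its two inputs and so
   reproduces { a, b, a, b, ... }.  With four patterns, two ZIP1s at the
   narrower spacing are followed by one more at the wider spacing.  */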
3776
3777 /* Use WHILE to set a predicate register of mode MODE in which the first
3778 VL bits are set and the rest are clear. Use TARGET for the register
3779 if it's nonnull and convenient. */
3780
3781 static rtx
3782 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3783 unsigned int vl)
3784 {
3785 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3786 target = aarch64_target_reg (target, mode);
3787 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3788 return target;
3789 }
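/* An illustrative expansion (register numbers arbitrary): with
   MODE == VNx8BImode and VL == 3 this emits roughly:

     mov	x0, 3
     whilelo	p0.h, xzr, x0	// first three .H lanes active

   since WHILELO sets element I of the result for every I in [0, VL).  */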
3790
3791 static rtx
3792 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
3793
3794 /* BUILDER is a constant predicate in which the index of every set bit
3795 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3796 by inverting every element at a multiple of ELT_SIZE and EORing the
3797 result with an ELT_SIZE PTRUE.
3798
3799 Return a register that contains the constant on success, otherwise
3800 return null. Use TARGET as the register if it is nonnull and
3801 convenient. */
3802
3803 static rtx
3804 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
3805 unsigned int elt_size)
3806 {
3807 /* Invert every element at a multiple of ELT_SIZE, keeping the
3808 other bits zero. */
3809 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
3810 builder.nelts_per_pattern ());
3811 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3812 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
3813 inv_builder.quick_push (const1_rtx);
3814 else
3815 inv_builder.quick_push (const0_rtx);
3816 inv_builder.finalize ();
3817
3818 /* See if we can load the constant cheaply. */
3819 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
3820 if (!inv)
3821 return NULL_RTX;
3822
3823 /* EOR the result with an ELT_SIZE PTRUE. */
3824 rtx mask = aarch64_ptrue_all (elt_size);
3825 mask = force_reg (VNx16BImode, mask);
3826 target = aarch64_target_reg (target, VNx16BImode);
3827 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
3828 return target;
3829 }
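/* A worked example (purely illustrative): with ELT_SIZE == 2 and the .H
   constant { 0, 1, 1, 1, ... } (every lane active except lane 0), the
   inverted constant is { 1, 0, 0, 0, ... }, which is just a VL1 PTRUE
   and therefore cheap to load.  EORing that with an all-true .H PTRUE,
   with the odd-numbered bits zeroed by the governing predicate,
   recreates the original constant.  */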
3830
3831 /* BUILDER is a constant predicate in which the index of every set bit
3832 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
3833 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
3834 register on success, otherwise return null. Use TARGET as the register
3835 if nonnull and convenient. */
3836
3837 static rtx
3838 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
3839 unsigned int elt_size,
3840 unsigned int permute_size)
3841 {
3842 /* We're going to split the constant into two new constants A and B,
3843 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
3844 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
3845
3846 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
3847 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
3848
3849 where _ indicates elements that will be discarded by the permute.
3850
3851 First calculate the ELT_SIZEs for A and B. */
3852 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
3853 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
3854 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
3855 if (INTVAL (builder.elt (i)) != 0)
3856 {
3857 if (i & permute_size)
3858 b_elt_size |= i - permute_size;
3859 else
3860 a_elt_size |= i;
3861 }
3862 a_elt_size &= -a_elt_size;
3863 b_elt_size &= -b_elt_size;
3864
3865 /* Now construct the vectors themselves. */
3866 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
3867 builder.nelts_per_pattern ());
3868 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
3869 builder.nelts_per_pattern ());
3870 unsigned int nelts = builder.encoded_nelts ();
3871 for (unsigned int i = 0; i < nelts; ++i)
3872 if (i & (elt_size - 1))
3873 {
3874 a_builder.quick_push (const0_rtx);
3875 b_builder.quick_push (const0_rtx);
3876 }
3877 else if ((i & permute_size) == 0)
3878 {
3879 /* The A and B elements are significant. */
3880 a_builder.quick_push (builder.elt (i));
3881 b_builder.quick_push (builder.elt (i + permute_size));
3882 }
3883 else
3884 {
3885 /* The A and B elements are going to be discarded, so pick whatever
3886 is likely to give a nice constant. We are targeting element
3887 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
3888 with the aim of each being a sequence of ones followed by
3889 a sequence of zeros. So:
3890
3891 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
3892 duplicate the last X_ELT_SIZE element, to extend the
3893 current sequence of ones or zeros.
3894
3895 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
3896 zero, so that the constant really does have X_ELT_SIZE and
3897 not a smaller size. */
3898 if (a_elt_size > permute_size)
3899 a_builder.quick_push (const0_rtx);
3900 else
3901 a_builder.quick_push (a_builder.elt (i - a_elt_size));
3902 if (b_elt_size > permute_size)
3903 b_builder.quick_push (const0_rtx);
3904 else
3905 b_builder.quick_push (b_builder.elt (i - b_elt_size));
3906 }
3907 a_builder.finalize ();
3908 b_builder.finalize ();
3909
3910 /* Try loading A into a register. */
3911 rtx_insn *last = get_last_insn ();
3912 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
3913 if (!a)
3914 return NULL_RTX;
3915
3916 /* Try loading B into a register. */
3917 rtx b = a;
3918 if (a_builder != b_builder)
3919 {
3920 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
3921 if (!b)
3922 {
3923 delete_insns_since (last);
3924 return NULL_RTX;
3925 }
3926 }
3927
3928 /* Emit the TRN1 itself. */
3929 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
3930 target = aarch64_target_reg (target, mode);
3931 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
3932 gen_lowpart (mode, a),
3933 gen_lowpart (mode, b)));
3934 return target;
3935 }
3936
3937 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3938 constant in BUILDER into an SVE predicate register. Return the register
3939 on success, otherwise return null. Use TARGET for the register if
3940 nonnull and convenient.
3941
3942 ALLOW_RECURSE_P is true if we can use methods that would call this
3943 function recursively. */
3944
3945 static rtx
3946 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
3947 bool allow_recurse_p)
3948 {
3949 if (builder.encoded_nelts () == 1)
3950 /* A PFALSE or a PTRUE .B ALL. */
3951 return aarch64_emit_set_immediate (target, builder);
3952
3953 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3954 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3955 {
3956 /* If we can load the constant using PTRUE, use it as-is. */
3957 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3958 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3959 return aarch64_emit_set_immediate (target, builder);
3960
3961 /* Otherwise use WHILE to set the first VL bits. */
3962 return aarch64_sve_move_pred_via_while (target, mode, vl);
3963 }
3964
3965 if (!allow_recurse_p)
3966 return NULL_RTX;
3967
3968 /* Try inverting the vector in element size ELT_SIZE and then EORing
3969 the result with an ELT_SIZE PTRUE. */
3970 if (INTVAL (builder.elt (0)) == 0)
3971 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
3972 elt_size))
3973 return res;
3974
3975 /* Try using TRN1 to permute two simpler constants. */
3976 for (unsigned int i = elt_size; i <= 8; i *= 2)
3977 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
3978 elt_size, i))
3979 return res;
3980
3981 return NULL_RTX;
3982 }
3983
3984 /* Return an SVE predicate register that contains the VNx16BImode
3985 constant in BUILDER, without going through the move expanders.
3986
3987 The returned register can have whatever mode seems most natural
3988 given the contents of BUILDER. Use TARGET for the result if
3989 convenient. */
3990
3991 static rtx
3992 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3993 {
3994 /* Try loading the constant using pure predicate operations. */
3995 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
3996 return res;
3997
3998 /* Try forcing the constant to memory. */
3999 if (builder.full_nelts ().is_constant ())
4000 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4001 {
4002 target = aarch64_target_reg (target, VNx16BImode);
4003 emit_move_insn (target, mem);
4004 return target;
4005 }
4006
4007 /* The last resort is to load the constant as an integer and then
4008 compare it against zero. Use -1 for set bits in order to increase
4009 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4010 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4011 builder.nelts_per_pattern ());
4012 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4013 int_builder.quick_push (INTVAL (builder.elt (i))
4014 ? constm1_rtx : const0_rtx);
4015 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4016 int_builder.build ());
4017 }
4018
4019 /* Set DEST to immediate IMM. */
4020
4021 void
4022 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4023 {
4024 machine_mode mode = GET_MODE (dest);
4025
4026 /* Check on what type of symbol it is. */
4027 scalar_int_mode int_mode;
4028 if ((GET_CODE (imm) == SYMBOL_REF
4029 || GET_CODE (imm) == LABEL_REF
4030 || GET_CODE (imm) == CONST
4031 || GET_CODE (imm) == CONST_POLY_INT)
4032 && is_a <scalar_int_mode> (mode, &int_mode))
4033 {
4034 rtx mem;
4035 poly_int64 offset;
4036 HOST_WIDE_INT const_offset;
4037 enum aarch64_symbol_type sty;
4038
4039 /* If we have (const (plus symbol offset)), separate out the offset
4040 before we start classifying the symbol. */
4041 rtx base = strip_offset (imm, &offset);
4042
4043 /* We must always add an offset involving VL separately, rather than
4044 folding it into the relocation. */
4045 if (!offset.is_constant (&const_offset))
4046 {
4047 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4048 emit_insn (gen_rtx_SET (dest, imm));
4049 else
4050 {
4051 /* Do arithmetic on 32-bit values if the result is smaller
4052 than that. */
4053 if (partial_subreg_p (int_mode, SImode))
4054 {
4055 /* It is invalid to do symbol calculations in modes
4056 narrower than SImode. */
4057 gcc_assert (base == const0_rtx);
4058 dest = gen_lowpart (SImode, dest);
4059 int_mode = SImode;
4060 }
4061 if (base != const0_rtx)
4062 {
4063 base = aarch64_force_temporary (int_mode, dest, base);
4064 aarch64_add_offset (int_mode, dest, base, offset,
4065 NULL_RTX, NULL_RTX, false);
4066 }
4067 else
4068 aarch64_add_offset (int_mode, dest, base, offset,
4069 dest, NULL_RTX, false);
4070 }
4071 return;
4072 }
4073
4074 sty = aarch64_classify_symbol (base, const_offset);
4075 switch (sty)
4076 {
4077 case SYMBOL_FORCE_TO_MEM:
4078 if (const_offset != 0
4079 && targetm.cannot_force_const_mem (int_mode, imm))
4080 {
4081 gcc_assert (can_create_pseudo_p ());
4082 base = aarch64_force_temporary (int_mode, dest, base);
4083 aarch64_add_offset (int_mode, dest, base, const_offset,
4084 NULL_RTX, NULL_RTX, false);
4085 return;
4086 }
4087
4088 mem = force_const_mem (ptr_mode, imm);
4089 gcc_assert (mem);
4090
4091 /* If we aren't generating PC relative literals, then
4092 we need to expand the literal pool access carefully.
4093 This is something that needs to be done in a number
4094 of places, so could well live as a separate function. */
4095 if (!aarch64_pcrelative_literal_loads)
4096 {
4097 gcc_assert (can_create_pseudo_p ());
4098 base = gen_reg_rtx (ptr_mode);
4099 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4100 if (ptr_mode != Pmode)
4101 base = convert_memory_address (Pmode, base);
4102 mem = gen_rtx_MEM (ptr_mode, base);
4103 }
4104
4105 if (int_mode != ptr_mode)
4106 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4107
4108 emit_insn (gen_rtx_SET (dest, mem));
4109
4110 return;
4111
4112 case SYMBOL_SMALL_TLSGD:
4113 case SYMBOL_SMALL_TLSDESC:
4114 case SYMBOL_SMALL_TLSIE:
4115 case SYMBOL_SMALL_GOT_28K:
4116 case SYMBOL_SMALL_GOT_4G:
4117 case SYMBOL_TINY_GOT:
4118 case SYMBOL_TINY_TLSIE:
4119 if (const_offset != 0)
4120 {
4121 gcc_assert (can_create_pseudo_p ());
4122 base = aarch64_force_temporary (int_mode, dest, base);
4123 aarch64_add_offset (int_mode, dest, base, const_offset,
4124 NULL_RTX, NULL_RTX, false);
4125 return;
4126 }
4127 /* FALLTHRU */
4128
4129 case SYMBOL_SMALL_ABSOLUTE:
4130 case SYMBOL_TINY_ABSOLUTE:
4131 case SYMBOL_TLSLE12:
4132 case SYMBOL_TLSLE24:
4133 case SYMBOL_TLSLE32:
4134 case SYMBOL_TLSLE48:
4135 aarch64_load_symref_appropriately (dest, imm, sty);
4136 return;
4137
4138 default:
4139 gcc_unreachable ();
4140 }
4141 }
4142
4143 if (!CONST_INT_P (imm))
4144 {
4145 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4146 {
4147 /* Only the low bit of each .H, .S and .D element is defined,
4148 so we can set the upper bits to whatever we like. If the
4149 predicate is all-true in MODE, prefer to set all the undefined
4150 bits as well, so that we can share a single .B predicate for
4151 all modes. */
4152 if (imm == CONSTM1_RTX (mode))
4153 imm = CONSTM1_RTX (VNx16BImode);
4154
4155 /* All methods for constructing predicate modes wider than VNx16BI
4156 will set the upper bits of each element to zero. Expose this
4157 by moving such constants as a VNx16BI, so that all bits are
4158 significant and so that constants for different modes can be
4159 shared. The wider constant will still be available as a
4160 REG_EQUAL note. */
4161 rtx_vector_builder builder;
4162 if (aarch64_get_sve_pred_bits (builder, imm))
4163 {
4164 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4165 if (dest != res)
4166 emit_move_insn (dest, gen_lowpart (mode, res));
4167 return;
4168 }
4169 }
4170
4171 if (GET_CODE (imm) == HIGH
4172 || aarch64_simd_valid_immediate (imm, NULL))
4173 {
4174 emit_insn (gen_rtx_SET (dest, imm));
4175 return;
4176 }
4177
4178 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4179 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4180 {
4181 if (dest != res)
4182 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4183 return;
4184 }
4185
4186 rtx mem = force_const_mem (mode, imm);
4187 gcc_assert (mem);
4188 emit_move_insn (dest, mem);
4189 return;
4190 }
4191
4192 aarch64_internal_mov_immediate (dest, imm, true,
4193 as_a <scalar_int_mode> (mode));
4194 }
4195
4196 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4197 that is known to contain PTRUE. */
4198
4199 void
4200 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4201 {
4202 expand_operand ops[3];
4203 machine_mode mode = GET_MODE (dest);
4204 create_output_operand (&ops[0], dest, mode);
4205 create_input_operand (&ops[1], pred, GET_MODE (pred));
4206 create_input_operand (&ops[2], src, mode);
4207 temporary_volatile_ok v (true);
4208 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4209 }
4210
4211 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4212 operand is in memory. In this case we need to use the predicated LD1
4213 and ST1 instead of LDR and STR, both for correctness on big-endian
4214 targets and because LD1 and ST1 support a wider range of addressing modes.
4215 PRED_MODE is the mode of the predicate.
4216
4217 See the comment at the head of aarch64-sve.md for details about the
4218 big-endian handling. */
4219
4220 void
4221 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4222 {
4223 machine_mode mode = GET_MODE (dest);
4224 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4225 if (!register_operand (src, mode)
4226 && !register_operand (dest, mode))
4227 {
4228 rtx tmp = gen_reg_rtx (mode);
4229 if (MEM_P (src))
4230 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4231 else
4232 emit_move_insn (tmp, src);
4233 src = tmp;
4234 }
4235 aarch64_emit_sve_pred_move (dest, ptrue, src);
4236 }
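/* A hedged illustration (registers hypothetical): copying one VNx4SI
   memory object to another therefore becomes a predicated load into a
   fresh pseudo followed by a predicated store, roughly:

     ptrue	p0.b, all
     ld1w	z0.s, p0/z, [x1]
     st1w	z0.s, p0, [x0]

   rather than an LDR/STR pair, for the reasons given above.  */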
4237
4238 /* Called only on big-endian targets. See whether an SVE vector move
4239 from SRC to DEST is effectively a REV[BHW] instruction, because at
4240 least one operand is a subreg of an SVE vector that has wider or
4241 narrower elements. Return true and emit the instruction if so.
4242
4243 For example:
4244
4245 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4246
4247 represents a VIEW_CONVERT between the following vectors, viewed
4248 in memory order:
4249
4250 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4251 R1: { [0], [1], [2], [3], ... }
4252
4253 The high part of lane X in R2 should therefore correspond to lane X*2
4254 of R1, but the register representations are:
4255
4256 msb lsb
4257 R2: ...... [1].high [1].low [0].high [0].low
4258 R1: ...... [3] [2] [1] [0]
4259
4260 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4261 We therefore need a reverse operation to swap the high and low values
4262 around.
4263
4264 This is purely an optimization. Without it we would spill the
4265 subreg operand to the stack in one mode and reload it in the
4266 other mode, which has the same effect as the REV. */
4267
4268 bool
4269 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4270 {
4271 gcc_assert (BYTES_BIG_ENDIAN);
4272 if (GET_CODE (dest) == SUBREG)
4273 dest = SUBREG_REG (dest);
4274 if (GET_CODE (src) == SUBREG)
4275 src = SUBREG_REG (src);
4276
4277 /* The optimization handles two single SVE REGs with different element
4278 sizes. */
4279 if (!REG_P (dest)
4280 || !REG_P (src)
4281 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4282 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4283 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4284 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4285 return false;
4286
4287 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4288 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4289 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4290 UNSPEC_REV_SUBREG);
4291 emit_insn (gen_rtx_SET (dest, unspec));
4292 return true;
4293 }
4294
4295 /* Return a copy of X with mode MODE, without changing its other
4296 attributes. Unlike gen_lowpart, this doesn't care whether the
4297 mode change is valid. */
4298
4299 static rtx
4300 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4301 {
4302 if (GET_MODE (x) == mode)
4303 return x;
4304
4305 x = shallow_copy_rtx (x);
4306 set_mode_and_regno (x, mode, REGNO (x));
4307 return x;
4308 }
4309
4310 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4311 stored in wider integer containers. */
4312
4313 static unsigned int
4314 aarch64_sve_rev_unspec (machine_mode mode)
4315 {
4316 switch (GET_MODE_UNIT_SIZE (mode))
4317 {
4318 case 1: return UNSPEC_REVB;
4319 case 2: return UNSPEC_REVH;
4320 case 4: return UNSPEC_REVW;
4321 }
4322 gcc_unreachable ();
4323 }
4324
4325 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4326 operands. */
4327
4328 void
4329 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4330 {
4331 /* Decide which REV operation we need. The mode with wider elements
4332 determines the mode of the operands and the mode with the narrower
4333 elements determines the reverse width. */
4334 machine_mode mode_with_wider_elts = GET_MODE (dest);
4335 machine_mode mode_with_narrower_elts = GET_MODE (src);
4336 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4337 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4338 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4339
4340 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4341 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4342 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4343
4344 /* Get the operands in the appropriate modes and emit the instruction. */
4345 ptrue = gen_lowpart (pred_mode, ptrue);
4346 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4347 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4348 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4349 dest, ptrue, src));
4350 }
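/* An illustrative split (register numbers arbitrary): for the
   VNx8HI-over-VNx16QI example before aarch64_maybe_expand_sve_subreg_move,
   the wider-element mode is VNx8HI and the narrower unit size of 1 byte
   selects UNSPEC_REVB, so the emitted instruction is roughly:

     revb	z0.h, p0/m, z1.h	// swap the bytes within each halfword

   which performs exactly the high/low swap described there.  */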
4351
4352 static bool
4353 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4354 tree exp ATTRIBUTE_UNUSED)
4355 {
4356 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4357 return false;
4358
4359 return true;
4360 }
4361
4362 /* Implement TARGET_PASS_BY_REFERENCE. */
4363
4364 static bool
4365 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4366 machine_mode mode,
4367 const_tree type,
4368 bool named ATTRIBUTE_UNUSED)
4369 {
4370 HOST_WIDE_INT size;
4371 machine_mode dummymode;
4372 int nregs;
4373
4374 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4375 if (mode == BLKmode && type)
4376 size = int_size_in_bytes (type);
4377 else
4378 /* No frontends can create types with variable-sized modes, so we
4379 shouldn't be asked to pass or return them. */
4380 size = GET_MODE_SIZE (mode).to_constant ();
4381
4382 /* Aggregates are passed by reference based on their size. */
4383 if (type && AGGREGATE_TYPE_P (type))
4384 {
4385 size = int_size_in_bytes (type);
4386 }
4387
4388 /* Variable sized arguments are always returned by reference. */
4389 if (size < 0)
4390 return true;
4391
4392 /* Can this be a candidate to be passed in fp/simd register(s)? */
4393 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4394 &dummymode, &nregs,
4395 NULL))
4396 return false;
4397
4398 /* Arguments which are variable sized or larger than 2 registers are
4399 passed by reference unless they are a homogeneous floating-point
4400 aggregate. */
4401 return size > 2 * UNITS_PER_WORD;
4402 }
4403
4404 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4405 static bool
4406 aarch64_return_in_msb (const_tree valtype)
4407 {
4408 machine_mode dummy_mode;
4409 int dummy_int;
4410
4411 /* Never happens in little-endian mode. */
4412 if (!BYTES_BIG_ENDIAN)
4413 return false;
4414
4415 /* Only composite types smaller than or equal to 16 bytes can
4416 be potentially returned in registers. */
4417 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4418 || int_size_in_bytes (valtype) <= 0
4419 || int_size_in_bytes (valtype) > 16)
4420 return false;
4421
4422 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4423 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4424 is always passed/returned in the least significant bits of fp/simd
4425 register(s). */
4426 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4427 &dummy_mode, &dummy_int, NULL))
4428 return false;
4429
4430 return true;
4431 }
4432
4433 /* Implement TARGET_FUNCTION_VALUE.
4434 Define how to find the value returned by a function. */
4435
4436 static rtx
4437 aarch64_function_value (const_tree type, const_tree func,
4438 bool outgoing ATTRIBUTE_UNUSED)
4439 {
4440 machine_mode mode;
4441 int unsignedp;
4442 int count;
4443 machine_mode ag_mode;
4444
4445 mode = TYPE_MODE (type);
4446 if (INTEGRAL_TYPE_P (type))
4447 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4448
4449 if (aarch64_return_in_msb (type))
4450 {
4451 HOST_WIDE_INT size = int_size_in_bytes (type);
4452
4453 if (size % UNITS_PER_WORD != 0)
4454 {
4455 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4456 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4457 }
4458 }
4459
4460 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4461 &ag_mode, &count, NULL))
4462 {
4463 if (!aarch64_composite_type_p (type, mode))
4464 {
4465 gcc_assert (count == 1 && mode == ag_mode);
4466 return gen_rtx_REG (mode, V0_REGNUM);
4467 }
4468 else
4469 {
4470 int i;
4471 rtx par;
4472
4473 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4474 for (i = 0; i < count; i++)
4475 {
4476 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4477 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4478 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4479 XVECEXP (par, 0, i) = tmp;
4480 }
4481 return par;
4482 }
4483 }
4484 else
4485 return gen_rtx_REG (mode, R0_REGNUM);
4486 }
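/* A concrete illustration (hypothetical return types): struct { float x, y; }
   is a homogeneous floating-point aggregate with COUNT == 2 and
   AG_MODE == SFmode, so the code above builds a PARALLEL of (reg:SF v0)
   at byte offset 0 and (reg:SF v1) at byte offset 4, whereas a plain
   64-bit integer return simply uses (reg:DI x0).  */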
4487
4488 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4489 Return true if REGNO is the number of a hard register in which the values
4490 of called function may come back. */
4491
4492 static bool
4493 aarch64_function_value_regno_p (const unsigned int regno)
4494 {
4495 /* Maximum of 16 bytes can be returned in the general registers. Examples
4496 of 16-byte return values are: 128-bit integers and 16-byte small
4497 structures (excluding homogeneous floating-point aggregates). */
4498 if (regno == R0_REGNUM || regno == R1_REGNUM)
4499 return true;
4500
4501 /* Up to four fp/simd registers can return a function value, e.g. a
4502 homogeneous floating-point aggregate having four members. */
4503 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4504 return TARGET_FLOAT;
4505
4506 return false;
4507 }
4508
4509 /* Implement TARGET_RETURN_IN_MEMORY.
4510
4511 If the type T of the result of a function is such that
4512 void func (T arg)
4513 would require that arg be passed as a value in a register (or set of
4514 registers) according to the parameter passing rules, then the result
4515 is returned in the same registers as would be used for such an
4516 argument. */
4517
4518 static bool
4519 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4520 {
4521 HOST_WIDE_INT size;
4522 machine_mode ag_mode;
4523 int count;
4524
4525 if (!AGGREGATE_TYPE_P (type)
4526 && TREE_CODE (type) != COMPLEX_TYPE
4527 && TREE_CODE (type) != VECTOR_TYPE)
4528 /* Simple scalar types are always returned in registers. */
4529 return false;
4530
4531 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4532 type,
4533 &ag_mode,
4534 &count,
4535 NULL))
4536 return false;
4537
4538 /* Types larger than 2 registers are returned in memory. */
4539 size = int_size_in_bytes (type);
4540 return (size < 0 || size > 2 * UNITS_PER_WORD);
4541 }
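/* Illustrative cases: struct { double a, b, c, d; } is a homogeneous
   floating-point aggregate, so the candidate check above succeeds and it
   is returned in registers (v0-v3) despite being 32 bytes.  A struct of
   four 64-bit integers fails that check and, at 32 bytes, exceeds the
   two-register limit, so it is returned in memory.  */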
4542
4543 static bool
4544 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4545 const_tree type, int *nregs)
4546 {
4547 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4548 return aarch64_vfp_is_call_or_return_candidate (mode,
4549 type,
4550 &pcum->aapcs_vfp_rmode,
4551 nregs,
4552 NULL);
4553 }
4554
4555 /* Given MODE and TYPE of a function argument, return the alignment in
4556 bits. The idea is to suppress any stronger alignment requested by
4557 the user and opt for the natural alignment (specified in AAPCS64 \S
4558 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4559 calculated in versions of GCC prior to GCC-9. This is a helper
4560 function for local use only. */
4561
4562 static unsigned int
4563 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4564 bool *abi_break)
4565 {
4566 *abi_break = false;
4567 if (!type)
4568 return GET_MODE_ALIGNMENT (mode);
4569
4570 if (integer_zerop (TYPE_SIZE (type)))
4571 return 0;
4572
4573 gcc_assert (TYPE_MODE (type) == mode);
4574
4575 if (!AGGREGATE_TYPE_P (type))
4576 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4577
4578 if (TREE_CODE (type) == ARRAY_TYPE)
4579 return TYPE_ALIGN (TREE_TYPE (type));
4580
4581 unsigned int alignment = 0;
4582 unsigned int bitfield_alignment = 0;
4583 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4584 if (TREE_CODE (field) == FIELD_DECL)
4585 {
4586 alignment = std::max (alignment, DECL_ALIGN (field));
4587 if (DECL_BIT_FIELD_TYPE (field))
4588 bitfield_alignment
4589 = std::max (bitfield_alignment,
4590 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4591 }
4592
4593 if (bitfield_alignment > alignment)
4594 {
4595 *abi_break = true;
4596 return bitfield_alignment;
4597 }
4598
4599 return alignment;
4600 }
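/* A hedged example of the "suppress stronger alignment" rule: given
   typedef int aligned_int __attribute__ ((aligned (16)));
   an argument of type aligned_int is not an aggregate, so the code above
   uses TYPE_ALIGN of the main variant (plain int) and returns 32-bit
   alignment, ignoring the user-requested 128 bits.  ABI_BREAK is only
   set in the separate case where a bit-field's declared type is more
   strongly aligned than every field of the aggregate.  */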
4601
4602 /* Layout a function argument according to the AAPCS64 rules. The rule
4603 numbers refer to the rule numbers in the AAPCS64. */
4604
4605 static void
4606 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4607 const_tree type,
4608 bool named ATTRIBUTE_UNUSED)
4609 {
4610 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4611 int ncrn, nvrn, nregs;
4612 bool allocate_ncrn, allocate_nvrn;
4613 HOST_WIDE_INT size;
4614 bool abi_break;
4615
4616 /* We need to do this once per argument. */
4617 if (pcum->aapcs_arg_processed)
4618 return;
4619
4620 pcum->aapcs_arg_processed = true;
4621
4622 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4623 if (type)
4624 size = int_size_in_bytes (type);
4625 else
4626 /* No frontends can create types with variable-sized modes, so we
4627 shouldn't be asked to pass or return them. */
4628 size = GET_MODE_SIZE (mode).to_constant ();
4629 size = ROUND_UP (size, UNITS_PER_WORD);
4630
4631 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4632 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4633 mode,
4634 type,
4635 &nregs);
4636
4637 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4638 The following code thus handles passing by SIMD/FP registers first. */
4639
4640 nvrn = pcum->aapcs_nvrn;
4641
4642 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4643 and homogeneous short-vector aggregates (HVA). */
4644 if (allocate_nvrn)
4645 {
4646 if (!TARGET_FLOAT)
4647 aarch64_err_no_fpadvsimd (mode);
4648
4649 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4650 {
4651 pcum->aapcs_nextnvrn = nvrn + nregs;
4652 if (!aarch64_composite_type_p (type, mode))
4653 {
4654 gcc_assert (nregs == 1);
4655 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4656 }
4657 else
4658 {
4659 rtx par;
4660 int i;
4661 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4662 for (i = 0; i < nregs; i++)
4663 {
4664 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4665 V0_REGNUM + nvrn + i);
4666 rtx offset = gen_int_mode
4667 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4668 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4669 XVECEXP (par, 0, i) = tmp;
4670 }
4671 pcum->aapcs_reg = par;
4672 }
4673 return;
4674 }
4675 else
4676 {
4677 /* C.3 NSRN is set to 8. */
4678 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4679 goto on_stack;
4680 }
4681 }
4682
4683 ncrn = pcum->aapcs_ncrn;
4684 nregs = size / UNITS_PER_WORD;
4685
4686 /* C6 - C9, though the sign and zero extension semantics are
4687 handled elsewhere. This is the case where the argument fits
4688 entirely in general registers. */
4689 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4690 {
4691 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4692
4693 /* C.8 if the argument has an alignment of 16 then the NGRN is
4694 rounded up to the next even number. */
4695 if (nregs == 2
4696 && ncrn % 2
4697 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4698 comparison is there because for > 16 * BITS_PER_UNIT
4699 alignment nregs should be > 2 and therefore it should be
4700 passed by reference rather than value. */
4701 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4702 == 16 * BITS_PER_UNIT))
4703 {
4704 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4705 inform (input_location, "parameter passing for argument of type "
4706 "%qT changed in GCC 9.1", type);
4707 ++ncrn;
4708 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4709 }
4710
4711 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4712 A reg is still generated for it, but the caller should be smart
4713 enough not to use it. */
4714 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4715 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4716 else
4717 {
4718 rtx par;
4719 int i;
4720
4721 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4722 for (i = 0; i < nregs; i++)
4723 {
4724 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4725 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4726 GEN_INT (i * UNITS_PER_WORD));
4727 XVECEXP (par, 0, i) = tmp;
4728 }
4729 pcum->aapcs_reg = par;
4730 }
4731
4732 pcum->aapcs_nextncrn = ncrn + nregs;
4733 return;
4734 }
4735
4736 /* C.11 */
4737 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4738
4739 /* The argument is passed on stack; record the needed number of words for
4740 this argument and align the total size if necessary. */
4741 on_stack:
4742 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4743
4744 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4745 == 16 * BITS_PER_UNIT)
4746 {
4747 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4748 if (pcum->aapcs_stack_size != new_size)
4749 {
4750 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4751 inform (input_location, "parameter passing for argument of type "
4752 "%qT changed in GCC 9.1", type);
4753 pcum->aapcs_stack_size = new_size;
4754 }
4755 }
4756 return;
4757 }
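/* A worked example of rule C.8 (illustrative only): for
   void f (int a, __int128 b), A occupies w0, leaving NCRN == 1.  B needs
   two registers and has 16-byte alignment, so NCRN is rounded up to 2
   and B is passed in x2/x3, with x1 left unused; had B required only
   8-byte alignment it would have gone in x1/x2.  */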
4758
4759 /* Implement TARGET_FUNCTION_ARG. */
4760
4761 static rtx
4762 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4763 const_tree type, bool named)
4764 {
4765 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4766 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4767
4768 if (mode == VOIDmode)
4769 return NULL_RTX;
4770
4771 aarch64_layout_arg (pcum_v, mode, type, named);
4772 return pcum->aapcs_reg;
4773 }
4774
4775 void
4776 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4777 const_tree fntype ATTRIBUTE_UNUSED,
4778 rtx libname ATTRIBUTE_UNUSED,
4779 const_tree fndecl ATTRIBUTE_UNUSED,
4780 unsigned n_named ATTRIBUTE_UNUSED)
4781 {
4782 pcum->aapcs_ncrn = 0;
4783 pcum->aapcs_nvrn = 0;
4784 pcum->aapcs_nextncrn = 0;
4785 pcum->aapcs_nextnvrn = 0;
4786 pcum->pcs_variant = ARM_PCS_AAPCS64;
4787 pcum->aapcs_reg = NULL_RTX;
4788 pcum->aapcs_arg_processed = false;
4789 pcum->aapcs_stack_words = 0;
4790 pcum->aapcs_stack_size = 0;
4791
4792 if (!TARGET_FLOAT
4793 && fndecl && TREE_PUBLIC (fndecl)
4794 && fntype && fntype != error_mark_node)
4795 {
4796 const_tree type = TREE_TYPE (fntype);
4797 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4798 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4799 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4800 &mode, &nregs, NULL))
4801 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4802 }
4803 return;
4804 }
4805
4806 static void
4807 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4808 machine_mode mode,
4809 const_tree type,
4810 bool named)
4811 {
4812 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4813 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4814 {
4815 aarch64_layout_arg (pcum_v, mode, type, named);
4816 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4817 != (pcum->aapcs_stack_words != 0));
4818 pcum->aapcs_arg_processed = false;
4819 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4820 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4821 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4822 pcum->aapcs_stack_words = 0;
4823 pcum->aapcs_reg = NULL_RTX;
4824 }
4825 }
4826
4827 bool
4828 aarch64_function_arg_regno_p (unsigned regno)
4829 {
4830 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4831 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4832 }
4833
4834 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4835 PARM_BOUNDARY bits of alignment, but will be given anything up
4836 to STACK_BOUNDARY bits if the type requires it. This makes sure
4837 that both before and after the layout of each argument, the Next
4838 Stacked Argument Address (NSAA) will have a minimum alignment of
4839 8 bytes. */
4840
4841 static unsigned int
4842 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4843 {
4844 bool abi_break;
4845 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4846 &abi_break);
4847 if (abi_break && warn_psabi)
4848 inform (input_location, "parameter passing for argument of type "
4849 "%qT changed in GCC 9.1", type);
4850
4851 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4852 }
4853
4854 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4855
4856 static fixed_size_mode
4857 aarch64_get_reg_raw_mode (int regno)
4858 {
4859 if (TARGET_SVE && FP_REGNUM_P (regno))
4860 /* Don't use the SVE part of the register for __builtin_apply and
4861 __builtin_return. The SVE registers aren't used by the normal PCS,
4862 so using them there would be a waste of time. The PCS extensions
4863 for SVE types are fundamentally incompatible with the
4864 __builtin_return/__builtin_apply interface. */
4865 return as_a <fixed_size_mode> (V16QImode);
4866 return default_get_reg_raw_mode (regno);
4867 }
4868
4869 /* Implement TARGET_FUNCTION_ARG_PADDING.
4870
4871 Small aggregate types are placed in the lowest memory address.
4872
4873 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4874
4875 static pad_direction
4876 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4877 {
4878 /* On little-endian targets, the least significant byte of every stack
4879 argument is passed at the lowest byte address of the stack slot. */
4880 if (!BYTES_BIG_ENDIAN)
4881 return PAD_UPWARD;
4882
4883 /* Otherwise, integral, floating-point and pointer types are padded downward:
4884 the least significant byte of a stack argument is passed at the highest
4885 byte address of the stack slot. */
4886 if (type
4887 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4888 || POINTER_TYPE_P (type))
4889 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4890 return PAD_DOWNWARD;
4891
4892 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4893 return PAD_UPWARD;
4894 }
4895
4896 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4897
4898 It specifies padding for the last (and possibly the only)
4899 element of a block move between registers and memory. Assuming
4900 the block is in memory, padding upward means that the last
4901 element is padded after its most significant byte, while in
4902 downward padding the last element is padded at its least
4903 significant byte side.
4904
4905 Small aggregates and small complex types are always padded
4906 upwards.
4907
4908 We don't need to worry about homogeneous floating-point or
4909 short-vector aggregates; their move is not affected by the
4910 padding direction determined here. Regardless of endianness,
4911 each element of such an aggregate is put in the least
4912 significant bits of a fp/simd register.
4913
4914 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4915 register has useful data, and return the opposite if the most
4916 significant byte does. */
4917
4918 bool
4919 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4920 bool first ATTRIBUTE_UNUSED)
4921 {
4922
4923 /* Small composite types are always padded upward. */
4924 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4925 {
4926 HOST_WIDE_INT size;
4927 if (type)
4928 size = int_size_in_bytes (type);
4929 else
4930 /* No frontends can create types with variable-sized modes, so we
4931 shouldn't be asked to pass or return them. */
4932 size = GET_MODE_SIZE (mode).to_constant ();
4933 if (size < 2 * UNITS_PER_WORD)
4934 return true;
4935 }
4936
4937 /* Otherwise, use the default padding. */
4938 return !BYTES_BIG_ENDIAN;
4939 }
4940
4941 static scalar_int_mode
4942 aarch64_libgcc_cmp_return_mode (void)
4943 {
4944 return SImode;
4945 }
4946
4947 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4948
4949 /* We use the 12-bit shifted immediate arithmetic instructions so values
4950 must be multiple of (1 << 12), i.e. 4096. */
4951 #define ARITH_FACTOR 4096
4952
4953 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4954 #error Cannot use simple address calculation for stack probing
4955 #endif
4956
4957 /* The pair of scratch registers used for stack probing. */
4958 #define PROBE_STACK_FIRST_REG R9_REGNUM
4959 #define PROBE_STACK_SECOND_REG R10_REGNUM
4960
4961 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4962 inclusive. These are offsets from the current stack pointer. */
4963
4964 static void
4965 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4966 {
4967 HOST_WIDE_INT size;
4968 if (!poly_size.is_constant (&size))
4969 {
4970 sorry ("stack probes for SVE frames");
4971 return;
4972 }
4973
4974 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4975
4976 /* See the same assertion on PROBE_INTERVAL above. */
4977 gcc_assert ((first % ARITH_FACTOR) == 0);
4978
4979 /* See if we have a constant small number of probes to generate. If so,
4980 that's the easy case. */
4981 if (size <= PROBE_INTERVAL)
4982 {
4983 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4984
4985 emit_set_insn (reg1,
4986 plus_constant (Pmode,
4987 stack_pointer_rtx, -(first + base)));
4988 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4989 }
4990
4991 /* The run-time loop is made up of 8 insns in the generic case while the
4992 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
4993 else if (size <= 4 * PROBE_INTERVAL)
4994 {
4995 HOST_WIDE_INT i, rem;
4996
4997 emit_set_insn (reg1,
4998 plus_constant (Pmode,
4999 stack_pointer_rtx,
5000 -(first + PROBE_INTERVAL)));
5001 emit_stack_probe (reg1);
5002
5003 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5004 it exceeds SIZE. If only two probes are needed, this will not
5005 generate any code. Then probe at FIRST + SIZE. */
5006 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5007 {
5008 emit_set_insn (reg1,
5009 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5010 emit_stack_probe (reg1);
5011 }
5012
5013 rem = size - (i - PROBE_INTERVAL);
5014 if (rem > 256)
5015 {
5016 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5017
5018 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5019 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5020 }
5021 else
5022 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5023 }
5024
5025 /* Otherwise, do the same as above, but in a loop. Note that we must be
5026 extra careful with variables wrapping around because we might be at
5027 the very top (or the very bottom) of the address space and we have
5028 to be able to handle this case properly; in particular, we use an
5029 equality test for the loop condition. */
5030 else
5031 {
5032 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5033
5034 /* Step 1: round SIZE to the previous multiple of the interval. */
5035
5036 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5037
5038
5039 /* Step 2: compute initial and final value of the loop counter. */
5040
5041 /* TEST_ADDR = SP + FIRST. */
5042 emit_set_insn (reg1,
5043 plus_constant (Pmode, stack_pointer_rtx, -first));
5044
5045 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5046 HOST_WIDE_INT adjustment = - (first + rounded_size);
5047 if (! aarch64_uimm12_shift (adjustment))
5048 {
5049 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5050 true, Pmode);
5051 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5052 }
5053 else
5054 emit_set_insn (reg2,
5055 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5056
5057 /* Step 3: the loop
5058
5059 do
5060 {
5061 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5062 probe at TEST_ADDR
5063 }
5064 while (TEST_ADDR != LAST_ADDR)
5065
5066 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5067 until it is equal to ROUNDED_SIZE. */
5068
5069 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5070
5071
5072 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5073 that SIZE is equal to ROUNDED_SIZE. */
5074
5075 if (size != rounded_size)
5076 {
5077 HOST_WIDE_INT rem = size - rounded_size;
5078
5079 if (rem > 256)
5080 {
5081 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5082
5083 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5084 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5085 }
5086 else
5087 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5088 }
5089 }
5090
5091 /* Make sure nothing is scheduled before we are done. */
5092 emit_insn (gen_blockage ());
5093 }
5094
5095 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5096 absolute addresses. */
5097
5098 const char *
5099 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5100 {
5101 static int labelno = 0;
5102 char loop_lab[32];
5103 rtx xops[2];
5104
5105 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5106
5107 /* Loop. */
5108 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5109
5110 HOST_WIDE_INT stack_clash_probe_interval
5111 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5112
5113 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5114 xops[0] = reg1;
5115 HOST_WIDE_INT interval;
5116 if (flag_stack_clash_protection)
5117 interval = stack_clash_probe_interval;
5118 else
5119 interval = PROBE_INTERVAL;
5120
5121 gcc_assert (aarch64_uimm12_shift (interval));
5122 xops[1] = GEN_INT (interval);
5123
5124 output_asm_insn ("sub\t%0, %0, %1", xops);
5125
5126 /* If doing stack clash protection then we probe up by the ABI specified
5127 amount. We do this because we're dropping full pages at a time in the
5128 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5129 if (flag_stack_clash_protection)
5130 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5131 else
5132 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5133
5134 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5135 by this amount for each iteration. */
5136 output_asm_insn ("str\txzr, [%0, %1]", xops);
5137
5138 /* Test if TEST_ADDR == LAST_ADDR. */
5139 xops[1] = reg2;
5140 output_asm_insn ("cmp\t%0, %1", xops);
5141
5142 /* Branch. */
5143 fputs ("\tb.ne\t", asm_out_file);
5144 assemble_name_raw (asm_out_file, loop_lab);
5145 fputc ('\n', asm_out_file);
5146
5147 return "";
5148 }
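/* Assuming the default 4 KiB probe interval, the loop emitted above looks
   roughly like this with the x9/x10 scratch registers chosen by
   aarch64_emit_probe_stack_range:

     .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0

   With -fstack-clash-protection the interval comes from the guard-size
   parameter and the probe is written STACK_CLASH_CALLER_GUARD bytes above
   the new test address instead of at offset 0.  */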
5149
5150 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5151 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5152 of GUARD_SIZE. When a probe is emitted it is done at most
5153 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
5154 at most MIN_PROBE_THRESHOLD. By the end of this function
5155 BASE = BASE - ADJUSTMENT. */
5156
5157 const char *
5158 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5159 rtx min_probe_threshold, rtx guard_size)
5160 {
5161 /* This function is not allowed to use any instruction generation function
5162 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5163 so instead emit the code you want using output_asm_insn. */
5164 gcc_assert (flag_stack_clash_protection);
5165 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5166 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5167
5168 /* The minimum required allocation before the residual requires probing. */
5169 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5170
5171 /* Clamp the value down to the nearest value that can be used with a cmp. */
5172 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5173 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5174
5175 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5176 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5177
5178 static int labelno = 0;
5179 char loop_start_lab[32];
5180 char loop_end_lab[32];
5181 rtx xops[2];
5182
5183 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5184 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5185
5186 /* Emit loop start label. */
5187 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5188
5189 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5190 xops[0] = adjustment;
5191 xops[1] = probe_offset_value_rtx;
5192 output_asm_insn ("cmp\t%0, %1", xops);
5193
5194 /* Branch to end if not enough adjustment to probe. */
5195 fputs ("\tb.lt\t", asm_out_file);
5196 assemble_name_raw (asm_out_file, loop_end_lab);
5197 fputc ('\n', asm_out_file);
5198
5199 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5200 xops[0] = base;
5201 xops[1] = probe_offset_value_rtx;
5202 output_asm_insn ("sub\t%0, %0, %1", xops);
5203
5204 /* Probe at BASE. */
5205 xops[1] = const0_rtx;
5206 output_asm_insn ("str\txzr, [%0, %1]", xops);
5207
5208 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5209 xops[0] = adjustment;
5210 xops[1] = probe_offset_value_rtx;
5211 output_asm_insn ("sub\t%0, %0, %1", xops);
5212
5213 /* Branch to start if still more bytes to allocate. */
5214 fputs ("\tb\t", asm_out_file);
5215 assemble_name_raw (asm_out_file, loop_start_lab);
5216 fputc ('\n', asm_out_file);
5217
5218 /* No probe needed; leave the loop. */
5219 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5220
5221 /* BASE = BASE - ADJUSTMENT. */
5222 xops[0] = base;
5223 xops[1] = adjustment;
5224 output_asm_insn ("sub\t%0, %0, %1", xops);
5225 return "";
5226 }
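
/* As a rough illustration only (register choices are just examples and the
   clamped residual probe guard depends on the configured guard size; the
   value below assumes the default 64KB guard and 1KB caller buffer, giving
   a clamped guard of 61440), with ADJUSTMENT in x12 and BASE being the
   stack pointer the printed sequence looks like:

	.SVLPSPL0:
	cmp	x12, 61440
	b.lt	.SVLPEND0
	sub	sp, sp, 61440
	str	xzr, [sp, 0]
	sub	x12, x12, 61440
	b	.SVLPSPL0
	.SVLPEND0:
	sub	sp, sp, x12  */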
5227
5228 /* Determine whether a frame chain needs to be generated. */
5229 static bool
5230 aarch64_needs_frame_chain (void)
5231 {
5232 /* Force a frame chain for EH returns so the return address is at FP+8. */
5233 if (frame_pointer_needed || crtl->calls_eh_return)
5234 return true;
5235
5236 /* A leaf function cannot have calls or write LR. */
5237 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5238
5239 /* Don't use a frame chain in leaf functions if leaf frame pointers
5240 are disabled. */
5241 if (flag_omit_leaf_frame_pointer && is_leaf)
5242 return false;
5243
5244 return aarch64_use_frame_pointer;
5245 }
5246
5247 /* Mark the registers that need to be saved by the callee and calculate
5248 the size of the callee-saved registers area and frame record (both FP
5249 and LR may be omitted). */
5250 static void
5251 aarch64_layout_frame (void)
5252 {
5253 HOST_WIDE_INT offset = 0;
5254 int regno, last_fp_reg = INVALID_REGNUM;
5255 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5256
5257 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5258
5259 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5260 the mid-end is doing. */
5261 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5262
5263 #define SLOT_NOT_REQUIRED (-2)
5264 #define SLOT_REQUIRED (-1)
5265
5266 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5267 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5268
5269 /* If this is a non-leaf simd function with calls we assume that
5270 at least one of those calls is to a non-simd function and thus
5271 we must save V8 to V23 in the prologue. */
5272
5273 if (simd_function && !crtl->is_leaf)
5274 {
5275 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5276 if (FP_SIMD_SAVED_REGNUM_P (regno))
5277 df_set_regs_ever_live (regno, true);
5278 }
5279
5280 /* First mark all the registers that really need to be saved... */
5281 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5282 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5283
5284 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5285 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5286
5287 /* ... that includes the eh data registers (if needed)... */
5288 if (crtl->calls_eh_return)
5289 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5290 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5291 = SLOT_REQUIRED;
5292
5293 /* ... and any callee saved register that dataflow says is live. */
5294 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5295 if (df_regs_ever_live_p (regno)
5296 && (regno == R30_REGNUM
5297 || !call_used_regs[regno]))
5298 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5299
5300 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5301 if (df_regs_ever_live_p (regno)
5302 && (!call_used_regs[regno]
5303 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5304 {
5305 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5306 last_fp_reg = regno;
5307 }
5308
5309 if (cfun->machine->frame.emit_frame_chain)
5310 {
5311 /* FP and LR are placed in the linkage record. */
5312 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5313 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5314 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5315 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5316 offset = 2 * UNITS_PER_WORD;
5317 }
5318
5319 /* With stack-clash, LR must be saved in non-leaf functions. */
5320 gcc_assert (crtl->is_leaf
5321 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5322 != SLOT_NOT_REQUIRED));
5323
5324 /* Now assign stack slots for them. */
5325 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5326 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5327 {
5328 cfun->machine->frame.reg_offset[regno] = offset;
5329 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5330 cfun->machine->frame.wb_candidate1 = regno;
5331 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5332 cfun->machine->frame.wb_candidate2 = regno;
5333 offset += UNITS_PER_WORD;
5334 }
5335
5336 HOST_WIDE_INT max_int_offset = offset;
5337 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5338 bool has_align_gap = offset != max_int_offset;
5339
5340 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5341 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5342 {
5343 /* If there is an alignment gap between integer and fp callee-saves,
5344 allocate the last fp register to it if possible. */
5345 if (regno == last_fp_reg
5346 && has_align_gap
5347 && !simd_function
5348 && (offset & 8) == 0)
5349 {
5350 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5351 break;
5352 }
5353
5354 cfun->machine->frame.reg_offset[regno] = offset;
5355 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5356 cfun->machine->frame.wb_candidate1 = regno;
5357 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5358 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5359 cfun->machine->frame.wb_candidate2 = regno;
5360 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5361 }
5362
5363 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5364
5365 cfun->machine->frame.saved_regs_size = offset;
5366
5367 HOST_WIDE_INT varargs_and_saved_regs_size
5368 = offset + cfun->machine->frame.saved_varargs_size;
5369
5370 cfun->machine->frame.hard_fp_offset
5371 = aligned_upper_bound (varargs_and_saved_regs_size
5372 + get_frame_size (),
5373 STACK_BOUNDARY / BITS_PER_UNIT);
5374
5375 /* Both these values are already aligned. */
5376 gcc_assert (multiple_p (crtl->outgoing_args_size,
5377 STACK_BOUNDARY / BITS_PER_UNIT));
5378 cfun->machine->frame.frame_size
5379 = (cfun->machine->frame.hard_fp_offset
5380 + crtl->outgoing_args_size);
5381
5382 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5383
5384 cfun->machine->frame.initial_adjust = 0;
5385 cfun->machine->frame.final_adjust = 0;
5386 cfun->machine->frame.callee_adjust = 0;
5387 cfun->machine->frame.callee_offset = 0;
5388
5389 HOST_WIDE_INT max_push_offset = 0;
5390 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5391 max_push_offset = 512;
5392 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5393 max_push_offset = 256;
5394
5395 HOST_WIDE_INT const_size, const_fp_offset;
5396 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5397 && const_size < max_push_offset
5398 && known_eq (crtl->outgoing_args_size, 0))
5399 {
5400 /* Simple, small frame with no outgoing arguments:
5401 stp reg1, reg2, [sp, -frame_size]!
5402 stp reg3, reg4, [sp, 16] */
5403 cfun->machine->frame.callee_adjust = const_size;
5404 }
5405 else if (known_lt (crtl->outgoing_args_size
5406 + cfun->machine->frame.saved_regs_size, 512)
5407 && !(cfun->calls_alloca
5408 && known_lt (cfun->machine->frame.hard_fp_offset,
5409 max_push_offset)))
5410 {
5411 /* Frame with small outgoing arguments:
5412 sub sp, sp, frame_size
5413 stp reg1, reg2, [sp, outgoing_args_size]
5414 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5415 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5416 cfun->machine->frame.callee_offset
5417 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5418 }
5419 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5420 && const_fp_offset < max_push_offset)
5421 {
5422 /* Frame with large outgoing arguments but a small local area:
5423 stp reg1, reg2, [sp, -hard_fp_offset]!
5424 stp reg3, reg4, [sp, 16]
5425 sub sp, sp, outgoing_args_size */
5426 cfun->machine->frame.callee_adjust = const_fp_offset;
5427 cfun->machine->frame.final_adjust
5428 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5429 }
5430 else
5431 {
5432 /* Frame with large local area and outgoing arguments using frame pointer:
5433 sub sp, sp, hard_fp_offset
5434 stp x29, x30, [sp, 0]
5435 add x29, sp, 0
5436 stp reg3, reg4, [sp, 16]
5437 sub sp, sp, outgoing_args_size */
5438 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5439 cfun->machine->frame.final_adjust
5440 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5441 }
5442
5443 cfun->machine->frame.laid_out = true;
5444 }
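
/* A small worked example (illustrative only): a function that needs a frame
   chain, additionally saves x19, has 16 bytes of locals and no outgoing
   arguments ends up with reg_offset[x29] = 0, reg_offset[x30] = 8,
   reg_offset[x19] = 16, saved_regs_size = 32 and
   hard_fp_offset = frame_size = 48; since 48 < max_push_offset and there
   are no outgoing arguments, callee_adjust = 48 (the first of the cases
   above).  */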
5445
5446 /* Return true if the register REGNO is saved on entry to
5447 the current function. */
5448
5449 static bool
5450 aarch64_register_saved_on_entry (int regno)
5451 {
5452 return cfun->machine->frame.reg_offset[regno] >= 0;
5453 }
5454
5455 /* Return the next register up from REGNO up to LIMIT for the callee
5456 to save. */
5457
5458 static unsigned
5459 aarch64_next_callee_save (unsigned regno, unsigned limit)
5460 {
5461 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5462 regno++;
5463 return regno;
5464 }
5465
5466 /* Push the register number REGNO of mode MODE to the stack with write-back
5467 adjusting the stack by ADJUSTMENT. */
5468
5469 static void
5470 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5471 HOST_WIDE_INT adjustment)
5472 {
5473 rtx base_rtx = stack_pointer_rtx;
5474 rtx insn, reg, mem;
5475
5476 reg = gen_rtx_REG (mode, regno);
5477 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5478 plus_constant (Pmode, base_rtx, -adjustment));
5479 mem = gen_frame_mem (mode, mem);
5480
5481 insn = emit_move_insn (mem, reg);
5482 RTX_FRAME_RELATED_P (insn) = 1;
5483 }
5484
5485 /* Generate and return an instruction to store the pair of registers
5486 REG and REG2 of mode MODE to location BASE with write-back adjusting
5487 the stack location BASE by ADJUSTMENT. */
5488
5489 static rtx
5490 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5491 HOST_WIDE_INT adjustment)
5492 {
5493 switch (mode)
5494 {
5495 case E_DImode:
5496 return gen_storewb_pairdi_di (base, base, reg, reg2,
5497 GEN_INT (-adjustment),
5498 GEN_INT (UNITS_PER_WORD - adjustment));
5499 case E_DFmode:
5500 return gen_storewb_pairdf_di (base, base, reg, reg2,
5501 GEN_INT (-adjustment),
5502 GEN_INT (UNITS_PER_WORD - adjustment));
5503 case E_TFmode:
5504 return gen_storewb_pairtf_di (base, base, reg, reg2,
5505 GEN_INT (-adjustment),
5506 GEN_INT (UNITS_PER_VREG - adjustment));
5507 default:
5508 gcc_unreachable ();
5509 }
5510 }
5511
5512 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5513 stack pointer by ADJUSTMENT. */
5514
5515 static void
5516 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5517 {
5518 rtx_insn *insn;
5519 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5520
5521 if (regno2 == INVALID_REGNUM)
5522 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5523
5524 rtx reg1 = gen_rtx_REG (mode, regno1);
5525 rtx reg2 = gen_rtx_REG (mode, regno2);
5526
5527 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5528 reg2, adjustment));
5529 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5530 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5531 RTX_FRAME_RELATED_P (insn) = 1;
5532 }
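
/* For example (illustrative values), pushing x29 and x30 with a 48-byte
   adjustment produces a single pre-indexed writeback store pair,
   "stp x29, x30, [sp, -48]!", which both allocates the 48 bytes and
   performs the two saves.  */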
5533
5534 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
5535 adjusting it by ADJUSTMENT afterwards. */
5536
5537 static rtx
5538 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5539 HOST_WIDE_INT adjustment)
5540 {
5541 switch (mode)
5542 {
5543 case E_DImode:
5544 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5545 GEN_INT (UNITS_PER_WORD));
5546 case E_DFmode:
5547 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5548 GEN_INT (UNITS_PER_WORD));
5549 case E_TFmode:
5550 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5551 GEN_INT (UNITS_PER_VREG));
5552 default:
5553 gcc_unreachable ();
5554 }
5555 }
5556
5557 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5558 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5559 into CFI_OPS. */
5560
5561 static void
5562 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5563 rtx *cfi_ops)
5564 {
5565 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5566 rtx reg1 = gen_rtx_REG (mode, regno1);
5567
5568 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5569
5570 if (regno2 == INVALID_REGNUM)
5571 {
5572 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5573 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5574 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5575 }
5576 else
5577 {
5578 rtx reg2 = gen_rtx_REG (mode, regno2);
5579 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5580 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5581 reg2, adjustment));
5582 }
5583 }
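
/* The epilogue counterpart of the push above: popping x29 and x30 with a
   48-byte adjustment produces the post-indexed load pair
   "ldp x29, x30, [sp], 48" (illustrative values).  */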
5584
5585 /* Generate and return a store pair instruction of mode MODE to store
5586 register REG1 to MEM1 and register REG2 to MEM2. */
5587
5588 static rtx
5589 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5590 rtx reg2)
5591 {
5592 switch (mode)
5593 {
5594 case E_DImode:
5595 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5596
5597 case E_DFmode:
5598 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5599
5600 case E_TFmode:
5601 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5602
5603 default:
5604 gcc_unreachable ();
5605 }
5606 }
5607
5608 /* Generate and return a load pair instruction of mode MODE to load register
5609 REG1 from MEM1 and register REG2 from MEM2. */
5610
5611 static rtx
5612 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5613 rtx mem2)
5614 {
5615 switch (mode)
5616 {
5617 case E_DImode:
5618 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5619
5620 case E_DFmode:
5621 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5622
5623 case E_TFmode:
5624 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5625
5626 default:
5627 gcc_unreachable ();
5628 }
5629 }
5630
5631 /* Return TRUE if return address signing should be enabled for the current
5632 function, otherwise return FALSE. */
5633
5634 bool
5635 aarch64_return_address_signing_enabled (void)
5636 {
5637 /* This function should only be called after the frame has been laid out. */
5638 gcc_assert (cfun->machine->frame.laid_out);
5639
5640 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we sign a leaf function
5641 only if its LR is pushed onto the stack. */
5642 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5643 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5644 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5645 }
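
/* For example (assuming the usual option mapping), with
   -mbranch-protection=pac-ret a leaf function that does not save LR is left
   unsigned, whereas -mbranch-protection=pac-ret+leaf (AARCH64_FUNCTION_ALL)
   signs every function.  */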
5646
5647 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5648 bool
5649 aarch64_bti_enabled (void)
5650 {
5651 return (aarch64_enable_bti == 1);
5652 }
5653
5654 /* Emit code to save the callee-saved registers from register number START
5655 to LIMIT to the stack at the location starting at offset START_OFFSET,
5656 skipping any write-back candidates if SKIP_WB is true. */
5657
5658 static void
5659 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5660 unsigned start, unsigned limit, bool skip_wb)
5661 {
5662 rtx_insn *insn;
5663 unsigned regno;
5664 unsigned regno2;
5665
5666 for (regno = aarch64_next_callee_save (start, limit);
5667 regno <= limit;
5668 regno = aarch64_next_callee_save (regno + 1, limit))
5669 {
5670 rtx reg, mem;
5671 poly_int64 offset;
5672 int offset_diff;
5673
5674 if (skip_wb
5675 && (regno == cfun->machine->frame.wb_candidate1
5676 || regno == cfun->machine->frame.wb_candidate2))
5677 continue;
5678
5679 if (cfun->machine->reg_is_wrapped_separately[regno])
5680 continue;
5681
5682 reg = gen_rtx_REG (mode, regno);
5683 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5684 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5685 offset));
5686
5687 regno2 = aarch64_next_callee_save (regno + 1, limit);
5688 offset_diff = cfun->machine->frame.reg_offset[regno2]
5689 - cfun->machine->frame.reg_offset[regno];
5690
5691 if (regno2 <= limit
5692 && !cfun->machine->reg_is_wrapped_separately[regno2]
5693 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5694 {
5695 rtx reg2 = gen_rtx_REG (mode, regno2);
5696 rtx mem2;
5697
5698 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5699 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5700 offset));
5701 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5702 reg2));
5703
5704 /* The first part of a frame-related parallel insn is
5705 always assumed to be relevant to the frame
5706 calculations; subsequent parts are only
5707 frame-related if explicitly marked. */
5708 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5709 regno = regno2;
5710 }
5711 else
5712 insn = emit_move_insn (mem, reg);
5713
5714 RTX_FRAME_RELATED_P (insn) = 1;
5715 }
5716 }
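
/* For instance (illustrative offsets), if x19 and x20 are both to be saved,
   at offsets 16 and 24, the offset difference equals
   GET_MODE_SIZE (DImode), so the two saves are merged into a single
   "stp x19, x20, [sp, 16]" rather than two separate str instructions.  */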
5717
5718 /* Emit code to restore the callee registers of mode MODE from register
5719 number START up to and including LIMIT. Restore from the stack offset
5720 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5721 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5722
5723 static void
5724 aarch64_restore_callee_saves (machine_mode mode,
5725 poly_int64 start_offset, unsigned start,
5726 unsigned limit, bool skip_wb, rtx *cfi_ops)
5727 {
5728 rtx base_rtx = stack_pointer_rtx;
5729 unsigned regno;
5730 unsigned regno2;
5731 poly_int64 offset;
5732
5733 for (regno = aarch64_next_callee_save (start, limit);
5734 regno <= limit;
5735 regno = aarch64_next_callee_save (regno + 1, limit))
5736 {
5737 if (cfun->machine->reg_is_wrapped_separately[regno])
5738 continue;
5739
5740 rtx reg, mem;
5741 int offset_diff;
5742
5743 if (skip_wb
5744 && (regno == cfun->machine->frame.wb_candidate1
5745 || regno == cfun->machine->frame.wb_candidate2))
5746 continue;
5747
5748 reg = gen_rtx_REG (mode, regno);
5749 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5750 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5751
5752 regno2 = aarch64_next_callee_save (regno + 1, limit);
5753 offset_diff = cfun->machine->frame.reg_offset[regno2]
5754 - cfun->machine->frame.reg_offset[regno];
5755
5756 if (regno2 <= limit
5757 && !cfun->machine->reg_is_wrapped_separately[regno2]
5758 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5759 {
5760 rtx reg2 = gen_rtx_REG (mode, regno2);
5761 rtx mem2;
5762
5763 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5764 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5765 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5766
5767 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5768 regno = regno2;
5769 }
5770 else
5771 emit_move_insn (reg, mem);
5772 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5773 }
5774 }
5775
5776 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5777 of MODE. */
5778
5779 static inline bool
5780 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5781 {
5782 HOST_WIDE_INT multiple;
5783 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5784 && IN_RANGE (multiple, -8, 7));
5785 }
5786
5787 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5788 of MODE. */
5789
5790 static inline bool
5791 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5792 {
5793 HOST_WIDE_INT multiple;
5794 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5795 && IN_RANGE (multiple, 0, 63));
5796 }
5797
5798 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5799 of MODE. */
5800
5801 bool
5802 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5803 {
5804 HOST_WIDE_INT multiple;
5805 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5806 && IN_RANGE (multiple, -64, 63));
5807 }
5808
5809 /* Return true if OFFSET is a signed 9-bit value. */
5810
5811 bool
5812 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5813 poly_int64 offset)
5814 {
5815 HOST_WIDE_INT const_offset;
5816 return (offset.is_constant (&const_offset)
5817 && IN_RANGE (const_offset, -256, 255));
5818 }
5819
5820 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5821 of MODE. */
5822
5823 static inline bool
5824 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5825 {
5826 HOST_WIDE_INT multiple;
5827 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5828 && IN_RANGE (multiple, -256, 255));
5829 }
5830
5831 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5832 of MODE. */
5833
5834 static inline bool
5835 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5836 {
5837 HOST_WIDE_INT multiple;
5838 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5839 && IN_RANGE (multiple, 0, 4095));
5840 }
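
/* As a concrete instance of the ranges above, for DImode (8-byte units):
   offset_4bit_signed_scaled_p accepts -64..56,
   offset_6bit_unsigned_scaled_p accepts 0..504,
   aarch64_offset_7bit_signed_scaled_p accepts -512..504,
   offset_9bit_signed_scaled_p accepts -2048..2040 and
   offset_12bit_unsigned_scaled_p accepts 0..32760 (all in steps of 8),
   while aarch64_offset_9bit_signed_unscaled_p accepts any constant offset
   in -256..255.  */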
5841
5842 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5843
5844 static sbitmap
5845 aarch64_get_separate_components (void)
5846 {
5847 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5848 bitmap_clear (components);
5849
5850 /* The registers we need saved to the frame. */
5851 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5852 if (aarch64_register_saved_on_entry (regno))
5853 {
5854 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5855 if (!frame_pointer_needed)
5856 offset += cfun->machine->frame.frame_size
5857 - cfun->machine->frame.hard_fp_offset;
5858 /* Check that we can access the stack slot of the register with one
5859 direct load with no adjustments needed. */
5860 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5861 bitmap_set_bit (components, regno);
5862 }
5863
5864 /* Don't mess with the hard frame pointer. */
5865 if (frame_pointer_needed)
5866 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5867
5868 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5869 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5870 /* If registers have been chosen to be stored/restored with
5871 writeback don't interfere with them to avoid having to output explicit
5872 stack adjustment instructions. */
5873 if (reg2 != INVALID_REGNUM)
5874 bitmap_clear_bit (components, reg2);
5875 if (reg1 != INVALID_REGNUM)
5876 bitmap_clear_bit (components, reg1);
5877
5878 bitmap_clear_bit (components, LR_REGNUM);
5879 bitmap_clear_bit (components, SP_REGNUM);
5880
5881 return components;
5882 }
5883
5884 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5885
5886 static sbitmap
5887 aarch64_components_for_bb (basic_block bb)
5888 {
5889 bitmap in = DF_LIVE_IN (bb);
5890 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5891 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5892 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5893
5894 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5895 bitmap_clear (components);
5896
5897 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5898 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5899 if ((!call_used_regs[regno]
5900 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5901 && (bitmap_bit_p (in, regno)
5902 || bitmap_bit_p (gen, regno)
5903 || bitmap_bit_p (kill, regno)))
5904 {
5905 unsigned regno2, offset, offset2;
5906 bitmap_set_bit (components, regno);
5907
5908 /* If there is a callee-save at an adjacent offset, add it too
5909 to increase the use of LDP/STP. */
5910 offset = cfun->machine->frame.reg_offset[regno];
5911 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5912
5913 if (regno2 <= LAST_SAVED_REGNUM)
5914 {
5915 offset2 = cfun->machine->frame.reg_offset[regno2];
5916 if ((offset & ~8) == (offset2 & ~8))
5917 bitmap_set_bit (components, regno2);
5918 }
5919 }
5920
5921 return components;
5922 }
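
/* An illustrative instance of the pairing above: if x21 is live in a block
   and saved at offset 24, then (offset & 8) != 0 gives regno2 = x20; with
   x20 saved at offset 16 we have (24 & ~8) == (16 & ~8) == 16, so x20 is
   added as well and the two can later be handled with a single ldp/stp.  */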
5923
5924 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5925 Nothing to do for aarch64. */
5926
5927 static void
5928 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5929 {
5930 }
5931
5932 /* Return the next set bit in BMP from START onwards. Return the total number
5933 of bits in BMP if no set bit is found at or after START. */
5934
5935 static unsigned int
5936 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5937 {
5938 unsigned int nbits = SBITMAP_SIZE (bmp);
5939 if (start == nbits)
5940 return start;
5941
5942 gcc_assert (start < nbits);
5943 for (unsigned int i = start; i < nbits; i++)
5944 if (bitmap_bit_p (bmp, i))
5945 return i;
5946
5947 return nbits;
5948 }
5949
5950 /* Do the work for aarch64_emit_prologue_components and
5951 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5952 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5953 for these components or the epilogue sequence. That is, it determines
5954 whether we should emit stores or loads and what kind of CFA notes to attach
5955 to the insns. Otherwise the logic for the two sequences is very
5956 similar. */
5957
5958 static void
5959 aarch64_process_components (sbitmap components, bool prologue_p)
5960 {
5961 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5962 ? HARD_FRAME_POINTER_REGNUM
5963 : STACK_POINTER_REGNUM);
5964
5965 unsigned last_regno = SBITMAP_SIZE (components);
5966 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5967 rtx_insn *insn = NULL;
5968
5969 while (regno != last_regno)
5970 {
5971 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5972 so DFmode for the vector registers is enough. For simd functions
5973 we want to save the low 128 bits. */
5974 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5975
5976 rtx reg = gen_rtx_REG (mode, regno);
5977 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5978 if (!frame_pointer_needed)
5979 offset += cfun->machine->frame.frame_size
5980 - cfun->machine->frame.hard_fp_offset;
5981 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5982 rtx mem = gen_frame_mem (mode, addr);
5983
5984 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5985 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5986 /* No more registers to handle after REGNO.
5987 Emit a single save/restore and exit. */
5988 if (regno2 == last_regno)
5989 {
5990 insn = emit_insn (set);
5991 RTX_FRAME_RELATED_P (insn) = 1;
5992 if (prologue_p)
5993 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5994 else
5995 add_reg_note (insn, REG_CFA_RESTORE, reg);
5996 break;
5997 }
5998
5999 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6000 /* The next register is not of the same class or its offset is not
6001 mergeable with the current one into a pair. */
6002 if (!satisfies_constraint_Ump (mem)
6003 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6004 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
6005 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6006 GET_MODE_SIZE (mode)))
6007 {
6008 insn = emit_insn (set);
6009 RTX_FRAME_RELATED_P (insn) = 1;
6010 if (prologue_p)
6011 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6012 else
6013 add_reg_note (insn, REG_CFA_RESTORE, reg);
6014
6015 regno = regno2;
6016 continue;
6017 }
6018
6019 /* REGNO2 can be saved/restored in a pair with REGNO. */
6020 rtx reg2 = gen_rtx_REG (mode, regno2);
6021 if (!frame_pointer_needed)
6022 offset2 += cfun->machine->frame.frame_size
6023 - cfun->machine->frame.hard_fp_offset;
6024 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6025 rtx mem2 = gen_frame_mem (mode, addr2);
6026 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6027 : gen_rtx_SET (reg2, mem2);
6028
6029 if (prologue_p)
6030 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6031 else
6032 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6033
6034 RTX_FRAME_RELATED_P (insn) = 1;
6035 if (prologue_p)
6036 {
6037 add_reg_note (insn, REG_CFA_OFFSET, set);
6038 add_reg_note (insn, REG_CFA_OFFSET, set2);
6039 }
6040 else
6041 {
6042 add_reg_note (insn, REG_CFA_RESTORE, reg);
6043 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6044 }
6045
6046 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6047 }
6048 }
6049
6050 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6051
6052 static void
6053 aarch64_emit_prologue_components (sbitmap components)
6054 {
6055 aarch64_process_components (components, true);
6056 }
6057
6058 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6059
6060 static void
6061 aarch64_emit_epilogue_components (sbitmap components)
6062 {
6063 aarch64_process_components (components, false);
6064 }
6065
6066 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6067
6068 static void
6069 aarch64_set_handled_components (sbitmap components)
6070 {
6071 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6072 if (bitmap_bit_p (components, regno))
6073 cfun->machine->reg_is_wrapped_separately[regno] = true;
6074 }
6075
6076 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
6077 determine the probe offset for alloca. */
6078
6079 static HOST_WIDE_INT
6080 aarch64_stack_clash_protection_alloca_probe_range (void)
6081 {
6082 return STACK_CLASH_CALLER_GUARD;
6083 }
6084
6085
6086 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
6087 registers. If POLY_SIZE is not large enough to require a probe this function
6088 will only adjust the stack. When allocating the stack space
6089 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
6090 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
6091 arguments. If we are, then we ensure that any allocation larger than the ABI
6092 defined buffer needs a probe so that the invariant of having a 1KB buffer is
6093 maintained.
6094
6095 We emit barriers after each stack adjustment to prevent optimizations from
6096 breaking the invariant that we never drop the stack by more than a page.
6097 This invariant is needed to make it easier to handle asynchronous events
6098 correctly: if we were to allow the stack to be dropped by more than a page
6099 and then emit several probes afterwards, and a signal arrived somewhere in
6100 between, the signal handler would not know the state of the stack and could
6101 make no assumptions about which pages have been probed. */
6102
6103 static void
6104 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
6105 poly_int64 poly_size,
6106 bool frame_related_p,
6107 bool final_adjustment_p)
6108 {
6109 HOST_WIDE_INT guard_size
6110 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6111 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6112 /* When doing the final adjustment for the outgoing argument size we can't
6113 assume that LR was saved at position 0. So subtract its offset from the
6114 ABI safe buffer so that we don't accidentally allow an adjustment that
6115 would result in an allocation larger than the ABI buffer without
6116 probing. */
6117 HOST_WIDE_INT min_probe_threshold
6118 = final_adjustment_p
6119 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
6120 : guard_size - guard_used_by_caller;
6121
6122 poly_int64 frame_size = cfun->machine->frame.frame_size;
6123
6124 /* We should always have a positive probe threshold. */
6125 gcc_assert (min_probe_threshold > 0);
6126
6127 if (flag_stack_clash_protection && !final_adjustment_p)
6128 {
6129 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6130 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6131
6132 if (known_eq (frame_size, 0))
6133 {
6134 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
6135 }
6136 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
6137 && known_lt (final_adjust, guard_used_by_caller))
6138 {
6139 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
6140 }
6141 }
6142
6143 /* If SIZE is not large enough to require probing, just adjust the stack and
6144 exit. */
6145 if (known_lt (poly_size, min_probe_threshold)
6146 || !flag_stack_clash_protection)
6147 {
6148 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
6149 return;
6150 }
6151
6152 HOST_WIDE_INT size;
6153 /* Handle the SVE non-constant case first. */
6154 if (!poly_size.is_constant (&size))
6155 {
6156 if (dump_file)
6157 {
6158 fprintf (dump_file, "Stack clash SVE prologue: ");
6159 print_dec (poly_size, dump_file);
6160 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
6161 }
6162
6163 /* First calculate the amount of bytes we're actually spilling. */
6164 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
6165 poly_size, temp1, temp2, false, true);
6166
6167 rtx_insn *insn = get_last_insn ();
6168
6169 if (frame_related_p)
6170 {
6171 /* This is done to provide unwinding information for the stack
6172 adjustments we're about to do; however, to prevent the optimizers
6173 from removing the R11 move and leaving the CFA note (which would be
6174 very wrong) we tie the old and new stack pointer together.
6175 The tie will expand to nothing but the optimizers will not touch
6176 the instruction. */
6177 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6178 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
6179 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
6180
6181 /* We want the CFA independent of the stack pointer for the
6182 duration of the loop. */
6183 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
6184 RTX_FRAME_RELATED_P (insn) = 1;
6185 }
6186
6187 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
6188 rtx guard_const = gen_int_mode (guard_size, Pmode);
6189
6190 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
6191 stack_pointer_rtx, temp1,
6192 probe_const, guard_const));
6193
6194 /* Now reset the CFA register if needed. */
6195 if (frame_related_p)
6196 {
6197 add_reg_note (insn, REG_CFA_DEF_CFA,
6198 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
6199 gen_int_mode (poly_size, Pmode)));
6200 RTX_FRAME_RELATED_P (insn) = 1;
6201 }
6202
6203 return;
6204 }
6205
6206 if (dump_file)
6207 fprintf (dump_file,
6208 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
6209 " bytes, probing will be required.\n", size);
6210
6211 /* Round size to the nearest multiple of guard_size, and calculate the
6212 residual as the difference between the original size and the rounded
6213 size. */
6214 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
6215 HOST_WIDE_INT residual = size - rounded_size;
6216
6217 /* We can handle a small number of allocations/probes inline. Otherwise
6218 punt to a loop. */
6219 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
6220 {
6221 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
6222 {
6223 aarch64_sub_sp (NULL, temp2, guard_size, true);
6224 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6225 guard_used_by_caller));
6226 emit_insn (gen_blockage ());
6227 }
6228 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
6229 }
6230 else
6231 {
6232 /* Compute the ending address. */
6233 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
6234 temp1, NULL, false, true);
6235 rtx_insn *insn = get_last_insn ();
6236
6237 /* For the initial allocation, we don't have a frame pointer
6238 set up, so we always need CFI notes. If we're doing the
6239 final allocation, then we may have a frame pointer, in which
6240 case it is the CFA, otherwise we need CFI notes.
6241
6242 We can determine which allocation we are doing by looking at
6243 the value of FRAME_RELATED_P since the final allocations are not
6244 frame related. */
6245 if (frame_related_p)
6246 {
6247 /* We want the CFA independent of the stack pointer for the
6248 duration of the loop. */
6249 add_reg_note (insn, REG_CFA_DEF_CFA,
6250 plus_constant (Pmode, temp1, rounded_size));
6251 RTX_FRAME_RELATED_P (insn) = 1;
6252 }
6253
6254 /* This allocates and probes the stack. Note that this re-uses some of
6255 the existing Ada stack protection code. However we are guaranteed not
6256 to enter the non-loop or residual branches of that code.
6257
6258 The non-loop part won't be entered because if our allocation amount
6259 doesn't require a loop, the case above would handle it.
6260
6261 The residual branch won't be entered because TEMP1 is a multiple of
6262 the allocation size, so the residual will always be 0. As such, the only
6263 part we are actually using from that code is the loop setup. The
6264 actual probing is done in aarch64_output_probe_stack_range. */
6265 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6266 stack_pointer_rtx, temp1));
6267
6268 /* Now reset the CFA register if needed. */
6269 if (frame_related_p)
6270 {
6271 add_reg_note (insn, REG_CFA_DEF_CFA,
6272 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6273 RTX_FRAME_RELATED_P (insn) = 1;
6274 }
6275
6276 emit_insn (gen_blockage ());
6277 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6278 }
6279
6280 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6281 be probed. This maintains the requirement that each page is probed at
6282 least once. For initial probing we probe only if the allocation is
6283 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6284 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6285 GUARD_SIZE. This ensures that any allocation large enough to trigger
6286 a probe here gets at least one, and that any allocation not large
6287 enough for this code to emit anything for it will have had its page
6288 probed by the saving of FP/LR, either by this function or any callees. If
6289 we don't have any callees then we won't have more stack adjustments and so
6290 are still safe. */
6291 if (residual)
6292 {
6293 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6294 /* If we're doing final adjustments, and we've done any full page
6295 allocations then any residual needs to be probed. */
6296 if (final_adjustment_p && rounded_size != 0)
6297 min_probe_threshold = 0;
6298 /* If doing a small final adjustment, we always probe at offset 0.
6299 This is done to avoid issues when LR is not at position 0 or when
6300 the final adjustment is smaller than the probing offset. */
6301 else if (final_adjustment_p && rounded_size == 0)
6302 residual_probe_offset = 0;
6303
6304 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6305 if (residual >= min_probe_threshold)
6306 {
6307 if (dump_file)
6308 fprintf (dump_file,
6309 "Stack clash AArch64 prologue residuals: "
6310 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6311 "\n", residual);
6312
6313 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6314 residual_probe_offset));
6315 emit_insn (gen_blockage ());
6316 }
6317 }
6318 }
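
/* A sketch of the inline (unrolled) case, assuming the default 64KB guard
   and 1KB caller buffer and an allocation small enough to stay under the
   unroll limit: a constant initial allocation of 131172 bytes (two
   guard-sized pages plus 100 bytes of residual) is emitted roughly as

	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 65536
	str	xzr, [sp, 1024]
	sub	sp, sp, 100

   with no residual probe, since 100 is below the probing threshold.  */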
6319
6320 /* Return 1 if the register is used by the epilogue. We need to say the
6321 return register is used, but only after epilogue generation is complete.
6322 Note that in the case of sibcalls, the values "used by the epilogue" are
6323 considered live at the start of the called function.
6324
6325 For SIMD functions we need to return 1 for FP registers that are saved and
6326 restored by a function but are not zero in call_used_regs. If we do not do
6327 this, optimizations may remove the restore of the register. */
6328
6329 int
6330 aarch64_epilogue_uses (int regno)
6331 {
6332 if (epilogue_completed)
6333 {
6334 if (regno == LR_REGNUM)
6335 return 1;
6336 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6337 return 1;
6338 }
6339 return 0;
6340 }
6341
6342 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6343 is saved at BASE + OFFSET. */
6344
6345 static void
6346 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6347 rtx base, poly_int64 offset)
6348 {
6349 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6350 add_reg_note (insn, REG_CFA_EXPRESSION,
6351 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6352 }
6353
6354 /* AArch64 stack frames generated by this compiler look like:
6355
6356 +-------------------------------+
6357 | |
6358 | incoming stack arguments |
6359 | |
6360 +-------------------------------+
6361 | | <-- incoming stack pointer (aligned)
6362 | callee-allocated save area |
6363 | for register varargs |
6364 | |
6365 +-------------------------------+
6366 | local variables | <-- frame_pointer_rtx
6367 | |
6368 +-------------------------------+
6369 | padding | \
6370 +-------------------------------+ |
6371 | callee-saved registers | | frame.saved_regs_size
6372 +-------------------------------+ |
6373 | LR' | |
6374 +-------------------------------+ |
6375 | FP' | / <- hard_frame_pointer_rtx (aligned)
6376 +-------------------------------+
6377 | dynamic allocation |
6378 +-------------------------------+
6379 | padding |
6380 +-------------------------------+
6381 | outgoing stack arguments | <-- arg_pointer
6382 | |
6383 +-------------------------------+
6384 | | <-- stack_pointer_rtx (aligned)
6385
6386 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6387 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6388 unchanged.
6389
6390 By default for stack-clash we assume the guard is at least 64KB, but this
6391 value is configurable to either 4KB or 64KB. We also force the guard size to
6392 be the same as the probing interval and both values are kept in sync.
6393
6394 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6395 on the guard size) of stack space without probing.
6396
6397 When probing is needed, we emit a probe at the start of the prologue
6398 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6399
6400 We have to track how much space has been allocated and the only stores
6401 to the stack we track as implicit probes are the FP/LR stores.
6402
6403 For outgoing arguments we probe if the size is larger than 1KB, such that
6404 the ABI specified buffer is maintained for the next callee.
6405
6406 The following registers are reserved during frame layout and should not be
6407 used for any other purpose:
6408
6409 - r11: Used by stack clash protection when SVE is enabled.
6410 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6411 - r14 and r15: Used for speculation tracking.
6412 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6413 - r30(LR), r29(FP): Used by standard frame layout.
6414
6415 These registers must be avoided in frame layout related code unless the
6416 explicit intention is to interact with one of the features listed above. */
6417
6418 /* Generate the prologue instructions for entry into a function.
6419 Establish the stack frame by decreasing the stack pointer with a
6420 properly calculated size and, if necessary, create a frame record
6421 filled with the values of LR and previous frame pointer. The
6422 current FP is also set up if it is in use. */
6423
6424 void
6425 aarch64_expand_prologue (void)
6426 {
6427 poly_int64 frame_size = cfun->machine->frame.frame_size;
6428 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6429 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6430 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6431 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6432 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6433 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6434 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6435 rtx_insn *insn;
6436
6437 /* Sign the return address if return address signing is enabled. */
6438 if (aarch64_return_address_signing_enabled ())
6439 {
6440 switch (aarch64_ra_sign_key)
6441 {
6442 case AARCH64_KEY_A:
6443 insn = emit_insn (gen_paciasp ());
6444 break;
6445 case AARCH64_KEY_B:
6446 insn = emit_insn (gen_pacibsp ());
6447 break;
6448 default:
6449 gcc_unreachable ();
6450 }
6451 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6452 RTX_FRAME_RELATED_P (insn) = 1;
6453 }
6454
6455 if (flag_stack_usage_info)
6456 current_function_static_stack_size = constant_lower_bound (frame_size);
6457
6458 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6459 {
6460 if (crtl->is_leaf && !cfun->calls_alloca)
6461 {
6462 if (maybe_gt (frame_size, PROBE_INTERVAL)
6463 && maybe_gt (frame_size, get_stack_check_protect ()))
6464 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6465 (frame_size
6466 - get_stack_check_protect ()));
6467 }
6468 else if (maybe_gt (frame_size, 0))
6469 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6470 }
6471
6472 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6473 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6474
6475 /* In theory we should never have both an initial adjustment
6476 and a callee save adjustment. Verify that is the case since the
6477 code below does not handle it for -fstack-clash-protection. */
6478 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6479
6480 /* Will only probe if the initial adjustment is larger than the guard
6481 less the amount of the guard reserved for use by the caller's
6482 outgoing args. */
6483 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6484 true, false);
6485
6486 if (callee_adjust != 0)
6487 aarch64_push_regs (reg1, reg2, callee_adjust);
6488
6489 if (emit_frame_chain)
6490 {
6491 poly_int64 reg_offset = callee_adjust;
6492 if (callee_adjust == 0)
6493 {
6494 reg1 = R29_REGNUM;
6495 reg2 = R30_REGNUM;
6496 reg_offset = callee_offset;
6497 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6498 }
6499 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6500 stack_pointer_rtx, callee_offset,
6501 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6502 if (frame_pointer_needed && !frame_size.is_constant ())
6503 {
6504 /* Variable-sized frames need to describe the save slot
6505 address using DW_CFA_expression rather than DW_CFA_offset.
6506 This means that, without taking further action, the
6507 locations of the registers that we've already saved would
6508 remain based on the stack pointer even after we redefine
6509 the CFA based on the frame pointer. We therefore need new
6510 DW_CFA_expressions to re-express the save slots with addresses
6511 based on the frame pointer. */
6512 rtx_insn *insn = get_last_insn ();
6513 gcc_assert (RTX_FRAME_RELATED_P (insn));
6514
6515 /* Add an explicit CFA definition if this was previously
6516 implicit. */
6517 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6518 {
6519 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6520 callee_offset);
6521 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6522 gen_rtx_SET (hard_frame_pointer_rtx, src));
6523 }
6524
6525 /* Change the save slot expressions for the registers that
6526 we've already saved. */
6527 reg_offset -= callee_offset;
6528 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6529 reg_offset + UNITS_PER_WORD);
6530 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6531 reg_offset);
6532 }
6533 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6534 }
6535
6536 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6537 callee_adjust != 0 || emit_frame_chain);
6538 if (aarch64_simd_decl_p (cfun->decl))
6539 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6540 callee_adjust != 0 || emit_frame_chain);
6541 else
6542 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6543 callee_adjust != 0 || emit_frame_chain);
6544
6545 /* We may need to probe the final adjustment if it is larger than the guard
6546 that is assumed by the callee. */
6547 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6548 !frame_pointer_needed, true);
6549 }
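
/* Tying this to the layout example given after aarch64_layout_frame
   (callee_adjust == 48, frame chain needed, x19 saved at offset 16), the
   emitted prologue is roughly:

	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	str	x19, [sp, 16]

   preceded by a "paciasp" (or "pacibsp") when return address signing is
   enabled.  */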
6550
6551 /* Return TRUE if we can use a simple_return insn.
6552
6553 This function checks whether the callee saved stack is empty, which
6554 means no restore actions are needed. The pro_and_epilogue pass will use
6555 this to check whether shrink-wrapping opt is feasible. */
6556
6557 bool
6558 aarch64_use_return_insn_p (void)
6559 {
6560 if (!reload_completed)
6561 return false;
6562
6563 if (crtl->profile)
6564 return false;
6565
6566 return known_eq (cfun->machine->frame.frame_size, 0);
6567 }
6568
6569 /* Return false for non-leaf SIMD functions in order to avoid
6570 shrink-wrapping them, since that would lose the necessary
6571 save/restore of FP registers. */
6572
6573 bool
6574 aarch64_use_simple_return_insn_p (void)
6575 {
6576 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6577 return false;
6578
6579 return true;
6580 }
6581
6582 /* Generate the epilogue instructions for returning from a function.
6583 This is almost exactly the reverse of the prolog sequence, except
6584 that we need to insert barriers to avoid scheduling loads that read
6585 from a deallocated stack, and we optimize the unwind records by
6586 emitting them all together if possible. */
6587 void
6588 aarch64_expand_epilogue (bool for_sibcall)
6589 {
6590 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6591 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6592 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6593 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6594 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6595 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6596 rtx cfi_ops = NULL;
6597 rtx_insn *insn;
6598 /* A stack clash protection prologue may not have left EP0_REGNUM or
6599 EP1_REGNUM in a usable state. The same is true for allocations
6600 with an SVE component, since we then need both temporary registers
6601 for each allocation. For stack clash we are in a usable state if
6602 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6603 HOST_WIDE_INT guard_size
6604 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6605 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6606
6607 /* We can re-use the registers when the allocation amount is smaller than
6608 guard_size - guard_used_by_caller because we won't be doing any probes
6609 then. In such situations the register should remain live with the correct
6610 value. */
6611 bool can_inherit_p = (initial_adjust.is_constant ()
6612 && final_adjust.is_constant ())
6613 && (!flag_stack_clash_protection
6614 || known_lt (initial_adjust,
6615 guard_size - guard_used_by_caller));
6616
6617 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
6618 bool need_barrier_p
6619 = maybe_ne (get_frame_size ()
6620 + cfun->machine->frame.saved_varargs_size, 0);
6621
6622 /* Emit a barrier to prevent loads from a deallocated stack. */
6623 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6624 || cfun->calls_alloca
6625 || crtl->calls_eh_return)
6626 {
6627 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6628 need_barrier_p = false;
6629 }
6630
6631 /* Restore the stack pointer from the frame pointer if it may not
6632 be the same as the stack pointer. */
6633 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6634 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6635 if (frame_pointer_needed
6636 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6637 /* If writeback is used when restoring callee-saves, the CFA
6638 is restored on the instruction doing the writeback. */
6639 aarch64_add_offset (Pmode, stack_pointer_rtx,
6640 hard_frame_pointer_rtx, -callee_offset,
6641 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6642 else
6643 /* The case where we need to re-use the register here is very rare, so
6644 avoid the complicated condition and just always emit a move if the
6645 immediate doesn't fit. */
6646 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6647
6648 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6649 callee_adjust != 0, &cfi_ops);
6650 if (aarch64_simd_decl_p (cfun->decl))
6651 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6652 callee_adjust != 0, &cfi_ops);
6653 else
6654 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6655 callee_adjust != 0, &cfi_ops);
6656
6657 if (need_barrier_p)
6658 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6659
6660 if (callee_adjust != 0)
6661 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6662
6663 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6664 {
6665 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6666 insn = get_last_insn ();
6667 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6668 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6669 RTX_FRAME_RELATED_P (insn) = 1;
6670 cfi_ops = NULL;
6671 }
6672
6673 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6674 restrict the emit_move optimization to leaf functions. */
6675 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6676 (!can_inherit_p || !crtl->is_leaf
6677 || df_regs_ever_live_p (EP0_REGNUM)));
6678
6679 if (cfi_ops)
6680 {
6681 /* Emit delayed restores and reset the CFA to be SP. */
6682 insn = get_last_insn ();
6683 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6684 REG_NOTES (insn) = cfi_ops;
6685 RTX_FRAME_RELATED_P (insn) = 1;
6686 }
6687
6688 /* We prefer to emit the combined return/authenticate instruction RETAA,
6689 however there are three cases in which we must instead emit an explicit
6690 authentication instruction.
6691
6692 1) Sibcalls don't return in a normal way, so if we're about to call one
6693 we must authenticate.
6694
6695 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6696 generating code for !TARGET_ARMV8_3 we can't use it and must
6697 explicitly authenticate.
6698
6699 3) On an eh_return path we make extra stack adjustments to update the
6700 canonical frame address to be the exception handler's CFA. We want
6701 to authenticate using the CFA of the function which calls eh_return.
6702 */
6703 if (aarch64_return_address_signing_enabled ()
6704 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6705 {
6706 switch (aarch64_ra_sign_key)
6707 {
6708 case AARCH64_KEY_A:
6709 insn = emit_insn (gen_autiasp ());
6710 break;
6711 case AARCH64_KEY_B:
6712 insn = emit_insn (gen_autibsp ());
6713 break;
6714 default:
6715 gcc_unreachable ();
6716 }
6717 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6718 RTX_FRAME_RELATED_P (insn) = 1;
6719 }
6720
6721 /* Stack adjustment for exception handler. */
6722 if (crtl->calls_eh_return && !for_sibcall)
6723 {
6724 /* We need to unwind the stack by the offset computed by
6725 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6726 to be SP; letting the CFA move during this adjustment
6727 is just as correct as retaining the CFA from the body
6728 of the function. Therefore, do nothing special. */
6729 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6730 }
6731
6732 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6733 if (!for_sibcall)
6734 emit_jump_insn (ret_rtx);
6735 }
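
/* Continuing the same example, the matching epilogue is roughly:

	ldr	x19, [sp, 16]
	ldp	x29, x30, [sp], 48
	ret

   with an "autiasp"/"autibsp" (or a combined "retaa") when return address
   signing is enabled.  */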
6736
6737 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6738 normally or return to a previous frame after unwinding.
6739
6740 An EH return uses a single shared return sequence. The epilogue is
6741 exactly like a normal epilogue except that it has an extra input
6742 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6743 that must be applied after the frame has been destroyed. An extra label
6744 is inserted before the epilogue which initializes this register to zero,
6745 and this is the entry point for a normal return.
6746
6747 An actual EH return updates the return address, initializes the stack
6748 adjustment and jumps directly into the epilogue (bypassing the zeroing
6749 of the adjustment). Since the return address is typically saved on the
6750 stack when a function makes a call, the saved LR must be updated outside
6751 the epilogue.
6752
6753 This poses problems as the store is generated well before the epilogue,
6754 so the offset of LR is not known yet. Also optimizations will remove the
6755 store as it appears dead, even after the epilogue is generated (as the
6756 base or offset for loading LR is different in many cases).
6757
6758 To avoid these problems this implementation forces the frame pointer
6759 in eh_return functions so that the location of LR is fixed and known early.
6760 It also marks the store volatile, so no optimization is permitted to
6761 remove the store. */
6762 rtx
6763 aarch64_eh_return_handler_rtx (void)
6764 {
6765 rtx tmp = gen_frame_mem (Pmode,
6766 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6767
6768 /* Mark the store volatile, so no optimization is permitted to remove it. */
6769 MEM_VOLATILE_P (tmp) = true;
6770 return tmp;
6771 }
6772
6773 /* Output code to add DELTA to the first argument, and then jump
6774 to FUNCTION. Used for C++ multiple inheritance. */
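/* Illustrative sketch (informal, not the exact emitted sequence, which depends
   on DELTA and VCALL_OFFSET): for a small DELTA and no vcall offset the thunk
   is conceptually just "add x0, x0, #DELTA" followed by a tail-call branch
   to FUNCTION.  */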
6775 static void
6776 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6777 HOST_WIDE_INT delta,
6778 HOST_WIDE_INT vcall_offset,
6779 tree function)
6780 {
6781 /* The this pointer is always in x0. Note that this differs from
6782 Arm, where the this pointer may be bumped to r1 if r0 is required
6783 to return a pointer to an aggregate. On AArch64 a result value
6784 pointer will be in x8. */
6785 int this_regno = R0_REGNUM;
6786 rtx this_rtx, temp0, temp1, addr, funexp;
6787 rtx_insn *insn;
6788 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6789
6790 if (aarch64_bti_enabled ())
6791 emit_insn (gen_bti_c ());
6792
6793 reload_completed = 1;
6794 emit_note (NOTE_INSN_PROLOGUE_END);
6795
6796 this_rtx = gen_rtx_REG (Pmode, this_regno);
6797 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6798 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6799
6800 if (vcall_offset == 0)
6801 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6802 else
6803 {
6804 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6805
6806 addr = this_rtx;
6807 if (delta != 0)
6808 {
6809 if (delta >= -256 && delta < 256)
6810 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6811 plus_constant (Pmode, this_rtx, delta));
6812 else
6813 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6814 temp1, temp0, false);
6815 }
6816
6817 if (Pmode == ptr_mode)
6818 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6819 else
6820 aarch64_emit_move (temp0,
6821 gen_rtx_ZERO_EXTEND (Pmode,
6822 gen_rtx_MEM (ptr_mode, addr)));
6823
6824 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6825 addr = plus_constant (Pmode, temp0, vcall_offset);
6826 else
6827 {
6828 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6829 Pmode);
6830 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6831 }
6832
6833 if (Pmode == ptr_mode)
6834 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6835 else
6836 aarch64_emit_move (temp1,
6837 gen_rtx_SIGN_EXTEND (Pmode,
6838 gen_rtx_MEM (ptr_mode, addr)));
6839
6840 emit_insn (gen_add2_insn (this_rtx, temp1));
6841 }
6842
6843 /* Generate a tail call to the target function. */
6844 if (!TREE_USED (function))
6845 {
6846 assemble_external (function);
6847 TREE_USED (function) = 1;
6848 }
6849 funexp = XEXP (DECL_RTL (function), 0);
6850 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6851 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6852 SIBLING_CALL_P (insn) = 1;
6853
6854 insn = get_insns ();
6855 shorten_branches (insn);
6856
6857 assemble_start_function (thunk, fnname);
6858 final_start_function (insn, file, 1);
6859 final (insn, file, 1);
6860 final_end_function ();
6861 assemble_end_function (thunk, fnname);
6862
6863 /* Stop pretending to be a post-reload pass. */
6864 reload_completed = 0;
6865 }
6866
6867 static bool
6868 aarch64_tls_referenced_p (rtx x)
6869 {
6870 if (!TARGET_HAVE_TLS)
6871 return false;
6872 subrtx_iterator::array_type array;
6873 FOR_EACH_SUBRTX (iter, array, x, ALL)
6874 {
6875 const_rtx x = *iter;
6876 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6877 return true;
6878 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6879 TLS offsets, not real symbol references. */
6880 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6881 iter.skip_subrtxes ();
6882 }
6883 return false;
6884 }
6885
6886
6887 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6888 a left shift of 0 or 12 bits. */
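/* Illustrative examples (informal, not exhaustive): 0xabc and 0xabc000 are
   accepted (shifts of 0 and 12 respectively); 0xabc001 and 0x1000000 are
   rejected.  */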
6889 bool
6890 aarch64_uimm12_shift (HOST_WIDE_INT val)
6891 {
6892 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6893 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6894 );
6895 }
6896
6897 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6898 that can be created with a left shift of 0 or 12. */
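/* For example (illustrative): 0x456 is returned unchanged, while 0x123456
   is clamped to 0x123000.  */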
6899 static HOST_WIDE_INT
6900 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6901 {
6902 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6903 handle correctly. */
6904 gcc_assert ((val & 0xffffff) == val);
6905
6906 if (((val & 0xfff) << 0) == val)
6907 return val;
6908
6909 return val & (0xfff << 12);
6910 }
6911
6912 /* Return true if val is an immediate that can be loaded into a
6913 register by a MOVZ instruction. */
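/* For example (illustrative): 0x1234 and 0x12340000 are single-MOVZ
   immediates, whereas 0x12345678 is not.  */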
6914 static bool
6915 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6916 {
6917 if (GET_MODE_SIZE (mode) > 4)
6918 {
6919 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6920 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6921 return true;
6922 }
6923 else
6924 {
6925 /* Ignore sign extension. */
6926 val &= (HOST_WIDE_INT) 0xffffffff;
6927 }
6928 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6929 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6930 }
6931
6932 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6933 64-bit (DImode) integer. */
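/* For instance (illustrative), replicating the QImode value 0xab yields
   0xabababababababab.  */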
6934
6935 static unsigned HOST_WIDE_INT
6936 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6937 {
6938 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6939 while (size < 64)
6940 {
6941 val &= (HOST_WIDE_INT_1U << size) - 1;
6942 val |= val << size;
6943 size *= 2;
6944 }
6945 return val;
6946 }
6947
6948 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6949
6950 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6951 {
6952 0x0000000100000001ull,
6953 0x0001000100010001ull,
6954 0x0101010101010101ull,
6955 0x1111111111111111ull,
6956 0x5555555555555555ull,
6957 };
6958
6959
6960 /* Return true if val is a valid bitmask immediate. */
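/* A bitmask immediate is a (possibly rotated) run of consecutive ones,
   replicated at a power-of-two element size of 2, 4, 8, 16, 32 or 64 bits,
   excluding all-zeros and all-ones.  Illustrative examples:
   0x5555555555555555 and 0x00ff00ff00ff00ff are valid; 0, ~0 and
   0x0123456789abcdef are not.  */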
6961
6962 bool
6963 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6964 {
6965 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6966 int bits;
6967
6968 /* Check for a single sequence of one bits and return quickly if so.
6969 The special cases of all ones and all zeroes return false. */
6970 val = aarch64_replicate_bitmask_imm (val_in, mode);
6971 tmp = val + (val & -val);
6972
6973 if (tmp == (tmp & -tmp))
6974 return (val + 1) > 1;
6975
6976 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6977 if (mode == SImode)
6978 val = (val << 32) | (val & 0xffffffff);
6979
6980 /* Invert if the immediate doesn't start with a zero bit - this means we
6981 only need to search for sequences of one bits. */
6982 if (val & 1)
6983 val = ~val;
6984
6985 /* Find the first set bit and set tmp to val with the first sequence of one
6986 bits removed. Return success if there is a single sequence of ones. */
6987 first_one = val & -val;
6988 tmp = val & (val + first_one);
6989
6990 if (tmp == 0)
6991 return true;
6992
6993 /* Find the next set bit and compute the difference in bit position. */
6994 next_one = tmp & -tmp;
6995 bits = clz_hwi (first_one) - clz_hwi (next_one);
6996 mask = val ^ tmp;
6997
6998 /* Check the bit position difference is a power of 2, and that the first
6999 sequence of one bits fits within 'bits' bits. */
7000 if ((mask >> bits) != 0 || bits != (bits & -bits))
7001 return false;
7002
7003 /* Check the sequence of one bits is repeated 64/bits times. */
7004 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7005 }
7006
7007 /* Create a mask of ones covering the range from the lowest set bit to the
7008 highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
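/* For example (illustrative): 0x0ff0 is returned unchanged, and 0x0110
   yields 0x01f0 (all bits from bit 4 to bit 8 inclusive).  */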
7009
7010 unsigned HOST_WIDE_INT
7011 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7012 {
7013 int lowest_bit_set = ctz_hwi (val_in);
7014 int highest_bit_set = floor_log2 (val_in);
7015 gcc_assert (val_in != 0);
7016
7017 return ((HOST_WIDE_INT_UC (2) << highest_bit_set)
7018 - (HOST_WIDE_INT_1U << lowest_bit_set));
7019 }
7020
7021 /* Create a constant in which all bits outside the range from the lowest set
7022 bit to the highest set bit of VAL_IN are set to 1. */
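/* Continuing the example above (illustrative): for 0x0110 the result is
   0xffffffffffffff1f, i.e. only the zero bits that lie strictly inside the
   run covered by aarch64_and_split_imm1 remain clear.  */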
7023
7024 unsigned HOST_WIDE_INT
7025 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7026 {
7027 return val_in | ~aarch64_and_split_imm1 (val_in);
7028 }
7029
7030 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7031
7032 bool
7033 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7034 {
7035 scalar_int_mode int_mode;
7036 if (!is_a <scalar_int_mode> (mode, &int_mode))
7037 return false;
7038
7039 if (aarch64_bitmask_imm (val_in, int_mode))
7040 return false;
7041
7042 if (aarch64_move_imm (val_in, int_mode))
7043 return false;
7044
7045 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
7046
7047 return aarch64_bitmask_imm (imm2, int_mode);
7048 }
7049
7050 /* Return true if val is an immediate that can be loaded into a
7051 register in a single instruction. */
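/* For example (illustrative): 0x12340000 (MOVZ), 0xffffffffffffedcb (MOVN)
   and 0x5555555555555555 (bitmask ORR) are all single-instruction moves;
   0x123456789abcdef0 is not.  */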
7052 bool
7053 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
7054 {
7055 scalar_int_mode int_mode;
7056 if (!is_a <scalar_int_mode> (mode, &int_mode))
7057 return false;
7058
7059 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
7060 return true;
7061 return aarch64_bitmask_imm (val, int_mode);
7062 }
7063
7064 static bool
7065 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
7066 {
7067 rtx base, offset;
7068
7069 if (GET_CODE (x) == HIGH)
7070 return true;
7071
7072 /* There's no way to calculate VL-based values using relocations. */
7073 subrtx_iterator::array_type array;
7074 FOR_EACH_SUBRTX (iter, array, x, ALL)
7075 if (GET_CODE (*iter) == CONST_POLY_INT)
7076 return true;
7077
7078 split_const (x, &base, &offset);
7079 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
7080 {
7081 if (aarch64_classify_symbol (base, INTVAL (offset))
7082 != SYMBOL_FORCE_TO_MEM)
7083 return true;
7084 else
7085 /* Avoid generating a 64-bit relocation in ILP32; leave
7086 to aarch64_expand_mov_immediate to handle it properly. */
7087 return mode != ptr_mode;
7088 }
7089
7090 return aarch64_tls_referenced_p (x);
7091 }
7092
7093 /* Implement TARGET_CASE_VALUES_THRESHOLD.
7094 The expansion of a table switch is quite expensive due to the number
7095 of instructions, the table lookup and the hard-to-predict indirect jump.
7096 When optimizing for speed at -O3 and above, use the per-core tuning if
7097 set; otherwise use tables for more than 16 cases as a trade-off between
7098 size and performance. When optimizing for size, use the default setting. */
7099
7100 static unsigned int
7101 aarch64_case_values_threshold (void)
7102 {
7103 /* Use the specified limit for the number of cases before using jump
7104 tables at higher optimization levels. */
7105 if (optimize > 2
7106 && selected_cpu->tune->max_case_values != 0)
7107 return selected_cpu->tune->max_case_values;
7108 else
7109 return optimize_size ? default_case_values_threshold () : 17;
7110 }
7111
7112 /* Return true if register REGNO is a valid index register.
7113 STRICT_P is true if REG_OK_STRICT is in effect. */
7114
7115 bool
7116 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
7117 {
7118 if (!HARD_REGISTER_NUM_P (regno))
7119 {
7120 if (!strict_p)
7121 return true;
7122
7123 if (!reg_renumber)
7124 return false;
7125
7126 regno = reg_renumber[regno];
7127 }
7128 return GP_REGNUM_P (regno);
7129 }
7130
7131 /* Return true if register REGNO is a valid base register.
7132 STRICT_P is true if REG_OK_STRICT is in effect. */
7133
7134 bool
7135 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
7136 {
7137 if (!HARD_REGISTER_NUM_P (regno))
7138 {
7139 if (!strict_p)
7140 return true;
7141
7142 if (!reg_renumber)
7143 return false;
7144
7145 regno = reg_renumber[regno];
7146 }
7147
7148 /* The fake registers will be eliminated to either the stack or
7149 hard frame pointer, both of which are usually valid base registers.
7150 Reload deals with the cases where the eliminated form isn't valid. */
7151 return (GP_REGNUM_P (regno)
7152 || regno == SP_REGNUM
7153 || regno == FRAME_POINTER_REGNUM
7154 || regno == ARG_POINTER_REGNUM);
7155 }
7156
7157 /* Return true if X is a valid base register.
7158 STRICT_P is true if REG_OK_STRICT is in effect. */
7159
7160 static bool
7161 aarch64_base_register_rtx_p (rtx x, bool strict_p)
7162 {
7163 if (!strict_p
7164 && GET_CODE (x) == SUBREG
7165 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
7166 x = SUBREG_REG (x);
7167
7168 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
7169 }
7170
7171 /* Return true if address offset is a valid index. If it is, fill in INFO
7172 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
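/* For example (an informal illustration), an index of the form
   (ashift:DI (sign_extend:DI (reg:SI w1)) (const_int 2)) is classified as
   ADDRESS_REG_SXTW with shift 2, corresponding to an address like
   [x0, w1, sxtw #2] once combined with a base register.  */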
7173
7174 static bool
7175 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
7176 machine_mode mode, bool strict_p)
7177 {
7178 enum aarch64_address_type type;
7179 rtx index;
7180 int shift;
7181
7182 /* (reg:P) */
7183 if ((REG_P (x) || GET_CODE (x) == SUBREG)
7184 && GET_MODE (x) == Pmode)
7185 {
7186 type = ADDRESS_REG_REG;
7187 index = x;
7188 shift = 0;
7189 }
7190 /* (sign_extend:DI (reg:SI)) */
7191 else if ((GET_CODE (x) == SIGN_EXTEND
7192 || GET_CODE (x) == ZERO_EXTEND)
7193 && GET_MODE (x) == DImode
7194 && GET_MODE (XEXP (x, 0)) == SImode)
7195 {
7196 type = (GET_CODE (x) == SIGN_EXTEND)
7197 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7198 index = XEXP (x, 0);
7199 shift = 0;
7200 }
7201 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
7202 else if (GET_CODE (x) == MULT
7203 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7204 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7205 && GET_MODE (XEXP (x, 0)) == DImode
7206 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7207 && CONST_INT_P (XEXP (x, 1)))
7208 {
7209 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7210 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7211 index = XEXP (XEXP (x, 0), 0);
7212 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7213 }
7214 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
7215 else if (GET_CODE (x) == ASHIFT
7216 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
7217 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
7218 && GET_MODE (XEXP (x, 0)) == DImode
7219 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
7220 && CONST_INT_P (XEXP (x, 1)))
7221 {
7222 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
7223 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7224 index = XEXP (XEXP (x, 0), 0);
7225 shift = INTVAL (XEXP (x, 1));
7226 }
7227 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
7228 else if ((GET_CODE (x) == SIGN_EXTRACT
7229 || GET_CODE (x) == ZERO_EXTRACT)
7230 && GET_MODE (x) == DImode
7231 && GET_CODE (XEXP (x, 0)) == MULT
7232 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7233 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7234 {
7235 type = (GET_CODE (x) == SIGN_EXTRACT)
7236 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7237 index = XEXP (XEXP (x, 0), 0);
7238 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7239 if (INTVAL (XEXP (x, 1)) != 32 + shift
7240 || INTVAL (XEXP (x, 2)) != 0)
7241 shift = -1;
7242 }
7243 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7244 (const_int 0xffffffff<<shift)) */
7245 else if (GET_CODE (x) == AND
7246 && GET_MODE (x) == DImode
7247 && GET_CODE (XEXP (x, 0)) == MULT
7248 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7249 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7250 && CONST_INT_P (XEXP (x, 1)))
7251 {
7252 type = ADDRESS_REG_UXTW;
7253 index = XEXP (XEXP (x, 0), 0);
7254 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7255 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7256 shift = -1;
7257 }
7258 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7259 else if ((GET_CODE (x) == SIGN_EXTRACT
7260 || GET_CODE (x) == ZERO_EXTRACT)
7261 && GET_MODE (x) == DImode
7262 && GET_CODE (XEXP (x, 0)) == ASHIFT
7263 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7264 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7265 {
7266 type = (GET_CODE (x) == SIGN_EXTRACT)
7267 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7268 index = XEXP (XEXP (x, 0), 0);
7269 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7270 if (INTVAL (XEXP (x, 1)) != 32 + shift
7271 || INTVAL (XEXP (x, 2)) != 0)
7272 shift = -1;
7273 }
7274 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7275 (const_int 0xffffffff<<shift)) */
7276 else if (GET_CODE (x) == AND
7277 && GET_MODE (x) == DImode
7278 && GET_CODE (XEXP (x, 0)) == ASHIFT
7279 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7280 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7281 && CONST_INT_P (XEXP (x, 1)))
7282 {
7283 type = ADDRESS_REG_UXTW;
7284 index = XEXP (XEXP (x, 0), 0);
7285 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7286 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7287 shift = -1;
7288 }
7289 /* (mult:P (reg:P) (const_int scale)) */
7290 else if (GET_CODE (x) == MULT
7291 && GET_MODE (x) == Pmode
7292 && GET_MODE (XEXP (x, 0)) == Pmode
7293 && CONST_INT_P (XEXP (x, 1)))
7294 {
7295 type = ADDRESS_REG_REG;
7296 index = XEXP (x, 0);
7297 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7298 }
7299 /* (ashift:P (reg:P) (const_int shift)) */
7300 else if (GET_CODE (x) == ASHIFT
7301 && GET_MODE (x) == Pmode
7302 && GET_MODE (XEXP (x, 0)) == Pmode
7303 && CONST_INT_P (XEXP (x, 1)))
7304 {
7305 type = ADDRESS_REG_REG;
7306 index = XEXP (x, 0);
7307 shift = INTVAL (XEXP (x, 1));
7308 }
7309 else
7310 return false;
7311
7312 if (!strict_p
7313 && GET_CODE (index) == SUBREG
7314 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7315 index = SUBREG_REG (index);
7316
7317 if (aarch64_sve_data_mode_p (mode))
7318 {
7319 if (type != ADDRESS_REG_REG
7320 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7321 return false;
7322 }
7323 else
7324 {
7325 if (shift != 0
7326 && !(IN_RANGE (shift, 1, 3)
7327 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7328 return false;
7329 }
7330
7331 if (REG_P (index)
7332 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7333 {
7334 info->type = type;
7335 info->offset = index;
7336 info->shift = shift;
7337 return true;
7338 }
7339
7340 return false;
7341 }
7342
7343 /* Return true if MODE is one of the modes for which we
7344 support LDP/STP operations. */
7345
7346 static bool
7347 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7348 {
7349 return mode == SImode || mode == DImode
7350 || mode == SFmode || mode == DFmode
7351 || (aarch64_vector_mode_supported_p (mode)
7352 && (known_eq (GET_MODE_SIZE (mode), 8)
7353 || (known_eq (GET_MODE_SIZE (mode), 16)
7354 && (aarch64_tune_params.extra_tuning_flags
7355 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7356 }
7357
7358 /* Return true if REGNO is a virtual pointer register, or an eliminable
7359 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7360 include stack_pointer or hard_frame_pointer. */
7361 static bool
7362 virt_or_elim_regno_p (unsigned regno)
7363 {
7364 return ((regno >= FIRST_VIRTUAL_REGISTER
7365 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7366 || regno == FRAME_POINTER_REGNUM
7367 || regno == ARG_POINTER_REGNUM);
7368 }
7369
7370 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7371 If it is, fill in INFO appropriately. STRICT_P is true if
7372 REG_OK_STRICT is in effect. */
7373
7374 bool
7375 aarch64_classify_address (struct aarch64_address_info *info,
7376 rtx x, machine_mode mode, bool strict_p,
7377 aarch64_addr_query_type type)
7378 {
7379 enum rtx_code code = GET_CODE (x);
7380 rtx op0, op1;
7381 poly_int64 offset;
7382
7383 HOST_WIDE_INT const_size;
7384
7385 /* On BE, we use load/store pair for all large int mode load/stores.
7386 TI/TFmode may also use a load/store pair. */
7387 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7388 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7389 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7390 || type == ADDR_QUERY_LDP_STP_N
7391 || mode == TImode
7392 || mode == TFmode
7393 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7394
7395 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the full size
7396 of the memory access, while the mode used for the address calculation is
7397 half of that. */
7398 if (type == ADDR_QUERY_LDP_STP_N
7399 && known_eq (GET_MODE_SIZE (mode), 16))
7400 mode = DFmode;
7401
7402 bool allow_reg_index_p = (!load_store_pair_p
7403 && (known_lt (GET_MODE_SIZE (mode), 16)
7404 || vec_flags == VEC_ADVSIMD
7405 || vec_flags & VEC_SVE_DATA));
7406
7407 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7408 [Rn, #offset, MUL VL]. */
7409 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7410 && (code != REG && code != PLUS))
7411 return false;
7412
7413 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7414 REG addressing. */
7415 if (advsimd_struct_p
7416 && !BYTES_BIG_ENDIAN
7417 && (code != POST_INC && code != REG))
7418 return false;
7419
7420 gcc_checking_assert (GET_MODE (x) == VOIDmode
7421 || SCALAR_INT_MODE_P (GET_MODE (x)));
7422
7423 switch (code)
7424 {
7425 case REG:
7426 case SUBREG:
7427 info->type = ADDRESS_REG_IMM;
7428 info->base = x;
7429 info->offset = const0_rtx;
7430 info->const_offset = 0;
7431 return aarch64_base_register_rtx_p (x, strict_p);
7432
7433 case PLUS:
7434 op0 = XEXP (x, 0);
7435 op1 = XEXP (x, 1);
7436
7437 if (! strict_p
7438 && REG_P (op0)
7439 && virt_or_elim_regno_p (REGNO (op0))
7440 && poly_int_rtx_p (op1, &offset))
7441 {
7442 info->type = ADDRESS_REG_IMM;
7443 info->base = op0;
7444 info->offset = op1;
7445 info->const_offset = offset;
7446
7447 return true;
7448 }
7449
7450 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7451 && aarch64_base_register_rtx_p (op0, strict_p)
7452 && poly_int_rtx_p (op1, &offset))
7453 {
7454 info->type = ADDRESS_REG_IMM;
7455 info->base = op0;
7456 info->offset = op1;
7457 info->const_offset = offset;
7458
7459 /* TImode and TFmode values are allowed in both pairs of X
7460 registers and individual Q registers. The available
7461 address modes are:
7462 X,X: 7-bit signed scaled offset
7463 Q: 9-bit signed offset
7464 We conservatively require an offset representable in either mode.
7465 When performing the check for pairs of X registers i.e. LDP/STP
7466 pass down DImode since that is the natural size of the LDP/STP
7467 instruction memory accesses. */
7468 if (mode == TImode || mode == TFmode)
7469 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7470 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7471 || offset_12bit_unsigned_scaled_p (mode, offset)));
7472
7473 /* A 7-bit offset check because OImode will emit an ldp/stp
7474 instruction (only big-endian will get here).
7475 For ldp/stp instructions, the offset is scaled by the size of a
7476 single element of the pair. */
7477 if (mode == OImode)
7478 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7479
7480 /* Three 9/12-bit offset checks because CImode will emit three
7481 ldr/str instructions (only big-endian will get here). */
7482 if (mode == CImode)
7483 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7484 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7485 offset + 32)
7486 || offset_12bit_unsigned_scaled_p (V16QImode,
7487 offset + 32)));
7488
7489 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7490 instructions (only big-endian will get here). */
7491 if (mode == XImode)
7492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7493 && aarch64_offset_7bit_signed_scaled_p (TImode,
7494 offset + 32));
7495
7496 /* Make "m" use the LD1 offset range for SVE data modes, so
7497 that pre-RTL optimizers like ivopts will work to that
7498 instead of the wider LDR/STR range. */
7499 if (vec_flags == VEC_SVE_DATA)
7500 return (type == ADDR_QUERY_M
7501 ? offset_4bit_signed_scaled_p (mode, offset)
7502 : offset_9bit_signed_scaled_p (mode, offset));
7503
7504 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7505 {
7506 poly_int64 end_offset = (offset
7507 + GET_MODE_SIZE (mode)
7508 - BYTES_PER_SVE_VECTOR);
7509 return (type == ADDR_QUERY_M
7510 ? offset_4bit_signed_scaled_p (mode, offset)
7511 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7512 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7513 end_offset)));
7514 }
7515
7516 if (vec_flags == VEC_SVE_PRED)
7517 return offset_9bit_signed_scaled_p (mode, offset);
7518
7519 if (load_store_pair_p)
7520 return ((known_eq (GET_MODE_SIZE (mode), 4)
7521 || known_eq (GET_MODE_SIZE (mode), 8)
7522 || known_eq (GET_MODE_SIZE (mode), 16))
7523 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7524 else
7525 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7526 || offset_12bit_unsigned_scaled_p (mode, offset));
7527 }
7528
7529 if (allow_reg_index_p)
7530 {
7531 /* Look for base + (scaled/extended) index register. */
7532 if (aarch64_base_register_rtx_p (op0, strict_p)
7533 && aarch64_classify_index (info, op1, mode, strict_p))
7534 {
7535 info->base = op0;
7536 return true;
7537 }
7538 if (aarch64_base_register_rtx_p (op1, strict_p)
7539 && aarch64_classify_index (info, op0, mode, strict_p))
7540 {
7541 info->base = op1;
7542 return true;
7543 }
7544 }
7545
7546 return false;
7547
7548 case POST_INC:
7549 case POST_DEC:
7550 case PRE_INC:
7551 case PRE_DEC:
7552 info->type = ADDRESS_REG_WB;
7553 info->base = XEXP (x, 0);
7554 info->offset = NULL_RTX;
7555 return aarch64_base_register_rtx_p (info->base, strict_p);
7556
7557 case POST_MODIFY:
7558 case PRE_MODIFY:
7559 info->type = ADDRESS_REG_WB;
7560 info->base = XEXP (x, 0);
7561 if (GET_CODE (XEXP (x, 1)) == PLUS
7562 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7563 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7564 && aarch64_base_register_rtx_p (info->base, strict_p))
7565 {
7566 info->offset = XEXP (XEXP (x, 1), 1);
7567 info->const_offset = offset;
7568
7569 /* TImode and TFmode values are allowed in both pairs of X
7570 registers and individual Q registers. The available
7571 address modes are:
7572 X,X: 7-bit signed scaled offset
7573 Q: 9-bit signed offset
7574 We conservatively require an offset representable in either mode.
7575 */
7576 if (mode == TImode || mode == TFmode)
7577 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7578 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7579
7580 if (load_store_pair_p)
7581 return ((known_eq (GET_MODE_SIZE (mode), 4)
7582 || known_eq (GET_MODE_SIZE (mode), 8)
7583 || known_eq (GET_MODE_SIZE (mode), 16))
7584 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7585 else
7586 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7587 }
7588 return false;
7589
7590 case CONST:
7591 case SYMBOL_REF:
7592 case LABEL_REF:
7593 /* Load literal: PC-relative constant pool entry. Only supported
7594 for SImode or larger. */
7595 info->type = ADDRESS_SYMBOLIC;
7596
7597 if (!load_store_pair_p
7598 && GET_MODE_SIZE (mode).is_constant (&const_size)
7599 && const_size >= 4)
7600 {
7601 rtx sym, addend;
7602
7603 split_const (x, &sym, &addend);
7604 return ((GET_CODE (sym) == LABEL_REF
7605 || (GET_CODE (sym) == SYMBOL_REF
7606 && CONSTANT_POOL_ADDRESS_P (sym)
7607 && aarch64_pcrelative_literal_loads)));
7608 }
7609 return false;
7610
7611 case LO_SUM:
7612 info->type = ADDRESS_LO_SUM;
7613 info->base = XEXP (x, 0);
7614 info->offset = XEXP (x, 1);
7615 if (allow_reg_index_p
7616 && aarch64_base_register_rtx_p (info->base, strict_p))
7617 {
7618 rtx sym, offs;
7619 split_const (info->offset, &sym, &offs);
7620 if (GET_CODE (sym) == SYMBOL_REF
7621 && (aarch64_classify_symbol (sym, INTVAL (offs))
7622 == SYMBOL_SMALL_ABSOLUTE))
7623 {
7624 /* The symbol and offset must be aligned to the access size. */
7625 unsigned int align;
7626
7627 if (CONSTANT_POOL_ADDRESS_P (sym))
7628 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7629 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7630 {
7631 tree exp = SYMBOL_REF_DECL (sym);
7632 align = TYPE_ALIGN (TREE_TYPE (exp));
7633 align = aarch64_constant_alignment (exp, align);
7634 }
7635 else if (SYMBOL_REF_DECL (sym))
7636 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7637 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7638 && SYMBOL_REF_BLOCK (sym) != NULL)
7639 align = SYMBOL_REF_BLOCK (sym)->alignment;
7640 else
7641 align = BITS_PER_UNIT;
7642
7643 poly_int64 ref_size = GET_MODE_SIZE (mode);
7644 if (known_eq (ref_size, 0))
7645 ref_size = GET_MODE_SIZE (DImode);
7646
7647 return (multiple_p (INTVAL (offs), ref_size)
7648 && multiple_p (align / BITS_PER_UNIT, ref_size));
7649 }
7650 }
7651 return false;
7652
7653 default:
7654 return false;
7655 }
7656 }
7657
7658 /* Return true if the address X is valid for a PRFM instruction.
7659 STRICT_P is true if we should do strict checking with
7660 aarch64_classify_address. */
7661
7662 bool
7663 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7664 {
7665 struct aarch64_address_info addr;
7666
7667 /* PRFM accepts the same addresses as DImode... */
7668 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7669 if (!res)
7670 return false;
7671
7672 /* ... except writeback forms. */
7673 return addr.type != ADDRESS_REG_WB;
7674 }
7675
7676 bool
7677 aarch64_symbolic_address_p (rtx x)
7678 {
7679 rtx offset;
7680
7681 split_const (x, &x, &offset);
7682 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7683 }
7684
7685 /* Classify the base of symbolic expression X. */
7686
7687 enum aarch64_symbol_type
7688 aarch64_classify_symbolic_expression (rtx x)
7689 {
7690 rtx offset;
7691
7692 split_const (x, &x, &offset);
7693 return aarch64_classify_symbol (x, INTVAL (offset));
7694 }
7695
7696
7697 /* Return TRUE if X is a legitimate address for accessing memory in
7698 mode MODE. */
7699 static bool
7700 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7701 {
7702 struct aarch64_address_info addr;
7703
7704 return aarch64_classify_address (&addr, x, mode, strict_p);
7705 }
7706
7707 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7708 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7709 bool
7710 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7711 aarch64_addr_query_type type)
7712 {
7713 struct aarch64_address_info addr;
7714
7715 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7716 }
7717
7718 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7719
7720 static bool
7721 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7722 poly_int64 orig_offset,
7723 machine_mode mode)
7724 {
7725 HOST_WIDE_INT size;
7726 if (GET_MODE_SIZE (mode).is_constant (&size))
7727 {
7728 HOST_WIDE_INT const_offset, second_offset;
7729
7730 /* A general SVE offset is A * VQ + B. Remove the A component from
7731 coefficient 0 in order to get the constant B. */
7732 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7733
7734 /* Split an out-of-range address displacement into a base and
7735 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7736 range otherwise to increase opportunities for sharing the base
7737 address of different sizes. Unaligned accesses use the signed
7738 9-bit range, TImode/TFmode use the intersection of signed
7739 scaled 7-bit and signed 9-bit offset. */
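/* Worked example (illustrative): for a 4-byte access at constant offset
   0x12344, second_offset becomes 0x12344 & 0x3ffc == 0x2344, so the
   address is rebased to base + 0x10000 with a residual offset of 0x2344.  */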
7740 if (mode == TImode || mode == TFmode)
7741 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7742 else if ((const_offset & (size - 1)) != 0)
7743 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7744 else
7745 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7746
7747 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7748 return false;
7749
7750 /* Split the offset into second_offset and the rest. */
7751 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7752 *offset2 = gen_int_mode (second_offset, Pmode);
7753 return true;
7754 }
7755 else
7756 {
7757 /* Get the mode we should use as the basis of the range. For structure
7758 modes this is the mode of one vector. */
7759 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7760 machine_mode step_mode
7761 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7762
7763 /* Get the "mul vl" multiplier we'd like to use. */
7764 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7765 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7766 if (vec_flags & VEC_SVE_DATA)
7767 /* LDR supports a 9-bit range, but the move patterns for
7768 structure modes require all vectors to be in range of the
7769 same base. The simplest way of accommodating that while still
7770 promoting reuse of anchor points between different modes is
7771 to use an 8-bit range unconditionally. */
7772 vnum = ((vnum + 128) & 255) - 128;
7773 else
7774 /* Predicates are only handled singly, so we might as well use
7775 the full range. */
7776 vnum = ((vnum + 256) & 511) - 256;
7777 if (vnum == 0)
7778 return false;
7779
7780 /* Convert the "mul vl" multiplier into a byte offset. */
7781 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7782 if (known_eq (second_offset, orig_offset))
7783 return false;
7784
7785 /* Split the offset into second_offset and the rest. */
7786 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7787 *offset2 = gen_int_mode (second_offset, Pmode);
7788 return true;
7789 }
7790 }
7791
7792 /* Return the binary representation of floating point constant VALUE in INTVAL.
7793 If the value cannot be converted, return false without setting INTVAL.
7794 The conversion is done in the mode of VALUE. */
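/* For example (illustrative): the DFmode constant 1.0 yields
   0x3ff0000000000000 and the SFmode constant 1.0 yields 0x3f800000.  */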
7795 bool
7796 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7797 {
7798
7799 /* We make a general exception for 0. */
7800 if (aarch64_float_const_zero_rtx_p (value))
7801 {
7802 *intval = 0;
7803 return true;
7804 }
7805
7806 scalar_float_mode mode;
7807 if (GET_CODE (value) != CONST_DOUBLE
7808 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7809 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7810 /* Only support up to DF mode. */
7811 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7812 return false;
7813
7814 unsigned HOST_WIDE_INT ival = 0;
7815
7816 long res[2];
7817 real_to_target (res,
7818 CONST_DOUBLE_REAL_VALUE (value),
7819 REAL_MODE_FORMAT (mode));
7820
7821 if (mode == DFmode)
7822 {
7823 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7824 ival = zext_hwi (res[order], 32);
7825 ival |= (zext_hwi (res[1 - order], 32) << 32);
7826 }
7827 else
7828 ival = zext_hwi (res[0], 32);
7829
7830 *intval = ival;
7831 return true;
7832 }
7833
7834 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7835 single MOV(+MOVK) followed by an FMOV. */
7836 bool
7837 aarch64_float_const_rtx_p (rtx x)
7838 {
7839 machine_mode mode = GET_MODE (x);
7840 if (mode == VOIDmode)
7841 return false;
7842
7843 /* Determine whether it's cheaper to write float constants as
7844 mov/movk pairs over ldr/adrp pairs. */
7845 unsigned HOST_WIDE_INT ival;
7846
7847 if (GET_CODE (x) == CONST_DOUBLE
7848 && SCALAR_FLOAT_MODE_P (mode)
7849 && aarch64_reinterpret_float_as_int (x, &ival))
7850 {
7851 scalar_int_mode imode = (mode == HFmode
7852 ? SImode
7853 : int_mode_for_mode (mode).require ());
7854 int num_instr = aarch64_internal_mov_immediate
7855 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7856 return num_instr < 3;
7857 }
7858
7859 return false;
7860 }
7861
7862 /* Return TRUE if rtx X is the immediate constant 0.0. */
7863 bool
7864 aarch64_float_const_zero_rtx_p (rtx x)
7865 {
7866 if (GET_MODE (x) == VOIDmode)
7867 return false;
7868
7869 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7870 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7871 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7872 }
7873
7874 /* Return TRUE if rtx X is an immediate constant that fits in a single
7875 MOVI operation. */
7876 bool
7877 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7878 {
7879 if (!TARGET_SIMD)
7880 return false;
7881
7882 machine_mode vmode;
7883 scalar_int_mode imode;
7884 unsigned HOST_WIDE_INT ival;
7885
7886 if (GET_CODE (x) == CONST_DOUBLE
7887 && SCALAR_FLOAT_MODE_P (mode))
7888 {
7889 if (!aarch64_reinterpret_float_as_int (x, &ival))
7890 return false;
7891
7892 /* We make a general exception for 0. */
7893 if (aarch64_float_const_zero_rtx_p (x))
7894 return true;
7895
7896 imode = int_mode_for_mode (mode).require ();
7897 }
7898 else if (GET_CODE (x) == CONST_INT
7899 && is_a <scalar_int_mode> (mode, &imode))
7900 ival = INTVAL (x);
7901 else
7902 return false;
7903
7904 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we
7905 use a 128-bit vector mode. */
7906 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7907
7908 vmode = aarch64_simd_container_mode (imode, width);
7909 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7910
7911 return aarch64_simd_valid_immediate (v_op, NULL);
7912 }
7913
7914
7915 /* Return the fixed registers used for condition codes. */
7916
7917 static bool
7918 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7919 {
7920 *p1 = CC_REGNUM;
7921 *p2 = INVALID_REGNUM;
7922 return true;
7923 }
7924
7925 /* This function is used by the call expanders of the machine description.
7926 RESULT is the register in which the result is returned. It's NULL for
7927 "call" and "sibcall".
7928 MEM is the location of the function call.
7929 SIBCALL indicates whether this function call is a normal call or a sibling
7930 call; a different pattern is generated accordingly. */
7931
7932 void
7933 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7934 {
7935 rtx call, callee, tmp;
7936 rtvec vec;
7937 machine_mode mode;
7938
7939 gcc_assert (MEM_P (mem));
7940 callee = XEXP (mem, 0);
7941 mode = GET_MODE (callee);
7942 gcc_assert (mode == Pmode);
7943
7944 /* Decide if we should generate indirect calls by loading the
7945 address of the callee into a register before performing
7946 the branch-and-link. */
7947 if (SYMBOL_REF_P (callee)
7948 ? (aarch64_is_long_call_p (callee)
7949 || aarch64_is_noplt_call_p (callee))
7950 : !REG_P (callee))
7951 XEXP (mem, 0) = force_reg (mode, callee);
7952
7953 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7954
7955 if (result != NULL_RTX)
7956 call = gen_rtx_SET (result, call);
7957
7958 if (sibcall)
7959 tmp = ret_rtx;
7960 else
7961 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7962
7963 vec = gen_rtvec (2, call, tmp);
7964 call = gen_rtx_PARALLEL (VOIDmode, vec);
7965
7966 aarch64_emit_call_insn (call);
7967 }
7968
7969 /* Emit call insn with PAT and do aarch64-specific handling. */
7970
7971 void
7972 aarch64_emit_call_insn (rtx pat)
7973 {
7974 rtx insn = emit_call_insn (pat);
7975
7976 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7977 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7978 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7979 }
7980
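/* Added summary comment (informal): choose the CC mode for comparing X with Y
   under operator CODE.  Floating-point compares use CCFP/CCFPE; integer
   compares select a specialized mode (CC_NZ, CC_SWP, CC_Z, CC_C, CC_ADC,
   CC_V) when the comparison is implemented by an instruction that sets only
   a subset of the flags or requires swapped operands, and fall back to the
   general CCmode otherwise.  */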
7981 machine_mode
7982 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7983 {
7984 machine_mode mode_x = GET_MODE (x);
7985 rtx_code code_x = GET_CODE (x);
7986
7987 /* All floating point compares return CCFP if it is an equality
7988 comparison, and CCFPE otherwise. */
7989 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7990 {
7991 switch (code)
7992 {
7993 case EQ:
7994 case NE:
7995 case UNORDERED:
7996 case ORDERED:
7997 case UNLT:
7998 case UNLE:
7999 case UNGT:
8000 case UNGE:
8001 case UNEQ:
8002 return CCFPmode;
8003
8004 case LT:
8005 case LE:
8006 case GT:
8007 case GE:
8008 case LTGT:
8009 return CCFPEmode;
8010
8011 default:
8012 gcc_unreachable ();
8013 }
8014 }
8015
8016 /* Equality comparisons of short modes against zero can be performed
8017 using the TST instruction with the appropriate bitmask. */
8018 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8019 && (code == EQ || code == NE)
8020 && (mode_x == HImode || mode_x == QImode))
8021 return CC_NZmode;
8022
8023 /* Similarly, comparisons of zero_extends from shorter modes can
8024 be performed using an ANDS with an immediate mask. */
8025 if (y == const0_rtx && code_x == ZERO_EXTEND
8026 && (mode_x == SImode || mode_x == DImode)
8027 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
8028 && (code == EQ || code == NE))
8029 return CC_NZmode;
8030
8031 if ((mode_x == SImode || mode_x == DImode)
8032 && y == const0_rtx
8033 && (code == EQ || code == NE || code == LT || code == GE)
8034 && (code_x == PLUS || code_x == MINUS || code_x == AND
8035 || code_x == NEG
8036 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
8037 && CONST_INT_P (XEXP (x, 2)))))
8038 return CC_NZmode;
8039
8040 /* A compare with a shifted operand. Because of canonicalization,
8041 the comparison will have to be swapped when we emit the assembly
8042 code. */
8043 if ((mode_x == SImode || mode_x == DImode)
8044 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
8045 && (code_x == ASHIFT || code_x == ASHIFTRT
8046 || code_x == LSHIFTRT
8047 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
8048 return CC_SWPmode;
8049
8050 /* Similarly for a negated operand, but we can only do this for
8051 equalities. */
8052 if ((mode_x == SImode || mode_x == DImode)
8053 && (REG_P (y) || GET_CODE (y) == SUBREG)
8054 && (code == EQ || code == NE)
8055 && code_x == NEG)
8056 return CC_Zmode;
8057
8058 /* A test for unsigned overflow from an addition. */
8059 if ((mode_x == DImode || mode_x == TImode)
8060 && (code == LTU || code == GEU)
8061 && code_x == PLUS
8062 && rtx_equal_p (XEXP (x, 0), y))
8063 return CC_Cmode;
8064
8065 /* A test for unsigned overflow from an add with carry. */
8066 if ((mode_x == DImode || mode_x == TImode)
8067 && (code == LTU || code == GEU)
8068 && code_x == PLUS
8069 && CONST_SCALAR_INT_P (y)
8070 && (rtx_mode_t (y, mode_x)
8071 == (wi::shwi (1, mode_x)
8072 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
8073 return CC_ADCmode;
8074
8075 /* A test for signed overflow. */
8076 if ((mode_x == DImode || mode_x == TImode)
8077 && code == NE
8078 && code_x == PLUS
8079 && GET_CODE (y) == SIGN_EXTEND)
8080 return CC_Vmode;
8081
8082 /* For everything else, return CCmode. */
8083 return CCmode;
8084 }
8085
8086 static int
8087 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
8088
8089 int
8090 aarch64_get_condition_code (rtx x)
8091 {
8092 machine_mode mode = GET_MODE (XEXP (x, 0));
8093 enum rtx_code comp_code = GET_CODE (x);
8094
8095 if (GET_MODE_CLASS (mode) != MODE_CC)
8096 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
8097 return aarch64_get_condition_code_1 (mode, comp_code);
8098 }
8099
8100 static int
8101 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
8102 {
8103 switch (mode)
8104 {
8105 case E_CCFPmode:
8106 case E_CCFPEmode:
8107 switch (comp_code)
8108 {
8109 case GE: return AARCH64_GE;
8110 case GT: return AARCH64_GT;
8111 case LE: return AARCH64_LS;
8112 case LT: return AARCH64_MI;
8113 case NE: return AARCH64_NE;
8114 case EQ: return AARCH64_EQ;
8115 case ORDERED: return AARCH64_VC;
8116 case UNORDERED: return AARCH64_VS;
8117 case UNLT: return AARCH64_LT;
8118 case UNLE: return AARCH64_LE;
8119 case UNGT: return AARCH64_HI;
8120 case UNGE: return AARCH64_PL;
8121 default: return -1;
8122 }
8123 break;
8124
8125 case E_CCmode:
8126 switch (comp_code)
8127 {
8128 case NE: return AARCH64_NE;
8129 case EQ: return AARCH64_EQ;
8130 case GE: return AARCH64_GE;
8131 case GT: return AARCH64_GT;
8132 case LE: return AARCH64_LE;
8133 case LT: return AARCH64_LT;
8134 case GEU: return AARCH64_CS;
8135 case GTU: return AARCH64_HI;
8136 case LEU: return AARCH64_LS;
8137 case LTU: return AARCH64_CC;
8138 default: return -1;
8139 }
8140 break;
8141
8142 case E_CC_SWPmode:
8143 switch (comp_code)
8144 {
8145 case NE: return AARCH64_NE;
8146 case EQ: return AARCH64_EQ;
8147 case GE: return AARCH64_LE;
8148 case GT: return AARCH64_LT;
8149 case LE: return AARCH64_GE;
8150 case LT: return AARCH64_GT;
8151 case GEU: return AARCH64_LS;
8152 case GTU: return AARCH64_CC;
8153 case LEU: return AARCH64_CS;
8154 case LTU: return AARCH64_HI;
8155 default: return -1;
8156 }
8157 break;
8158
8159 case E_CC_NZCmode:
8160 switch (comp_code)
8161 {
8162 case NE: return AARCH64_NE; /* = any */
8163 case EQ: return AARCH64_EQ; /* = none */
8164 case GE: return AARCH64_PL; /* = nfrst */
8165 case LT: return AARCH64_MI; /* = first */
8166 case GEU: return AARCH64_CS; /* = nlast */
8167 case GTU: return AARCH64_HI; /* = pmore */
8168 case LEU: return AARCH64_LS; /* = plast */
8169 case LTU: return AARCH64_CC; /* = last */
8170 default: return -1;
8171 }
8172 break;
8173
8174 case E_CC_NZmode:
8175 switch (comp_code)
8176 {
8177 case NE: return AARCH64_NE;
8178 case EQ: return AARCH64_EQ;
8179 case GE: return AARCH64_PL;
8180 case LT: return AARCH64_MI;
8181 default: return -1;
8182 }
8183 break;
8184
8185 case E_CC_Zmode:
8186 switch (comp_code)
8187 {
8188 case NE: return AARCH64_NE;
8189 case EQ: return AARCH64_EQ;
8190 default: return -1;
8191 }
8192 break;
8193
8194 case E_CC_Cmode:
8195 switch (comp_code)
8196 {
8197 case LTU: return AARCH64_CS;
8198 case GEU: return AARCH64_CC;
8199 default: return -1;
8200 }
8201 break;
8202
8203 case E_CC_ADCmode:
8204 switch (comp_code)
8205 {
8206 case GEU: return AARCH64_CS;
8207 case LTU: return AARCH64_CC;
8208 default: return -1;
8209 }
8210 break;
8211
8212 case E_CC_Vmode:
8213 switch (comp_code)
8214 {
8215 case NE: return AARCH64_VS;
8216 case EQ: return AARCH64_VC;
8217 default: return -1;
8218 }
8219 break;
8220
8221 default:
8222 return -1;
8223 }
8224
8225 return -1;
8226 }
8227
8228 bool
8229 aarch64_const_vec_all_same_in_range_p (rtx x,
8230 HOST_WIDE_INT minval,
8231 HOST_WIDE_INT maxval)
8232 {
8233 rtx elt;
8234 return (const_vec_duplicate_p (x, &elt)
8235 && CONST_INT_P (elt)
8236 && IN_RANGE (INTVAL (elt), minval, maxval));
8237 }
8238
8239 bool
8240 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8241 {
8242 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8243 }
8244
8245 /* Return true if VEC is a constant in which every element is in the range
8246 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8247
8248 static bool
8249 aarch64_const_vec_all_in_range_p (rtx vec,
8250 HOST_WIDE_INT minval,
8251 HOST_WIDE_INT maxval)
8252 {
8253 if (GET_CODE (vec) != CONST_VECTOR
8254 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8255 return false;
8256
8257 int nunits;
8258 if (!CONST_VECTOR_STEPPED_P (vec))
8259 nunits = const_vector_encoded_nelts (vec);
8260 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8261 return false;
8262
8263 for (int i = 0; i < nunits; i++)
8264 {
8265 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8266 if (!CONST_INT_P (vec_elem)
8267 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8268 return false;
8269 }
8270 return true;
8271 }
8272
8273 /* N Z C V. */
8274 #define AARCH64_CC_V 1
8275 #define AARCH64_CC_C (1 << 1)
8276 #define AARCH64_CC_Z (1 << 2)
8277 #define AARCH64_CC_N (1 << 3)
8278
8279 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8280 static const int aarch64_nzcv_codes[] =
8281 {
8282 0, /* EQ, Z == 1. */
8283 AARCH64_CC_Z, /* NE, Z == 0. */
8284 0, /* CS, C == 1. */
8285 AARCH64_CC_C, /* CC, C == 0. */
8286 0, /* MI, N == 1. */
8287 AARCH64_CC_N, /* PL, N == 0. */
8288 0, /* VS, V == 1. */
8289 AARCH64_CC_V, /* VC, V == 0. */
8290 0, /* HI, C == 1 && Z == 0. */
8291 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8292 AARCH64_CC_V, /* GE, N == V. */
8293 0, /* LT, N != V. */
8294 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8295 0, /* LE, !(Z == 0 && N == V). */
8296 0, /* AL, Any. */
8297 0 /* NV, Any. */
8298 };
8299
8300 /* Print floating-point vector immediate operand X to F, negating it
8301 first if NEGATE is true. Return true on success, false if it isn't
8302 a constant we can handle. */
8303
8304 static bool
8305 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8306 {
8307 rtx elt;
8308
8309 if (!const_vec_duplicate_p (x, &elt))
8310 return false;
8311
8312 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8313 if (negate)
8314 r = real_value_negate (&r);
8315
8316 /* Handle the SVE single-bit immediates specially, since they have a
8317 fixed form in the assembly syntax. */
8318 if (real_equal (&r, &dconst0))
8319 asm_fprintf (f, "0.0");
8320 else if (real_equal (&r, &dconst2))
8321 asm_fprintf (f, "2.0");
8322 else if (real_equal (&r, &dconst1))
8323 asm_fprintf (f, "1.0");
8324 else if (real_equal (&r, &dconsthalf))
8325 asm_fprintf (f, "0.5");
8326 else
8327 {
8328 const int buf_size = 20;
8329 char float_buf[buf_size] = {'\0'};
8330 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
8331 1, GET_MODE (elt));
8332 asm_fprintf (f, "%s", float_buf);
8333 }
8334
8335 return true;
8336 }
8337
8338 /* Return the equivalent letter for size. */
8339 static char
8340 sizetochar (int size)
8341 {
8342 switch (size)
8343 {
8344 case 64: return 'd';
8345 case 32: return 's';
8346 case 16: return 'h';
8347 case 8 : return 'b';
8348 default: gcc_unreachable ();
8349 }
8350 }
8351
8352 /* Print operand X to file F in a target specific manner according to CODE.
8353 The acceptable formatting commands given by CODE are:
8354 'c': An integer or symbol address without a preceding #
8355 sign.
8356 'C': Take the duplicated element in a vector constant
8357 and print it in hex.
8358 'D': Take the duplicated element in a vector constant
8359 and print it as an unsigned integer, in decimal.
8360 'e': Print the sign/zero-extend size as a character 8->b,
8361 16->h, 32->w. Can also be used for masks:
8362 0xff->b, 0xffff->h, 0xffffffff->w.
8363 'I': If the operand is a duplicated vector constant,
8364 replace it with the duplicated scalar. If the
8365 operand is then a floating-point constant, replace
8366 it with the integer bit representation. Print the
8367 transformed constant as a signed decimal number.
8368 'p': Prints N such that 2^N == X (X must be a power of 2 and
8369 a const_int).
8370 'P': Print the number of non-zero bits in X (a const_int).
8371 'H': Print the higher numbered register of a pair (TImode)
8372 of regs.
8373 'm': Print a condition (eq, ne, etc).
8374 'M': Same as 'm', but invert condition.
8375 'N': Take the duplicated element in a vector constant
8376 and print the negative of it in decimal.
8377 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8378 'S/T/U/V': Print a FP/SIMD register name for a register list.
8379 The register printed is the FP/SIMD register name
8380 of X + 0/1/2/3 for S/T/U/V.
8381 'R': Print a scalar FP/SIMD register name + 1.
8382 'X': Print bottom 16 bits of integer constant in hex.
8383 'w/x': Print a general register name or the zero register
8384 (32-bit or 64-bit).
8385 '0': Print a normal operand, if it's a general register,
8386 then we assume DImode.
8387 'k': Print NZCV for conditional compare instructions.
8388 'A': Output address constant representing the first
8389 argument of X, specifying a relocation offset
8390 if appropriate.
8391 'L': Output constant address specified by X
8392 with a relocation offset if appropriate.
8393 'G': Prints address of X, specifying a PC relative
8394 relocation mode if appropriate.
8395 'y': Output address of LDP or STP - this is used for
8396 some LDP/STPs which don't use a PARALLEL in their
8397 pattern (so the mode needs to be adjusted).
8398 'z': Output address of a typical LDP or STP. */
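/* For example (illustrative), in an output template such as
   "add\t%w0, %w1, %w2" the 'w' code prints the 32-bit names of the
   general-register operands (w0, w1, ...), or wzr for the zero register.  */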
8399
8400 static void
8401 aarch64_print_operand (FILE *f, rtx x, int code)
8402 {
8403 rtx elt;
8404 switch (code)
8405 {
8406 case 'c':
8407 switch (GET_CODE (x))
8408 {
8409 case CONST_INT:
8410 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8411 break;
8412
8413 case SYMBOL_REF:
8414 output_addr_const (f, x);
8415 break;
8416
8417 case CONST:
8418 if (GET_CODE (XEXP (x, 0)) == PLUS
8419 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8420 {
8421 output_addr_const (f, x);
8422 break;
8423 }
8424 /* Fall through. */
8425
8426 default:
8427 output_operand_lossage ("unsupported operand for code '%c'", code);
8428 }
8429 break;
8430
8431 case 'e':
8432 {
8433 x = unwrap_const_vec_duplicate (x);
8434 if (!CONST_INT_P (x))
8435 {
8436 output_operand_lossage ("invalid operand for '%%%c'", code);
8437 return;
8438 }
8439
8440 HOST_WIDE_INT val = INTVAL (x);
8441 if ((val & ~7) == 8 || val == 0xff)
8442 fputc ('b', f);
8443 else if ((val & ~7) == 16 || val == 0xffff)
8444 fputc ('h', f);
8445 else if ((val & ~7) == 32 || val == 0xffffffff)
8446 fputc ('w', f);
8447 else
8448 {
8449 output_operand_lossage ("invalid operand for '%%%c'", code);
8450 return;
8451 }
8452 }
8453 break;
8454
8455 case 'p':
8456 {
8457 int n;
8458
8459 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8460 {
8461 output_operand_lossage ("invalid operand for '%%%c'", code);
8462 return;
8463 }
8464
8465 asm_fprintf (f, "%d", n);
8466 }
8467 break;
8468
8469 case 'P':
8470 if (!CONST_INT_P (x))
8471 {
8472 output_operand_lossage ("invalid operand for '%%%c'", code);
8473 return;
8474 }
8475
8476 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8477 break;
8478
8479 case 'H':
8480 if (x == const0_rtx)
8481 {
8482 asm_fprintf (f, "xzr");
8483 break;
8484 }
8485
8486 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8487 {
8488 output_operand_lossage ("invalid operand for '%%%c'", code);
8489 return;
8490 }
8491
8492 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8493 break;
8494
8495 case 'I':
8496 {
8497 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
8498 if (CONST_INT_P (x))
8499 asm_fprintf (f, "%wd", INTVAL (x));
8500 else
8501 {
8502 output_operand_lossage ("invalid operand for '%%%c'", code);
8503 return;
8504 }
8505 break;
8506 }
8507
8508 case 'M':
8509 case 'm':
8510 {
8511 int cond_code;
8512 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8513 if (x == const_true_rtx)
8514 {
8515 if (code == 'M')
8516 fputs ("nv", f);
8517 return;
8518 }
8519
8520 if (!COMPARISON_P (x))
8521 {
8522 output_operand_lossage ("invalid operand for '%%%c'", code);
8523 return;
8524 }
8525
8526 cond_code = aarch64_get_condition_code (x);
8527 gcc_assert (cond_code >= 0);
8528 if (code == 'M')
8529 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8530 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8531 fputs (aarch64_sve_condition_codes[cond_code], f);
8532 else
8533 fputs (aarch64_condition_codes[cond_code], f);
8534 }
8535 break;
8536
8537 case 'N':
8538 if (!const_vec_duplicate_p (x, &elt))
8539 {
8540 output_operand_lossage ("invalid vector constant");
8541 return;
8542 }
8543
8544 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8545 asm_fprintf (f, "%wd", -INTVAL (elt));
8546 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8547 && aarch64_print_vector_float_operand (f, x, true))
8548 ;
8549 else
8550 {
8551 output_operand_lossage ("invalid vector constant");
8552 return;
8553 }
8554 break;
8555
8556 case 'b':
8557 case 'h':
8558 case 's':
8559 case 'd':
8560 case 'q':
8561 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8562 {
8563 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8564 return;
8565 }
8566 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8567 break;
8568
8569 case 'S':
8570 case 'T':
8571 case 'U':
8572 case 'V':
8573 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8574 {
8575 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8576 return;
8577 }
8578 asm_fprintf (f, "%c%d",
8579 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8580 REGNO (x) - V0_REGNUM + (code - 'S'));
8581 break;
8582
8583 case 'R':
8584 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8585 {
8586 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8587 return;
8588 }
8589 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8590 break;
8591
8592 case 'X':
8593 if (!CONST_INT_P (x))
8594 {
8595 output_operand_lossage ("invalid operand for '%%%c'", code);
8596 return;
8597 }
8598 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8599 break;
8600
8601 case 'C':
8602 {
8603 /* Print a replicated constant in hex. */
8604 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8605 {
8606 output_operand_lossage ("invalid operand for '%%%c'", code);
8607 return;
8608 }
8609 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8610 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8611 }
8612 break;
8613
8614 case 'D':
8615 {
8616 /* Print a replicated constant in decimal, treating it as
8617 unsigned. */
8618 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8619 {
8620 output_operand_lossage ("invalid operand for '%%%c'", code);
8621 return;
8622 }
8623 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8624 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8625 }
8626 break;
8627
8628 case 'w':
8629 case 'x':
8630 if (x == const0_rtx
8631 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8632 {
8633 asm_fprintf (f, "%czr", code);
8634 break;
8635 }
8636
8637 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8638 {
8639 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8640 break;
8641 }
8642
8643 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8644 {
8645 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8646 break;
8647 }
8648
8649 /* Fall through */
8650
8651 case 0:
8652 if (x == NULL)
8653 {
8654 output_operand_lossage ("missing operand");
8655 return;
8656 }
8657
8658 switch (GET_CODE (x))
8659 {
8660 case REG:
8661 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8662 {
8663 if (REG_NREGS (x) == 1)
8664 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8665 else
8666 {
8667 char suffix
8668 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8669 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8670 REGNO (x) - V0_REGNUM, suffix,
8671 END_REGNO (x) - V0_REGNUM - 1, suffix);
8672 }
8673 }
8674 else
8675 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8676 break;
8677
8678 case MEM:
8679 output_address (GET_MODE (x), XEXP (x, 0));
8680 break;
8681
8682 case LABEL_REF:
8683 case SYMBOL_REF:
8684 output_addr_const (asm_out_file, x);
8685 break;
8686
8687 case CONST_INT:
8688 asm_fprintf (f, "%wd", INTVAL (x));
8689 break;
8690
8691 case CONST:
8692 if (!VECTOR_MODE_P (GET_MODE (x)))
8693 {
8694 output_addr_const (asm_out_file, x);
8695 break;
8696 }
8697 /* fall through */
8698
8699 case CONST_VECTOR:
8700 if (!const_vec_duplicate_p (x, &elt))
8701 {
8702 output_operand_lossage ("invalid vector constant");
8703 return;
8704 }
8705
8706 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8707 asm_fprintf (f, "%wd", INTVAL (elt));
8708 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8709 && aarch64_print_vector_float_operand (f, x, false))
8710 ;
8711 else
8712 {
8713 output_operand_lossage ("invalid vector constant");
8714 return;
8715 }
8716 break;
8717
8718 case CONST_DOUBLE:
8719 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8720 be getting CONST_DOUBLEs holding integers. */
8721 gcc_assert (GET_MODE (x) != VOIDmode);
8722 if (aarch64_float_const_zero_rtx_p (x))
8723 {
8724 fputc ('0', f);
8725 break;
8726 }
8727 else if (aarch64_float_const_representable_p (x))
8728 {
8729 #define buf_size 20
8730 char float_buf[buf_size] = {'\0'};
8731 real_to_decimal_for_mode (float_buf,
8732 CONST_DOUBLE_REAL_VALUE (x),
8733 buf_size, buf_size,
8734 1, GET_MODE (x));
8735 asm_fprintf (asm_out_file, "%s", float_buf);
8736 break;
8737 #undef buf_size
8738 }
8739 output_operand_lossage ("invalid constant");
8740 return;
8741 default:
8742 output_operand_lossage ("invalid operand");
8743 return;
8744 }
8745 break;
8746
8747 case 'A':
8748 if (GET_CODE (x) == HIGH)
8749 x = XEXP (x, 0);
8750
8751 switch (aarch64_classify_symbolic_expression (x))
8752 {
8753 case SYMBOL_SMALL_GOT_4G:
8754 asm_fprintf (asm_out_file, ":got:");
8755 break;
8756
8757 case SYMBOL_SMALL_TLSGD:
8758 asm_fprintf (asm_out_file, ":tlsgd:");
8759 break;
8760
8761 case SYMBOL_SMALL_TLSDESC:
8762 asm_fprintf (asm_out_file, ":tlsdesc:");
8763 break;
8764
8765 case SYMBOL_SMALL_TLSIE:
8766 asm_fprintf (asm_out_file, ":gottprel:");
8767 break;
8768
8769 case SYMBOL_TLSLE24:
8770 asm_fprintf (asm_out_file, ":tprel:");
8771 break;
8772
8773 case SYMBOL_TINY_GOT:
8774 gcc_unreachable ();
8775 break;
8776
8777 default:
8778 break;
8779 }
8780 output_addr_const (asm_out_file, x);
8781 break;
8782
8783 case 'L':
8784 switch (aarch64_classify_symbolic_expression (x))
8785 {
8786 case SYMBOL_SMALL_GOT_4G:
8787 asm_fprintf (asm_out_file, ":lo12:");
8788 break;
8789
8790 case SYMBOL_SMALL_TLSGD:
8791 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8792 break;
8793
8794 case SYMBOL_SMALL_TLSDESC:
8795 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8796 break;
8797
8798 case SYMBOL_SMALL_TLSIE:
8799 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8800 break;
8801
8802 case SYMBOL_TLSLE12:
8803 asm_fprintf (asm_out_file, ":tprel_lo12:");
8804 break;
8805
8806 case SYMBOL_TLSLE24:
8807 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8808 break;
8809
8810 case SYMBOL_TINY_GOT:
8811 asm_fprintf (asm_out_file, ":got:");
8812 break;
8813
8814 case SYMBOL_TINY_TLSIE:
8815 asm_fprintf (asm_out_file, ":gottprel:");
8816 break;
8817
8818 default:
8819 break;
8820 }
8821 output_addr_const (asm_out_file, x);
8822 break;
8823
8824 case 'G':
8825 switch (aarch64_classify_symbolic_expression (x))
8826 {
8827 case SYMBOL_TLSLE24:
8828 asm_fprintf (asm_out_file, ":tprel_hi12:");
8829 break;
8830 default:
8831 break;
8832 }
8833 output_addr_const (asm_out_file, x);
8834 break;
8835
8836 case 'k':
8837 {
8838 HOST_WIDE_INT cond_code;
8839
8840 if (!CONST_INT_P (x))
8841 {
8842 output_operand_lossage ("invalid operand for '%%%c'", code);
8843 return;
8844 }
8845
8846 cond_code = INTVAL (x);
8847 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8848 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8849 }
8850 break;
8851
8852 case 'y':
8853 case 'z':
8854 {
8855 machine_mode mode = GET_MODE (x);
8856
8857 if (GET_CODE (x) != MEM
8858 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8859 {
8860 output_operand_lossage ("invalid operand for '%%%c'", code);
8861 return;
8862 }
8863
8864 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8865 code == 'y'
8866 ? ADDR_QUERY_LDP_STP_N
8867 : ADDR_QUERY_LDP_STP))
8868 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8869 }
8870 break;
8871
8872 default:
8873 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8874 return;
8875 }
8876 }
8877
8878 /* Print address 'x' of a memory access with mode 'mode'.
8879 'type' is the aarch64_addr_query_type context required by
8880 aarch64_classify_address (e.g. ADDR_QUERY_ANY for a normal memory access, or an LDP/STP query). */
8881 static bool
8882 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8883 aarch64_addr_query_type type)
8884 {
8885 struct aarch64_address_info addr;
8886 unsigned int size;
8887
8888 /* Check all addresses are Pmode - including ILP32. */
8889 if (GET_MODE (x) != Pmode
8890 && (!CONST_INT_P (x)
8891 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8892 {
8893 output_operand_lossage ("invalid address mode");
8894 return false;
8895 }
8896
8897 if (aarch64_classify_address (&addr, x, mode, true, type))
8898 switch (addr.type)
8899 {
8900 case ADDRESS_REG_IMM:
8901 if (known_eq (addr.const_offset, 0))
8902 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8903 else if (aarch64_sve_data_mode_p (mode))
8904 {
8905 HOST_WIDE_INT vnum
8906 = exact_div (addr.const_offset,
8907 BYTES_PER_SVE_VECTOR).to_constant ();
8908 asm_fprintf (f, "[%s, #%wd, mul vl]",
8909 reg_names[REGNO (addr.base)], vnum);
8910 }
8911 else if (aarch64_sve_pred_mode_p (mode))
8912 {
8913 HOST_WIDE_INT vnum
8914 = exact_div (addr.const_offset,
8915 BYTES_PER_SVE_PRED).to_constant ();
8916 asm_fprintf (f, "[%s, #%wd, mul vl]",
8917 reg_names[REGNO (addr.base)], vnum);
8918 }
8919 else
8920 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8921 INTVAL (addr.offset));
8922 return true;
8923
8924 case ADDRESS_REG_REG:
8925 if (addr.shift == 0)
8926 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8927 reg_names [REGNO (addr.offset)]);
8928 else
8929 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8930 reg_names [REGNO (addr.offset)], addr.shift);
8931 return true;
8932
8933 case ADDRESS_REG_UXTW:
8934 if (addr.shift == 0)
8935 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8936 REGNO (addr.offset) - R0_REGNUM);
8937 else
8938 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8939 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8940 return true;
8941
8942 case ADDRESS_REG_SXTW:
8943 if (addr.shift == 0)
8944 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8945 REGNO (addr.offset) - R0_REGNUM);
8946 else
8947 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8948 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8949 return true;
8950
8951 case ADDRESS_REG_WB:
8952 /* Writeback is only supported for fixed-width modes. */
8953 size = GET_MODE_SIZE (mode).to_constant ();
8954 switch (GET_CODE (x))
8955 {
8956 case PRE_INC:
8957 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8958 return true;
8959 case POST_INC:
8960 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8961 return true;
8962 case PRE_DEC:
8963 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8964 return true;
8965 case POST_DEC:
8966 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8967 return true;
8968 case PRE_MODIFY:
8969 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8970 INTVAL (addr.offset));
8971 return true;
8972 case POST_MODIFY:
8973 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8974 INTVAL (addr.offset));
8975 return true;
8976 default:
8977 break;
8978 }
8979 break;
8980
8981 case ADDRESS_LO_SUM:
8982 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8983 output_addr_const (f, addr.offset);
8984 asm_fprintf (f, "]");
8985 return true;
8986
8987 case ADDRESS_SYMBOLIC:
8988 output_addr_const (f, x);
8989 return true;
8990 }
8991
8992 return false;
8993 }
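/* As an illustration (register numbers and offsets chosen arbitrarily),
   the cases above emit address syntax such as:
     ADDRESS_REG_IMM            [x0, 16]    or, for SVE data, [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1, lsl 3]
     ADDRESS_REG_UXTW           [x0, w1, uxtw 2]
     ADDRESS_REG_WB (POST_INC)  [x0], 16
     ADDRESS_LO_SUM             [x0, #:lo12:symbol]  */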
8994
8995 /* Print address 'x' of a memory access with mode 'mode'. */
8996 static void
8997 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8998 {
8999 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9000 output_addr_const (f, x);
9001 }
9002
9003 bool
9004 aarch64_label_mentioned_p (rtx x)
9005 {
9006 const char *fmt;
9007 int i;
9008
9009 if (GET_CODE (x) == LABEL_REF)
9010 return true;
9011
9012 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9013 referencing instruction, but they are constant offsets, not
9014 symbols. */
9015 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9016 return false;
9017
9018 fmt = GET_RTX_FORMAT (GET_CODE (x));
9019 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9020 {
9021 if (fmt[i] == 'E')
9022 {
9023 int j;
9024
9025 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
9026 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
9027 return 1;
9028 }
9029 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
9030 return 1;
9031 }
9032
9033 return 0;
9034 }
9035
9036 /* Implement REGNO_REG_CLASS. */
9037
9038 enum reg_class
9039 aarch64_regno_regclass (unsigned regno)
9040 {
9041 if (GP_REGNUM_P (regno))
9042 return GENERAL_REGS;
9043
9044 if (regno == SP_REGNUM)
9045 return STACK_REG;
9046
9047 if (regno == FRAME_POINTER_REGNUM
9048 || regno == ARG_POINTER_REGNUM)
9049 return POINTER_REGS;
9050
9051 if (FP_REGNUM_P (regno))
9052 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
9053 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
9054
9055 if (PR_REGNUM_P (regno))
9056 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
9057
9058 return NO_REGS;
9059 }
9060
9061 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
9062 If OFFSET is out of range, return an offset of an anchor point
9063 that is in range. Return 0 otherwise. */
9064
9065 static HOST_WIDE_INT
9066 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
9067 machine_mode mode)
9068 {
9069 /* Does it look like we'll need a 16-byte load/store-pair operation? */
9070 if (size > 16)
9071 return (offset + 0x400) & ~0x7f0;
9072
9073 /* For offsets that aren't a multiple of the access size, the limit is
9074 -256...255. */
9075 if (offset & (size - 1))
9076 {
9077 /* BLKmode typically uses LDP of X-registers. */
9078 if (mode == BLKmode)
9079 return (offset + 512) & ~0x3ff;
9080 return (offset + 0x100) & ~0x1ff;
9081 }
9082
9083 /* Small negative offsets are supported. */
9084 if (IN_RANGE (offset, -256, 0))
9085 return 0;
9086
9087 if (mode == TImode || mode == TFmode)
9088 return (offset + 0x100) & ~0x1ff;
9089
9090 /* Use a 12-bit offset, scaled by the access size. */
9091 return offset & (~0xfff * size);
9092 }
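/* As a worked example: for an SImode access (SIZE == 4) at OFFSET == 0x12004,
   none of the earlier cases apply, so the code above returns
   0x12004 & (~0xfff * 4) == 0x10000.  The residual offset of 0x2004 is a
   multiple of 4 and no larger than 0xfff * 4, so it fits the scaled
   unsigned 12-bit LDR/STR immediate.  */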
9093
9094 static rtx
9095 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
9096 {
9097 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
9098 where mask is selected by alignment and size of the offset.
9099 We try to pick as large a range for the offset as possible to
9100 maximize the chance of a CSE. However, for aligned addresses
9101 we limit the range to 4k so that structures with different sized
9102 elements are likely to use the same base. We need to be careful
9103 not to split a CONST for some forms of address expression, otherwise
9104 it will generate sub-optimal code. */
9105
9106 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
9107 {
9108 rtx base = XEXP (x, 0);
9109 rtx offset_rtx = XEXP (x, 1);
9110 HOST_WIDE_INT offset = INTVAL (offset_rtx);
9111
9112 if (GET_CODE (base) == PLUS)
9113 {
9114 rtx op0 = XEXP (base, 0);
9115 rtx op1 = XEXP (base, 1);
9116
9117 /* Force any scaling into a temp for CSE. */
9118 op0 = force_reg (Pmode, op0);
9119 op1 = force_reg (Pmode, op1);
9120
9121 /* Let the pointer register be in op0. */
9122 if (REG_POINTER (op1))
9123 std::swap (op0, op1);
9124
9125 /* If the pointer is virtual or frame related, then we know that
9126 virtual register instantiation or register elimination is going
9127 to apply a second constant. We want the two constants folded
9128 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
9129 if (virt_or_elim_regno_p (REGNO (op0)))
9130 {
9131 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
9132 NULL_RTX, true, OPTAB_DIRECT);
9133 return gen_rtx_PLUS (Pmode, base, op1);
9134 }
9135
9136 /* Otherwise, in order to encourage CSE (and thence loop strength
9137 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
9138 base = expand_binop (Pmode, add_optab, op0, op1,
9139 NULL_RTX, true, OPTAB_DIRECT);
9140 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
9141 }
9142
9143 HOST_WIDE_INT size;
9144 if (GET_MODE_SIZE (mode).is_constant (&size))
9145 {
9146 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
9147 mode);
9148 if (base_offset != 0)
9149 {
9150 base = plus_constant (Pmode, base, base_offset);
9151 base = force_operand (base, NULL_RTX);
9152 return plus_constant (Pmode, base, offset - base_offset);
9153 }
9154 }
9155 }
9156
9157 return x;
9158 }
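/* For example, legitimizing (plus (reg R) (const_int 0x12004)) for an SImode
   access computes base_offset == 0x10000 (see aarch64_anchor_offset above)
   and rewrites the address as (R + 0x10000) + 0x2004, so neighbouring
   accesses off the same anchor can CSE the base computation.  */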
9159
9160 static reg_class_t
9161 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
9162 reg_class_t rclass,
9163 machine_mode mode,
9164 secondary_reload_info *sri)
9165 {
9166 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
9167 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
9168 comment at the head of aarch64-sve.md for more details about the
9169 big-endian handling. */
9170 if (BYTES_BIG_ENDIAN
9171 && reg_class_subset_p (rclass, FP_REGS)
9172 && !((REG_P (x) && HARD_REGISTER_P (x))
9173 || aarch64_simd_valid_immediate (x, NULL))
9174 && aarch64_sve_data_mode_p (mode))
9175 {
9176 sri->icode = CODE_FOR_aarch64_sve_reload_be;
9177 return NO_REGS;
9178 }
9179
9180 /* If we have to disable direct literal pool loads and stores because the
9181 function is too big, then we need a scratch register. */
9182 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
9183 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
9184 || targetm.vector_mode_supported_p (GET_MODE (x)))
9185 && !aarch64_pcrelative_literal_loads)
9186 {
9187 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
9188 return NO_REGS;
9189 }
9190
9191 /* Without the TARGET_SIMD instructions we cannot move a Q register
9192 to a Q register directly. We need a scratch. */
9193 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
9194 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
9195 && reg_class_subset_p (rclass, FP_REGS))
9196 {
9197 sri->icode = code_for_aarch64_reload_mov (mode);
9198 return NO_REGS;
9199 }
9200
9201 /* A TFmode or TImode memory access should be handled via an FP register
9202 because AArch64 has richer addressing modes for LDR/STR instructions
9203 than for LDP/STP instructions. */
9204 if (TARGET_FLOAT && rclass == GENERAL_REGS
9205 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
9206 return FP_REGS;
9207
9208 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
9209 return GENERAL_REGS;
9210
9211 return NO_REGS;
9212 }
9213
9214 static bool
9215 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
9216 {
9217 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
9218
9219 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
9220 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
9221 if (frame_pointer_needed)
9222 return to == HARD_FRAME_POINTER_REGNUM;
9223 return true;
9224 }
9225
9226 poly_int64
9227 aarch64_initial_elimination_offset (unsigned from, unsigned to)
9228 {
9229 if (to == HARD_FRAME_POINTER_REGNUM)
9230 {
9231 if (from == ARG_POINTER_REGNUM)
9232 return cfun->machine->frame.hard_fp_offset;
9233
9234 if (from == FRAME_POINTER_REGNUM)
9235 return cfun->machine->frame.hard_fp_offset
9236 - cfun->machine->frame.locals_offset;
9237 }
9238
9239 if (to == STACK_POINTER_REGNUM)
9240 {
9241 if (from == FRAME_POINTER_REGNUM)
9242 return cfun->machine->frame.frame_size
9243 - cfun->machine->frame.locals_offset;
9244 }
9245
9246 return cfun->machine->frame.frame_size;
9247 }
9248
9249 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
9250 previous frame. */
9251
9252 rtx
9253 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
9254 {
9255 if (count != 0)
9256 return const0_rtx;
9257 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
9258 }
9259
9260
9261 static void
9262 aarch64_asm_trampoline_template (FILE *f)
9263 {
9264 int offset1 = 16;
9265 int offset2 = 20;
9266
9267 if (aarch64_bti_enabled ())
9268 {
9269 asm_fprintf (f, "\thint\t34 // bti c\n");
9270 offset1 -= 4;
9271 offset2 -= 4;
9272 }
9273
9274 if (TARGET_ILP32)
9275 {
9276 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9277 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9278 offset1);
9279 }
9280 else
9281 {
9282 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9283 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9284 offset2);
9285 }
9286 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9287
9288 /* The trampoline needs an extra padding instruction. If BTI is
9289 enabled, the padding instruction is replaced by the BTI instruction
9290 at the beginning. */
9291 if (!aarch64_bti_enabled ())
9292 assemble_aligned_integer (4, const0_rtx);
9293
9294 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9295 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9296 }
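/* The code emitted above is 16 bytes in total: either BTI + two loads + BR,
   or two loads + BR + one padding word.  The two pointer-sized zero slots
   that follow are filled in by aarch64_trampoline_init below with the
   target function address and the static chain value.  */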
9297
9298 static void
9299 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9300 {
9301 rtx fnaddr, mem, a_tramp;
9302 const int tramp_code_sz = 16;
9303
9304 /* We don't need to copy the trailing D-words; we fill those in below. */
9305 emit_block_move (m_tramp, assemble_trampoline_template (),
9306 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9307 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9308 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9309 if (GET_MODE (fnaddr) != ptr_mode)
9310 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9311 emit_move_insn (mem, fnaddr);
9312
9313 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9314 emit_move_insn (mem, chain_value);
9315
9316 /* XXX We should really define a "clear_cache" pattern and use
9317 gen_clear_cache(). */
9318 a_tramp = XEXP (m_tramp, 0);
9319 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9320 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9321 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9322 ptr_mode);
9323 }
9324
9325 static unsigned char
9326 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9327 {
9328 /* ??? Logically we should only need to provide a value when
9329 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9330 can hold MODE, but at the moment we need to handle all modes.
9331 Just ignore any runtime parts for registers that can't store them. */
9332 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9333 unsigned int nregs;
9334 switch (regclass)
9335 {
9336 case TAILCALL_ADDR_REGS:
9337 case POINTER_REGS:
9338 case GENERAL_REGS:
9339 case ALL_REGS:
9340 case POINTER_AND_FP_REGS:
9341 case FP_REGS:
9342 case FP_LO_REGS:
9343 case FP_LO8_REGS:
9344 if (aarch64_sve_data_mode_p (mode)
9345 && constant_multiple_p (GET_MODE_SIZE (mode),
9346 BYTES_PER_SVE_VECTOR, &nregs))
9347 return nregs;
9348 return (aarch64_vector_data_mode_p (mode)
9349 ? CEIL (lowest_size, UNITS_PER_VREG)
9350 : CEIL (lowest_size, UNITS_PER_WORD));
9351 case STACK_REG:
9352 case PR_REGS:
9353 case PR_LO_REGS:
9354 case PR_HI_REGS:
9355 return 1;
9356
9357 case NO_REGS:
9358 return 0;
9359
9360 default:
9361 break;
9362 }
9363 gcc_unreachable ();
9364 }
9365
9366 static reg_class_t
9367 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9368 {
9369 if (regclass == POINTER_REGS)
9370 return GENERAL_REGS;
9371
9372 if (regclass == STACK_REG)
9373 {
9374 if (REG_P(x)
9375 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9376 return regclass;
9377
9378 return NO_REGS;
9379 }
9380
9381 /* Register elimination can result in a request for
9382 SP+constant->FP_REGS. We cannot support such operations, which
9383 use SP as the source and an FP_REG as the destination, so reject
9384 them outright. */
9385 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9386 {
9387 rtx lhs = XEXP (x, 0);
9388
9389 /* Look through a possible SUBREG introduced by ILP32. */
9390 if (GET_CODE (lhs) == SUBREG)
9391 lhs = SUBREG_REG (lhs);
9392
9393 gcc_assert (REG_P (lhs));
9394 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9395 POINTER_REGS));
9396 return NO_REGS;
9397 }
9398
9399 return regclass;
9400 }
9401
9402 void
9403 aarch64_asm_output_labelref (FILE* f, const char *name)
9404 {
9405 asm_fprintf (f, "%U%s", name);
9406 }
9407
9408 static void
9409 aarch64_elf_asm_constructor (rtx symbol, int priority)
9410 {
9411 if (priority == DEFAULT_INIT_PRIORITY)
9412 default_ctor_section_asm_out_constructor (symbol, priority);
9413 else
9414 {
9415 section *s;
9416 /* Although priority is known to be in the range [0, 65535], so 18 bytes
9417 would be enough, the compiler might not know that. To avoid a
9418 -Wformat-truncation false positive, use a larger size. */
9419 char buf[23];
9420 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9421 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9422 switch_to_section (s);
9423 assemble_align (POINTER_SIZE);
9424 assemble_aligned_integer (POINTER_BYTES, symbol);
9425 }
9426 }
9427
9428 static void
9429 aarch64_elf_asm_destructor (rtx symbol, int priority)
9430 {
9431 if (priority == DEFAULT_INIT_PRIORITY)
9432 default_dtor_section_asm_out_destructor (symbol, priority);
9433 else
9434 {
9435 section *s;
9436 /* Although priority is known to be in the range [0, 65535], so 18 bytes
9437 would be enough, the compiler might not know that. To avoid a
9438 -Wformat-truncation false positive, use a larger size. */
9439 char buf[23];
9440 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9441 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9442 switch_to_section (s);
9443 assemble_align (POINTER_SIZE);
9444 assemble_aligned_integer (POINTER_BYTES, symbol);
9445 }
9446 }
9447
9448 const char*
9449 aarch64_output_casesi (rtx *operands)
9450 {
9451 char buf[100];
9452 char label[100];
9453 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9454 int index;
9455 static const char *const patterns[4][2] =
9456 {
9457 {
9458 "ldrb\t%w3, [%0,%w1,uxtw]",
9459 "add\t%3, %4, %w3, sxtb #2"
9460 },
9461 {
9462 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9463 "add\t%3, %4, %w3, sxth #2"
9464 },
9465 {
9466 "ldr\t%w3, [%0,%w1,uxtw #2]",
9467 "add\t%3, %4, %w3, sxtw #2"
9468 },
9469 /* We assume that DImode is only generated when not optimizing and
9470 that we don't really need 64-bit address offsets. That would
9471 imply an object file with 8GB of code in a single function! */
9472 {
9473 "ldr\t%w3, [%0,%w1,uxtw #2]",
9474 "add\t%3, %4, %w3, sxtw #2"
9475 }
9476 };
9477
9478 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9479
9480 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9481 index = exact_log2 (GET_MODE_SIZE (mode));
9482
9483 gcc_assert (index >= 0 && index <= 3);
9484
9485 /* Need to implement table size reduction, by changing the code below. */
9486 output_asm_insn (patterns[index][0], operands);
9487 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9488 snprintf (buf, sizeof (buf),
9489 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9490 output_asm_insn (buf, operands);
9491 output_asm_insn (patterns[index][1], operands);
9492 output_asm_insn ("br\t%3", operands);
9493 assemble_label (asm_out_file, label);
9494 return "";
9495 }
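/* For a 4-byte (SImode) dispatch table the sequence emitted above is,
   schematically (with registers standing in for the %-operands):
	ldr	w3, [x0, w1, uxtw #2]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxtw #2
	br	x3
     .Lrtx<N>:  */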
9496
9497
9498 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9499 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9500 operator. */
9501
9502 int
9503 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9504 {
9505 if (shift >= 0 && shift <= 3)
9506 {
9507 int size;
9508 for (size = 8; size <= 32; size *= 2)
9509 {
9510 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9511 if (mask == bits << shift)
9512 return size;
9513 }
9514 }
9515 return 0;
9516 }
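/* For example, SHIFT == 2 with MASK == 0x3fc matches 0xff << 2 and returns 8
   (a UXTB operand shifted left by 2), while SHIFT == 0 with MASK == 0xffff
   returns 16 (UXTH).  Any other combination returns 0.  */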
9517
9518 /* Constant pools are per-function only when PC-relative
9519 literal loads are enabled or we are in the large memory
9520 model. */
9521
9522 static inline bool
9523 aarch64_can_use_per_function_literal_pools_p (void)
9524 {
9525 return (aarch64_pcrelative_literal_loads
9526 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9527 }
9528
9529 static bool
9530 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9531 {
9532 /* We can't use blocks for constants when we're using a per-function
9533 constant pool. */
9534 return !aarch64_can_use_per_function_literal_pools_p ();
9535 }
9536
9537 /* Select appropriate section for constants depending
9538 on where we place literal pools. */
9539
9540 static section *
9541 aarch64_select_rtx_section (machine_mode mode,
9542 rtx x,
9543 unsigned HOST_WIDE_INT align)
9544 {
9545 if (aarch64_can_use_per_function_literal_pools_p ())
9546 return function_section (current_function_decl);
9547
9548 return default_elf_select_rtx_section (mode, x, align);
9549 }
9550
9551 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9552 void
9553 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9554 HOST_WIDE_INT offset)
9555 {
9556 /* When using per-function literal pools, we must ensure that any code
9557 section is aligned to the minimal instruction length, lest we get
9558 errors from the assembler re "unaligned instructions". */
9559 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9560 ASM_OUTPUT_ALIGN (f, 2);
9561 }
9562
9563 /* Costs. */
9564
9565 /* Helper function for rtx cost calculation. Strip a shift expression
9566 from X. Returns the inner operand if successful, or the original
9567 expression on failure. */
9568 static rtx
9569 aarch64_strip_shift (rtx x)
9570 {
9571 rtx op = x;
9572
9573 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9574 we can convert both to ROR during final output. */
9575 if ((GET_CODE (op) == ASHIFT
9576 || GET_CODE (op) == ASHIFTRT
9577 || GET_CODE (op) == LSHIFTRT
9578 || GET_CODE (op) == ROTATERT
9579 || GET_CODE (op) == ROTATE)
9580 && CONST_INT_P (XEXP (op, 1)))
9581 return XEXP (op, 0);
9582
9583 if (GET_CODE (op) == MULT
9584 && CONST_INT_P (XEXP (op, 1))
9585 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9586 return XEXP (op, 0);
9587
9588 return x;
9589 }
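/* For example, (ashift (reg) (const_int 3)) and (mult (reg) (const_int 8))
   both strip down to (reg), whereas (ashift (reg) (reg)) is returned
   unchanged because the shift amount is not constant.  */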
9590
9591 /* Helper function for rtx cost calculation. Strip an extend
9592 expression from X. Returns the inner operand if successful, or the
9593 original expression on failure. We deal with a number of possible
9594 canonicalization variations here. If STRIP_SHIFT is true, then
9595 we can strip off a shift also. */
9596 static rtx
9597 aarch64_strip_extend (rtx x, bool strip_shift)
9598 {
9599 scalar_int_mode mode;
9600 rtx op = x;
9601
9602 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9603 return op;
9604
9605 /* Zero and sign extraction of a widened value. */
9606 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9607 && XEXP (op, 2) == const0_rtx
9608 && GET_CODE (XEXP (op, 0)) == MULT
9609 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9610 XEXP (op, 1)))
9611 return XEXP (XEXP (op, 0), 0);
9612
9613 /* It can also be represented (for zero-extend) as an AND with an
9614 immediate. */
9615 if (GET_CODE (op) == AND
9616 && GET_CODE (XEXP (op, 0)) == MULT
9617 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9618 && CONST_INT_P (XEXP (op, 1))
9619 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9620 INTVAL (XEXP (op, 1))) != 0)
9621 return XEXP (XEXP (op, 0), 0);
9622
9623 /* Now handle extended register, as this may also have an optional
9624 left shift by 1..4. */
9625 if (strip_shift
9626 && GET_CODE (op) == ASHIFT
9627 && CONST_INT_P (XEXP (op, 1))
9628 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9629 op = XEXP (op, 0);
9630
9631 if (GET_CODE (op) == ZERO_EXTEND
9632 || GET_CODE (op) == SIGN_EXTEND)
9633 op = XEXP (op, 0);
9634
9635 if (op != x)
9636 return op;
9637
9638 return x;
9639 }
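/* For example, (zero_extend:DI (reg:SI)) strips to (reg:SI), and with
   STRIP_SHIFT true (ashift:DI (zero_extend:DI (reg:SI)) (const_int 2))
   also strips to (reg:SI), matching the extended-register forms with an
   optional left shift of up to 4.  */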
9640
9641 /* Return true iff CODE is a shift supported in combination
9642 with arithmetic instructions. */
9643
9644 static bool
9645 aarch64_shift_p (enum rtx_code code)
9646 {
9647 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9648 }
9649
9650
9651 /* Return true iff X is a cheap shift without a sign extend. */
9652
9653 static bool
9654 aarch64_cheap_mult_shift_p (rtx x)
9655 {
9656 rtx op0, op1;
9657
9658 op0 = XEXP (x, 0);
9659 op1 = XEXP (x, 1);
9660
9661 if (!(aarch64_tune_params.extra_tuning_flags
9662 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9663 return false;
9664
9665 if (GET_CODE (op0) == SIGN_EXTEND)
9666 return false;
9667
9668 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9669 && UINTVAL (op1) <= 4)
9670 return true;
9671
9672 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9673 return false;
9674
9675 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9676
9677 if (l2 > 0 && l2 <= 4)
9678 return true;
9679
9680 return false;
9681 }
9682
9683 /* Helper function for rtx cost calculation. Calculate the cost of
9684 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9685 Return the calculated cost of the expression, recursing manually in to
9686 operands where needed. */
9687
9688 static int
9689 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9690 {
9691 rtx op0, op1;
9692 const struct cpu_cost_table *extra_cost
9693 = aarch64_tune_params.insn_extra_cost;
9694 int cost = 0;
9695 bool compound_p = (outer == PLUS || outer == MINUS);
9696 machine_mode mode = GET_MODE (x);
9697
9698 gcc_checking_assert (code == MULT);
9699
9700 op0 = XEXP (x, 0);
9701 op1 = XEXP (x, 1);
9702
9703 if (VECTOR_MODE_P (mode))
9704 mode = GET_MODE_INNER (mode);
9705
9706 /* Integer multiply/fma. */
9707 if (GET_MODE_CLASS (mode) == MODE_INT)
9708 {
9709 /* The multiply will be canonicalized as a shift, cost it as such. */
9710 if (aarch64_shift_p (GET_CODE (x))
9711 || (CONST_INT_P (op1)
9712 && exact_log2 (INTVAL (op1)) > 0))
9713 {
9714 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9715 || GET_CODE (op0) == SIGN_EXTEND;
9716 if (speed)
9717 {
9718 if (compound_p)
9719 {
9720 /* If the shift is considered cheap,
9721 then don't add any cost. */
9722 if (aarch64_cheap_mult_shift_p (x))
9723 ;
9724 else if (REG_P (op1))
9725 /* ARITH + shift-by-register. */
9726 cost += extra_cost->alu.arith_shift_reg;
9727 else if (is_extend)
9728 /* ARITH + extended register. We don't have a cost field
9729 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9730 cost += extra_cost->alu.extend_arith;
9731 else
9732 /* ARITH + shift-by-immediate. */
9733 cost += extra_cost->alu.arith_shift;
9734 }
9735 else
9736 /* LSL (immediate). */
9737 cost += extra_cost->alu.shift;
9738
9739 }
9740 /* Strip extends as we will have costed them in the case above. */
9741 if (is_extend)
9742 op0 = aarch64_strip_extend (op0, true);
9743
9744 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9745
9746 return cost;
9747 }
9748
9749 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9750 compound and let the below cases handle it. After all, MNEG is a
9751 special-case alias of MSUB. */
9752 if (GET_CODE (op0) == NEG)
9753 {
9754 op0 = XEXP (op0, 0);
9755 compound_p = true;
9756 }
9757
9758 /* Integer multiplies or FMAs have zero/sign extending variants. */
9759 if ((GET_CODE (op0) == ZERO_EXTEND
9760 && GET_CODE (op1) == ZERO_EXTEND)
9761 || (GET_CODE (op0) == SIGN_EXTEND
9762 && GET_CODE (op1) == SIGN_EXTEND))
9763 {
9764 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9765 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9766
9767 if (speed)
9768 {
9769 if (compound_p)
9770 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9771 cost += extra_cost->mult[0].extend_add;
9772 else
9773 /* MUL/SMULL/UMULL. */
9774 cost += extra_cost->mult[0].extend;
9775 }
9776
9777 return cost;
9778 }
9779
9780 /* This is either an integer multiply or a MADD. In both cases
9781 we want to recurse and cost the operands. */
9782 cost += rtx_cost (op0, mode, MULT, 0, speed);
9783 cost += rtx_cost (op1, mode, MULT, 1, speed);
9784
9785 if (speed)
9786 {
9787 if (compound_p)
9788 /* MADD/MSUB. */
9789 cost += extra_cost->mult[mode == DImode].add;
9790 else
9791 /* MUL. */
9792 cost += extra_cost->mult[mode == DImode].simple;
9793 }
9794
9795 return cost;
9796 }
9797 else
9798 {
9799 if (speed)
9800 {
9801 /* Floating-point FMA/FMUL can also support negations of the
9802 operands, unless the rounding mode is upward or downward in
9803 which case FNMUL is different than FMUL with operand negation. */
9804 bool neg0 = GET_CODE (op0) == NEG;
9805 bool neg1 = GET_CODE (op1) == NEG;
9806 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9807 {
9808 if (neg0)
9809 op0 = XEXP (op0, 0);
9810 if (neg1)
9811 op1 = XEXP (op1, 0);
9812 }
9813
9814 if (compound_p)
9815 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9816 cost += extra_cost->fp[mode == DFmode].fma;
9817 else
9818 /* FMUL/FNMUL. */
9819 cost += extra_cost->fp[mode == DFmode].mult;
9820 }
9821
9822 cost += rtx_cost (op0, mode, MULT, 0, speed);
9823 cost += rtx_cost (op1, mode, MULT, 1, speed);
9824 return cost;
9825 }
9826 }
9827
9828 static int
9829 aarch64_address_cost (rtx x,
9830 machine_mode mode,
9831 addr_space_t as ATTRIBUTE_UNUSED,
9832 bool speed)
9833 {
9834 enum rtx_code c = GET_CODE (x);
9835 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9836 struct aarch64_address_info info;
9837 int cost = 0;
9838 info.shift = 0;
9839
9840 if (!aarch64_classify_address (&info, x, mode, false))
9841 {
9842 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9843 {
9844 /* This is a CONST or SYMBOL ref which will be split
9845 in a different way depending on the code model in use.
9846 Cost it through the generic infrastructure. */
9847 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9848 /* Divide through by the cost of one instruction to
9849 bring it to the same units as the address costs. */
9850 cost_symbol_ref /= COSTS_N_INSNS (1);
9851 /* The cost is then the cost of preparing the address,
9852 followed by an immediate (possibly 0) offset. */
9853 return cost_symbol_ref + addr_cost->imm_offset;
9854 }
9855 else
9856 {
9857 /* This is most likely a jump table from a case
9858 statement. */
9859 return addr_cost->register_offset;
9860 }
9861 }
9862
9863 switch (info.type)
9864 {
9865 case ADDRESS_LO_SUM:
9866 case ADDRESS_SYMBOLIC:
9867 case ADDRESS_REG_IMM:
9868 cost += addr_cost->imm_offset;
9869 break;
9870
9871 case ADDRESS_REG_WB:
9872 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9873 cost += addr_cost->pre_modify;
9874 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9875 cost += addr_cost->post_modify;
9876 else
9877 gcc_unreachable ();
9878
9879 break;
9880
9881 case ADDRESS_REG_REG:
9882 cost += addr_cost->register_offset;
9883 break;
9884
9885 case ADDRESS_REG_SXTW:
9886 cost += addr_cost->register_sextend;
9887 break;
9888
9889 case ADDRESS_REG_UXTW:
9890 cost += addr_cost->register_zextend;
9891 break;
9892
9893 default:
9894 gcc_unreachable ();
9895 }
9896
9897
9898 if (info.shift > 0)
9899 {
9900 /* For the sake of calculating the cost of the shifted register
9901 component, we can treat same sized modes in the same way. */
9902 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9903 cost += addr_cost->addr_scale_costs.hi;
9904 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9905 cost += addr_cost->addr_scale_costs.si;
9906 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9907 cost += addr_cost->addr_scale_costs.di;
9908 else
9909 /* We can't tell, or this is a 128-bit vector. */
9910 cost += addr_cost->addr_scale_costs.ti;
9911 }
9912
9913 return cost;
9914 }
9915
9916 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9917 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9918 to be taken. */
9919
9920 int
9921 aarch64_branch_cost (bool speed_p, bool predictable_p)
9922 {
9923 /* When optimizing for speed, use the cost of unpredictable branches. */
9924 const struct cpu_branch_cost *branch_costs =
9925 aarch64_tune_params.branch_costs;
9926
9927 if (!speed_p || predictable_p)
9928 return branch_costs->predictable;
9929 else
9930 return branch_costs->unpredictable;
9931 }
9932
9933 /* Return true if the RTX X in mode MODE is a zero or sign extract
9934 usable in an ADD or SUB (extended register) instruction. */
9935 static bool
9936 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9937 {
9938 /* Catch add with a sign extract.
9939 This is add_<optab><mode>_multp2. */
9940 if (GET_CODE (x) == SIGN_EXTRACT
9941 || GET_CODE (x) == ZERO_EXTRACT)
9942 {
9943 rtx op0 = XEXP (x, 0);
9944 rtx op1 = XEXP (x, 1);
9945 rtx op2 = XEXP (x, 2);
9946
9947 if (GET_CODE (op0) == MULT
9948 && CONST_INT_P (op1)
9949 && op2 == const0_rtx
9950 && CONST_INT_P (XEXP (op0, 1))
9951 && aarch64_is_extend_from_extract (mode,
9952 XEXP (op0, 1),
9953 op1))
9954 {
9955 return true;
9956 }
9957 }
9958 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9959 No shift. */
9960 else if (GET_CODE (x) == SIGN_EXTEND
9961 || GET_CODE (x) == ZERO_EXTEND)
9962 return REG_P (XEXP (x, 0));
9963
9964 return false;
9965 }
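/* For example, (sign_extend:DI (reg:SI)) satisfies the simple case above
   and corresponds to the Wm, SXTW form of ADD/SUB (extended register).  */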
9966
9967 static bool
9968 aarch64_frint_unspec_p (unsigned int u)
9969 {
9970 switch (u)
9971 {
9972 case UNSPEC_FRINTZ:
9973 case UNSPEC_FRINTP:
9974 case UNSPEC_FRINTM:
9975 case UNSPEC_FRINTA:
9976 case UNSPEC_FRINTN:
9977 case UNSPEC_FRINTX:
9978 case UNSPEC_FRINTI:
9979 return true;
9980
9981 default:
9982 return false;
9983 }
9984 }
9985
9986 /* Return true iff X is an rtx that will match an extr instruction
9987 i.e. as described in the *extr<mode>5_insn family of patterns.
9988 OP0 and OP1 will be set to the operands of the shifts involved
9989 on success and will be NULL_RTX otherwise. */
9990
9991 static bool
9992 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9993 {
9994 rtx op0, op1;
9995 scalar_int_mode mode;
9996 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9997 return false;
9998
9999 *res_op0 = NULL_RTX;
10000 *res_op1 = NULL_RTX;
10001
10002 if (GET_CODE (x) != IOR)
10003 return false;
10004
10005 op0 = XEXP (x, 0);
10006 op1 = XEXP (x, 1);
10007
10008 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10009 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10010 {
10011 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10012 if (GET_CODE (op1) == ASHIFT)
10013 std::swap (op0, op1);
10014
10015 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10016 return false;
10017
10018 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
10019 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
10020
10021 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
10022 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
10023 {
10024 *res_op0 = XEXP (op0, 0);
10025 *res_op1 = XEXP (op1, 0);
10026 return true;
10027 }
10028 }
10029
10030 return false;
10031 }
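/* For example, in DImode (ior (ashift (reg A) (const_int 16))
   (lshiftrt (reg B) (const_int 48))) is accepted because 16 + 48 == 64;
   *RES_OP0 is set to A and *RES_OP1 to B.  */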
10032
10033 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
10034 storing it in *COST. Result is true if the total cost of the operation
10035 has now been calculated. */
10036 static bool
10037 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
10038 {
10039 rtx inner;
10040 rtx comparator;
10041 enum rtx_code cmpcode;
10042
10043 if (COMPARISON_P (op0))
10044 {
10045 inner = XEXP (op0, 0);
10046 comparator = XEXP (op0, 1);
10047 cmpcode = GET_CODE (op0);
10048 }
10049 else
10050 {
10051 inner = op0;
10052 comparator = const0_rtx;
10053 cmpcode = NE;
10054 }
10055
10056 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
10057 {
10058 /* Conditional branch. */
10059 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10060 return true;
10061 else
10062 {
10063 if (cmpcode == NE || cmpcode == EQ)
10064 {
10065 if (comparator == const0_rtx)
10066 {
10067 /* TBZ/TBNZ/CBZ/CBNZ. */
10068 if (GET_CODE (inner) == ZERO_EXTRACT)
10069 /* TBZ/TBNZ. */
10070 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
10071 ZERO_EXTRACT, 0, speed);
10072 else
10073 /* CBZ/CBNZ. */
10074 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
10075
10076 return true;
10077 }
10078 }
10079 else if (cmpcode == LT || cmpcode == GE)
10080 {
10081 /* TBZ/TBNZ. */
10082 if (comparator == const0_rtx)
10083 return true;
10084 }
10085 }
10086 }
10087 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
10088 {
10089 /* CCMP. */
10090 if (GET_CODE (op1) == COMPARE)
10091 {
10092 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
10093 if (XEXP (op1, 1) == const0_rtx)
10094 *cost += 1;
10095 if (speed)
10096 {
10097 machine_mode mode = GET_MODE (XEXP (op1, 0));
10098 const struct cpu_cost_table *extra_cost
10099 = aarch64_tune_params.insn_extra_cost;
10100
10101 if (GET_MODE_CLASS (mode) == MODE_INT)
10102 *cost += extra_cost->alu.arith;
10103 else
10104 *cost += extra_cost->fp[mode == DFmode].compare;
10105 }
10106 return true;
10107 }
10108
10109 /* It's a conditional operation based on the status flags,
10110 so it must be some flavor of CSEL. */
10111
10112 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
10113 if (GET_CODE (op1) == NEG
10114 || GET_CODE (op1) == NOT
10115 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
10116 op1 = XEXP (op1, 0);
10117 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
10118 {
10119 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
10120 op1 = XEXP (op1, 0);
10121 op2 = XEXP (op2, 0);
10122 }
10123
10124 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
10125 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
10126 return true;
10127 }
10128
10129 /* We don't know what this is, cost all operands. */
10130 return false;
10131 }
10132
10133 /* Check whether X is a bitfield operation of the form shift + extend that
10134 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
10135 operand to which the bitfield operation is applied. Otherwise return
10136 NULL_RTX. */
10137
10138 static rtx
10139 aarch64_extend_bitfield_pattern_p (rtx x)
10140 {
10141 rtx_code outer_code = GET_CODE (x);
10142 machine_mode outer_mode = GET_MODE (x);
10143
10144 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
10145 && outer_mode != SImode && outer_mode != DImode)
10146 return NULL_RTX;
10147
10148 rtx inner = XEXP (x, 0);
10149 rtx_code inner_code = GET_CODE (inner);
10150 machine_mode inner_mode = GET_MODE (inner);
10151 rtx op = NULL_RTX;
10152
10153 switch (inner_code)
10154 {
10155 case ASHIFT:
10156 if (CONST_INT_P (XEXP (inner, 1))
10157 && (inner_mode == QImode || inner_mode == HImode))
10158 op = XEXP (inner, 0);
10159 break;
10160 case LSHIFTRT:
10161 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
10162 && (inner_mode == QImode || inner_mode == HImode))
10163 op = XEXP (inner, 0);
10164 break;
10165 case ASHIFTRT:
10166 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
10167 && (inner_mode == QImode || inner_mode == HImode))
10168 op = XEXP (inner, 0);
10169 break;
10170 default:
10171 break;
10172 }
10173
10174 return op;
10175 }
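/* For example, (zero_extend:SI (lshiftrt:HI (reg R) (const_int 3))) is
   recognized above and R is returned, while an inner operand wider than
   HImode is rejected.  */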
10176
10177 /* Return true if the mask and a shift amount from an RTX of the form
10178 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
10179 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
10180
10181 bool
10182 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
10183 rtx shft_amnt)
10184 {
10185 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
10186 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
10187 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
10188 && (INTVAL (mask)
10189 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
10190 }
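/* For example, in SImode MASK == 0xff0 with SHFT_AMNT == 4 is accepted:
   (0xff0 >> 4) + 1 == 0x100 is a power of two and no mask bits lie below
   the shift, so (x << 4) & 0xff0 can become a single UBFIZ.  */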
10191
10192 /* Return true if the masks and a shift amount from an RTX of the form
10193 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
10194 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
10195
10196 bool
10197 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
10198 unsigned HOST_WIDE_INT mask1,
10199 unsigned HOST_WIDE_INT shft_amnt,
10200 unsigned HOST_WIDE_INT mask2)
10201 {
10202 unsigned HOST_WIDE_INT t;
10203
10204 /* Verify that there is no overlap in what bits are set in the two masks. */
10205 if (mask1 != ~mask2)
10206 return false;
10207
10208 /* Verify that mask2 is not all zeros or ones. */
10209 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
10210 return false;
10211
10212 /* The shift amount should always be less than the mode size. */
10213 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
10214
10215 /* Verify that the mask being shifted is contiguous and would be in the
10216 least significant bits after shifting by shft_amnt. */
10217 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
10218 return (t == (t & -t));
10219 }
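/* For example, MASK2 == 0xff00 with SHFT_AMNT == 8 and MASK1 == ~MASK2
   passes the checks above: 0xff00 + (1 << 8) == 0x10000 is a power of two,
   so the insertion is a contiguous 8-bit field at bit 8 and a single BFI
   suffices.  */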
10220
10221 /* Calculate the cost of calculating X, storing it in *COST. Result
10222 is true if the total cost of the operation has now been calculated. */
10223 static bool
10224 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
10225 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
10226 {
10227 rtx op0, op1, op2;
10228 const struct cpu_cost_table *extra_cost
10229 = aarch64_tune_params.insn_extra_cost;
10230 int code = GET_CODE (x);
10231 scalar_int_mode int_mode;
10232
10233 /* By default, assume that everything has equivalent cost to the
10234 cheapest instruction. Any additional costs are applied as a delta
10235 above this default. */
10236 *cost = COSTS_N_INSNS (1);
10237
10238 switch (code)
10239 {
10240 case SET:
10241 /* The cost depends entirely on the operands to SET. */
10242 *cost = 0;
10243 op0 = SET_DEST (x);
10244 op1 = SET_SRC (x);
10245
10246 switch (GET_CODE (op0))
10247 {
10248 case MEM:
10249 if (speed)
10250 {
10251 rtx address = XEXP (op0, 0);
10252 if (VECTOR_MODE_P (mode))
10253 *cost += extra_cost->ldst.storev;
10254 else if (GET_MODE_CLASS (mode) == MODE_INT)
10255 *cost += extra_cost->ldst.store;
10256 else if (mode == SFmode)
10257 *cost += extra_cost->ldst.storef;
10258 else if (mode == DFmode)
10259 *cost += extra_cost->ldst.stored;
10260
10261 *cost +=
10262 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10263 0, speed));
10264 }
10265
10266 *cost += rtx_cost (op1, mode, SET, 1, speed);
10267 return true;
10268
10269 case SUBREG:
10270 if (! REG_P (SUBREG_REG (op0)))
10271 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10272
10273 /* Fall through. */
10274 case REG:
10275 /* The cost is one per vector-register copied. */
10276 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10277 {
10278 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10279 *cost = COSTS_N_INSNS (nregs);
10280 }
10281 /* const0_rtx is in general free, but we will use an
10282 instruction to set a register to 0. */
10283 else if (REG_P (op1) || op1 == const0_rtx)
10284 {
10285 /* The cost is 1 per register copied. */
10286 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10287 *cost = COSTS_N_INSNS (nregs);
10288 }
10289 else
10290 /* Cost is just the cost of the RHS of the set. */
10291 *cost += rtx_cost (op1, mode, SET, 1, speed);
10292 return true;
10293
10294 case ZERO_EXTRACT:
10295 case SIGN_EXTRACT:
10296 /* Bit-field insertion. Strip any redundant widening of
10297 the RHS to meet the width of the target. */
10298 if (GET_CODE (op1) == SUBREG)
10299 op1 = SUBREG_REG (op1);
10300 if ((GET_CODE (op1) == ZERO_EXTEND
10301 || GET_CODE (op1) == SIGN_EXTEND)
10302 && CONST_INT_P (XEXP (op0, 1))
10303 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10304 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10305 op1 = XEXP (op1, 0);
10306
10307 if (CONST_INT_P (op1))
10308 {
10309 /* MOV immediate is assumed to always be cheap. */
10310 *cost = COSTS_N_INSNS (1);
10311 }
10312 else
10313 {
10314 /* BFM. */
10315 if (speed)
10316 *cost += extra_cost->alu.bfi;
10317 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10318 }
10319
10320 return true;
10321
10322 default:
10323 /* We can't make sense of this, assume default cost. */
10324 *cost = COSTS_N_INSNS (1);
10325 return false;
10326 }
10327 return false;
10328
10329 case CONST_INT:
10330 /* If an instruction can incorporate a constant within the
10331 instruction, the instruction's expression avoids calling
10332 rtx_cost() on the constant. If rtx_cost() is called on a
10333 constant, then it is usually because the constant must be
10334 moved into a register by one or more instructions.
10335
10336 The exception is constant 0, which can be expressed
10337 as XZR/WZR and is therefore free. The one caveat is that if we have
10338 (set (reg) (const0_rtx)), then we must cost
10339 the move. However, we can catch that when we cost the SET, so
10340 we don't need to consider that here. */
10341 if (x == const0_rtx)
10342 *cost = 0;
10343 else
10344 {
10345 /* To an approximation, building any other constant is
10346 proportionally expensive to the number of instructions
10347 required to build that constant. This is true whether we
10348 are compiling for SPEED or otherwise. */
10349 if (!is_a <scalar_int_mode> (mode, &int_mode))
10350 int_mode = word_mode;
10351 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10352 (NULL_RTX, x, false, int_mode));
10353 }
10354 return true;
10355
10356 case CONST_DOUBLE:
10357
10358 /* First determine number of instructions to do the move
10359 as an integer constant. */
10360 if (!aarch64_float_const_representable_p (x)
10361 && !aarch64_can_const_movi_rtx_p (x, mode)
10362 && aarch64_float_const_rtx_p (x))
10363 {
10364 unsigned HOST_WIDE_INT ival;
10365 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10366 gcc_assert (succeed);
10367
10368 scalar_int_mode imode = (mode == HFmode
10369 ? SImode
10370 : int_mode_for_mode (mode).require ());
10371 int ncost = aarch64_internal_mov_immediate
10372 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10373 *cost += COSTS_N_INSNS (ncost);
10374 return true;
10375 }
10376
10377 if (speed)
10378 {
10379 /* mov[df,sf]_aarch64. */
10380 if (aarch64_float_const_representable_p (x))
10381 /* FMOV (scalar immediate). */
10382 *cost += extra_cost->fp[mode == DFmode].fpconst;
10383 else if (!aarch64_float_const_zero_rtx_p (x))
10384 {
10385 /* This will be a load from memory. */
10386 if (mode == DFmode)
10387 *cost += extra_cost->ldst.loadd;
10388 else
10389 *cost += extra_cost->ldst.loadf;
10390 }
10391 else
10392 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10393 or MOV v0.s[0], wzr - neither of which are modeled by the
10394 cost tables. Just use the default cost. */
10395 {
10396 }
10397 }
10398
10399 return true;
10400
10401 case MEM:
10402 if (speed)
10403 {
10404 /* For loads we want the base cost of a load, plus an
10405 approximation for the additional cost of the addressing
10406 mode. */
10407 rtx address = XEXP (x, 0);
10408 if (VECTOR_MODE_P (mode))
10409 *cost += extra_cost->ldst.loadv;
10410 else if (GET_MODE_CLASS (mode) == MODE_INT)
10411 *cost += extra_cost->ldst.load;
10412 else if (mode == SFmode)
10413 *cost += extra_cost->ldst.loadf;
10414 else if (mode == DFmode)
10415 *cost += extra_cost->ldst.loadd;
10416
10417 *cost +=
10418 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10419 0, speed));
10420 }
10421
10422 return true;
10423
10424 case NEG:
10425 op0 = XEXP (x, 0);
10426
10427 if (VECTOR_MODE_P (mode))
10428 {
10429 if (speed)
10430 {
10431 /* FNEG. */
10432 *cost += extra_cost->vect.alu;
10433 }
10434 return false;
10435 }
10436
10437 if (GET_MODE_CLASS (mode) == MODE_INT)
10438 {
10439 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10440 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10441 {
10442 /* CSETM. */
10443 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10444 return true;
10445 }
10446
10447 /* Cost this as SUB wzr, X. */
10448 op0 = CONST0_RTX (mode);
10449 op1 = XEXP (x, 0);
10450 goto cost_minus;
10451 }
10452
10453 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10454 {
10455 /* Support (neg(fma...)) as a single instruction only if
10456 sign of zeros is unimportant. This matches the decision
10457 making in aarch64.md. */
10458 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10459 {
10460 /* FNMADD. */
10461 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10462 return true;
10463 }
10464 if (GET_CODE (op0) == MULT)
10465 {
10466 /* FNMUL. */
10467 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10468 return true;
10469 }
10470 if (speed)
10471 /* FNEG. */
10472 *cost += extra_cost->fp[mode == DFmode].neg;
10473 return false;
10474 }
10475
10476 return false;
10477
10478 case CLRSB:
10479 case CLZ:
10480 if (speed)
10481 {
10482 if (VECTOR_MODE_P (mode))
10483 *cost += extra_cost->vect.alu;
10484 else
10485 *cost += extra_cost->alu.clz;
10486 }
10487
10488 return false;
10489
10490 case COMPARE:
10491 op0 = XEXP (x, 0);
10492 op1 = XEXP (x, 1);
10493
10494 if (op1 == const0_rtx
10495 && GET_CODE (op0) == AND)
10496 {
10497 x = op0;
10498 mode = GET_MODE (op0);
10499 goto cost_logic;
10500 }
10501
10502 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10503 {
10504 /* TODO: A write to the CC flags possibly costs extra; this
10505 needs encoding in the cost tables. */
10506
10507 mode = GET_MODE (op0);
10508 /* ANDS. */
10509 if (GET_CODE (op0) == AND)
10510 {
10511 x = op0;
10512 goto cost_logic;
10513 }
10514
10515 if (GET_CODE (op0) == PLUS)
10516 {
10517 /* ADDS (and CMN alias). */
10518 x = op0;
10519 goto cost_plus;
10520 }
10521
10522 if (GET_CODE (op0) == MINUS)
10523 {
10524 /* SUBS. */
10525 x = op0;
10526 goto cost_minus;
10527 }
10528
10529 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10530 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10531 && CONST_INT_P (XEXP (op0, 2)))
10532 {
10533 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10534 Handle it here directly rather than going to cost_logic
10535 since we know the immediate generated for the TST is valid
10536 so we can avoid creating an intermediate rtx for it only
10537 for costing purposes. */
10538 if (speed)
10539 *cost += extra_cost->alu.logical;
10540
10541 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10542 ZERO_EXTRACT, 0, speed);
10543 return true;
10544 }
10545
10546 if (GET_CODE (op1) == NEG)
10547 {
10548 /* CMN. */
10549 if (speed)
10550 *cost += extra_cost->alu.arith;
10551
10552 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10553 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10554 return true;
10555 }
10556
10557 /* CMP.
10558
10559 Compare can freely swap the order of operands, and
10560 canonicalization puts the more complex operation first.
10561 But the integer MINUS logic expects the shift/extend
10562 operation in op1. */
10563 if (! (REG_P (op0)
10564 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10565 {
10566 op0 = XEXP (x, 1);
10567 op1 = XEXP (x, 0);
10568 }
10569 goto cost_minus;
10570 }
10571
10572 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10573 {
10574 /* FCMP. */
10575 if (speed)
10576 *cost += extra_cost->fp[mode == DFmode].compare;
10577
10578 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10579 {
10580 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10581 /* FCMP supports constant 0.0 for no extra cost. */
10582 return true;
10583 }
10584 return false;
10585 }
10586
10587 if (VECTOR_MODE_P (mode))
10588 {
10589 /* Vector compare. */
10590 if (speed)
10591 *cost += extra_cost->vect.alu;
10592
10593 if (aarch64_float_const_zero_rtx_p (op1))
10594 {
10595 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10596 cost. */
10597 return true;
10598 }
10599 return false;
10600 }
10601 return false;
10602
10603 case MINUS:
10604 {
10605 op0 = XEXP (x, 0);
10606 op1 = XEXP (x, 1);
10607
10608 cost_minus:
10609 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10610
10611 /* Detect valid immediates. */
10612 if ((GET_MODE_CLASS (mode) == MODE_INT
10613 || (GET_MODE_CLASS (mode) == MODE_CC
10614 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10615 && CONST_INT_P (op1)
10616 && aarch64_uimm12_shift (INTVAL (op1)))
10617 {
10618 if (speed)
10619 /* SUB(S) (immediate). */
10620 *cost += extra_cost->alu.arith;
10621 return true;
10622 }
10623
10624 /* Look for SUB (extended register). */
10625 if (is_a <scalar_int_mode> (mode, &int_mode)
10626 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10627 {
10628 if (speed)
10629 *cost += extra_cost->alu.extend_arith;
10630
10631 op1 = aarch64_strip_extend (op1, true);
10632 *cost += rtx_cost (op1, VOIDmode,
10633 (enum rtx_code) GET_CODE (op1), 0, speed);
10634 return true;
10635 }
10636
10637 rtx new_op1 = aarch64_strip_extend (op1, false);
10638
10639 /* Cost this as an FMA-alike operation. */
10640 if ((GET_CODE (new_op1) == MULT
10641 || aarch64_shift_p (GET_CODE (new_op1)))
10642 && code != COMPARE)
10643 {
10644 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10645 (enum rtx_code) code,
10646 speed);
10647 return true;
10648 }
10649
10650 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10651
10652 if (speed)
10653 {
10654 if (VECTOR_MODE_P (mode))
10655 {
10656 /* Vector SUB. */
10657 *cost += extra_cost->vect.alu;
10658 }
10659 else if (GET_MODE_CLASS (mode) == MODE_INT)
10660 {
10661 /* SUB(S). */
10662 *cost += extra_cost->alu.arith;
10663 }
10664 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10665 {
10666 /* FSUB. */
10667 *cost += extra_cost->fp[mode == DFmode].addsub;
10668 }
10669 }
10670 return true;
10671 }
10672
10673 case PLUS:
10674 {
10675 rtx new_op0;
10676
10677 op0 = XEXP (x, 0);
10678 op1 = XEXP (x, 1);
10679
10680 cost_plus:
10681 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10682 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10683 {
10684 /* CSINC. */
10685 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10686 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10687 return true;
10688 }
10689
10690 if (GET_MODE_CLASS (mode) == MODE_INT
10691 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10692 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10693 {
10694 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10695
10696 if (speed)
10697 /* ADD (immediate). */
10698 *cost += extra_cost->alu.arith;
10699 return true;
10700 }
10701
10702 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10703
10704 /* Look for ADD (extended register). */
10705 if (is_a <scalar_int_mode> (mode, &int_mode)
10706 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10707 {
10708 if (speed)
10709 *cost += extra_cost->alu.extend_arith;
10710
10711 op0 = aarch64_strip_extend (op0, true);
10712 *cost += rtx_cost (op0, VOIDmode,
10713 (enum rtx_code) GET_CODE (op0), 0, speed);
10714 return true;
10715 }
10716
10717 /* Strip any extend, leave shifts behind as we will
10718 cost them through mult_cost. */
10719 new_op0 = aarch64_strip_extend (op0, false);
10720
10721 if (GET_CODE (new_op0) == MULT
10722 || aarch64_shift_p (GET_CODE (new_op0)))
10723 {
10724 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10725 speed);
10726 return true;
10727 }
10728
10729 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10730
10731 if (speed)
10732 {
10733 if (VECTOR_MODE_P (mode))
10734 {
10735 /* Vector ADD. */
10736 *cost += extra_cost->vect.alu;
10737 }
10738 else if (GET_MODE_CLASS (mode) == MODE_INT)
10739 {
10740 /* ADD. */
10741 *cost += extra_cost->alu.arith;
10742 }
10743 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10744 {
10745 /* FADD. */
10746 *cost += extra_cost->fp[mode == DFmode].addsub;
10747 }
10748 }
10749 return true;
10750 }
10751
10752 case BSWAP:
10753 *cost = COSTS_N_INSNS (1);
10754
10755 if (speed)
10756 {
10757 if (VECTOR_MODE_P (mode))
10758 *cost += extra_cost->vect.alu;
10759 else
10760 *cost += extra_cost->alu.rev;
10761 }
10762 return false;
10763
10764 case IOR:
10765 if (aarch_rev16_p (x))
10766 {
10767 *cost = COSTS_N_INSNS (1);
10768
10769 if (speed)
10770 {
10771 if (VECTOR_MODE_P (mode))
10772 *cost += extra_cost->vect.alu;
10773 else
10774 *cost += extra_cost->alu.rev;
10775 }
10776 return true;
10777 }
10778
10779 if (aarch64_extr_rtx_p (x, &op0, &op1))
10780 {
10781 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10782 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10783 if (speed)
10784 *cost += extra_cost->alu.shift;
10785
10786 return true;
10787 }
10788 /* Fall through. */
10789 case XOR:
10790 case AND:
10791 cost_logic:
10792 op0 = XEXP (x, 0);
10793 op1 = XEXP (x, 1);
10794
10795 if (VECTOR_MODE_P (mode))
10796 {
10797 if (speed)
10798 *cost += extra_cost->vect.alu;
10799 return true;
10800 }
10801
10802 if (code == AND
10803 && GET_CODE (op0) == MULT
10804 && CONST_INT_P (XEXP (op0, 1))
10805 && CONST_INT_P (op1)
10806 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10807 INTVAL (op1)) != 0)
10808 {
10809 /* This is a UBFM/SBFM. */
10810 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10811 if (speed)
10812 *cost += extra_cost->alu.bfx;
10813 return true;
10814 }
10815
10816 if (is_int_mode (mode, &int_mode))
10817 {
10818 if (CONST_INT_P (op1))
10819 {
10820 /* We have a mask + shift version of a UBFIZ
10821 i.e. the *andim_ashift<mode>_bfiz pattern. */
10822 if (GET_CODE (op0) == ASHIFT
10823 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10824 XEXP (op0, 1)))
10825 {
10826 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10827 (enum rtx_code) code, 0, speed);
10828 if (speed)
10829 *cost += extra_cost->alu.bfx;
10830
10831 return true;
10832 }
10833 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10834 {
10835 /* We may get the immediate for free; this is not
10836 modelled. */
10837 *cost += rtx_cost (op0, int_mode,
10838 (enum rtx_code) code, 0, speed);
10839 if (speed)
10840 *cost += extra_cost->alu.logical;
10841
10842 return true;
10843 }
10844 }
10845 else
10846 {
10847 rtx new_op0 = op0;
10848
10849 /* Handle ORN, EON, or BIC. */
10850 if (GET_CODE (op0) == NOT)
10851 op0 = XEXP (op0, 0);
10852
10853 new_op0 = aarch64_strip_shift (op0);
10854
10855 /* If we had a shift on op0 then this is a logical-shift-
10856 by-register/immediate operation. Otherwise, this is just
10857 a logical operation. */
10858 if (speed)
10859 {
10860 if (new_op0 != op0)
10861 {
10862 /* Shift by immediate. */
10863 if (CONST_INT_P (XEXP (op0, 1)))
10864 *cost += extra_cost->alu.log_shift;
10865 else
10866 *cost += extra_cost->alu.log_shift_reg;
10867 }
10868 else
10869 *cost += extra_cost->alu.logical;
10870 }
10871
10872 /* In both cases we want to cost both operands. */
10873 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10874 0, speed);
10875 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10876 1, speed);
10877
10878 return true;
10879 }
10880 }
10881 return false;
10882
10883 case NOT:
10884 x = XEXP (x, 0);
10885 op0 = aarch64_strip_shift (x);
10886
10887 if (VECTOR_MODE_P (mode))
10888 {
10889 /* Vector NOT. */
10890 *cost += extra_cost->vect.alu;
10891 return false;
10892 }
10893
10894 /* MVN-shifted-reg. */
10895 if (op0 != x)
10896 {
10897 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10898
10899 if (speed)
10900 *cost += extra_cost->alu.log_shift;
10901
10902 return true;
10903 }
10904 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10905 Handle the second form here taking care that 'a' in the above can
10906 be a shift. */
10907 else if (GET_CODE (op0) == XOR)
10908 {
10909 rtx newop0 = XEXP (op0, 0);
10910 rtx newop1 = XEXP (op0, 1);
10911 rtx op0_stripped = aarch64_strip_shift (newop0);
10912
10913 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10914 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10915
10916 if (speed)
10917 {
10918 if (op0_stripped != newop0)
10919 *cost += extra_cost->alu.log_shift;
10920 else
10921 *cost += extra_cost->alu.logical;
10922 }
10923
10924 return true;
10925 }
10926 /* MVN. */
10927 if (speed)
10928 *cost += extra_cost->alu.logical;
10929
10930 return false;
10931
10932 case ZERO_EXTEND:
10933
10934 op0 = XEXP (x, 0);
10935 /* If a value is written in SI mode, then zero extended to DI
10936 mode, the operation will in general be free as a write to
10937 a 'w' register implicitly zeroes the upper bits of an 'x'
10938 register. However, if this is
10939
10940 (set (reg) (zero_extend (reg)))
10941
10942 we must cost the explicit register move. */
10943 if (mode == DImode
10944 && GET_MODE (op0) == SImode
10945 && outer == SET)
10946 {
10947 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10948
10949 /* If OP_COST is non-zero, then the cost of the zero extend
10950 is effectively the cost of the inner operation. Otherwise
10951 we have a MOV instruction and we take the cost from the MOV
10952 itself. This is true independently of whether we are
10953 optimizing for space or time. */
10954 if (op_cost)
10955 *cost = op_cost;
10956
10957 return true;
10958 }
10959 else if (MEM_P (op0))
10960 {
10961 /* All loads can zero extend to any size for free. */
10962 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10963 return true;
10964 }
10965
10966 op0 = aarch64_extend_bitfield_pattern_p (x);
10967 if (op0)
10968 {
10969 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10970 if (speed)
10971 *cost += extra_cost->alu.bfx;
10972 return true;
10973 }
10974
10975 if (speed)
10976 {
10977 if (VECTOR_MODE_P (mode))
10978 {
10979 /* UMOV. */
10980 *cost += extra_cost->vect.alu;
10981 }
10982 else
10983 {
10984 /* We generate an AND instead of UXTB/UXTH. */
10985 *cost += extra_cost->alu.logical;
10986 }
10987 }
10988 return false;
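/* For illustration (an assumed example, not from the original source):
   because a W-register write implicitly zeroes the upper 32 bits of the
   X register, a sequence such as

     ldr w0, [x1]     // SImode load, already zero-extended to DImode
     add w0, w0, w2   // result usable as DImode with no extra insn

   needs no separate extension, whereas a bare
   (set (reg:DI) (zero_extend:DI (reg:SI))) must be costed as the explicit
   "mov w0, w1" move, as handled above.  */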
10989
10990 case SIGN_EXTEND:
10991 if (MEM_P (XEXP (x, 0)))
10992 {
10993 /* LDRSH. */
10994 if (speed)
10995 {
10996 rtx address = XEXP (XEXP (x, 0), 0);
10997 *cost += extra_cost->ldst.load_sign_extend;
10998
10999 *cost +=
11000 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11001 0, speed));
11002 }
11003 return true;
11004 }
11005
11006 op0 = aarch64_extend_bitfield_pattern_p (x);
11007 if (op0)
11008 {
11009 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11010 if (speed)
11011 *cost += extra_cost->alu.bfx;
11012 return true;
11013 }
11014
11015 if (speed)
11016 {
11017 if (VECTOR_MODE_P (mode))
11018 *cost += extra_cost->vect.alu;
11019 else
11020 *cost += extra_cost->alu.extend;
11021 }
11022 return false;
11023
11024 case ASHIFT:
11025 op0 = XEXP (x, 0);
11026 op1 = XEXP (x, 1);
11027
11028 if (CONST_INT_P (op1))
11029 {
11030 if (speed)
11031 {
11032 if (VECTOR_MODE_P (mode))
11033 {
11034 /* Vector shift (immediate). */
11035 *cost += extra_cost->vect.alu;
11036 }
11037 else
11038 {
11039 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
11040 aliases. */
11041 *cost += extra_cost->alu.shift;
11042 }
11043 }
11044
11045 /* We can incorporate zero/sign extend for free. */
11046 if (GET_CODE (op0) == ZERO_EXTEND
11047 || GET_CODE (op0) == SIGN_EXTEND)
11048 op0 = XEXP (op0, 0);
11049
11050 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
11051 return true;
11052 }
11053 else
11054 {
11055 if (VECTOR_MODE_P (mode))
11056 {
11057 if (speed)
11058 /* Vector shift (register). */
11059 *cost += extra_cost->vect.alu;
11060 }
11061 else
11062 {
11063 if (speed)
11064 /* LSLV. */
11065 *cost += extra_cost->alu.shift_reg;
11066
11067 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11068 && CONST_INT_P (XEXP (op1, 1))
11069 && known_eq (INTVAL (XEXP (op1, 1)),
11070 GET_MODE_BITSIZE (mode) - 1))
11071 {
11072 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11073 /* We already demanded XEXP (op1, 0) to be REG_P, so
11074 don't recurse into it. */
11075 return true;
11076 }
11077 }
11078 return false; /* All arguments need to be in registers. */
11079 }
11080
11081 case ROTATE:
11082 case ROTATERT:
11083 case LSHIFTRT:
11084 case ASHIFTRT:
11085 op0 = XEXP (x, 0);
11086 op1 = XEXP (x, 1);
11087
11088 if (CONST_INT_P (op1))
11089 {
11090 /* ASR (immediate) and friends. */
11091 if (speed)
11092 {
11093 if (VECTOR_MODE_P (mode))
11094 *cost += extra_cost->vect.alu;
11095 else
11096 *cost += extra_cost->alu.shift;
11097 }
11098
11099 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11100 return true;
11101 }
11102 else
11103 {
11104 if (VECTOR_MODE_P (mode))
11105 {
11106 if (speed)
11107 /* Vector shift (register). */
11108 *cost += extra_cost->vect.alu;
11109 }
11110 else
11111 {
11112 if (speed)
11113 /* ASR (register) and friends. */
11114 *cost += extra_cost->alu.shift_reg;
11115
11116 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
11117 && CONST_INT_P (XEXP (op1, 1))
11118 && known_eq (INTVAL (XEXP (op1, 1)),
11119 GET_MODE_BITSIZE (mode) - 1))
11120 {
11121 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
11122 /* We already demanded XEXP (op1, 0) to be REG_P, so
11123 don't recurse into it. */
11124 return true;
11125 }
11126 }
11127 return false; /* All arguments need to be in registers. */
11128 }
11129
11130 case SYMBOL_REF:
11131
11132 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
11133 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
11134 {
11135 /* LDR. */
11136 if (speed)
11137 *cost += extra_cost->ldst.load;
11138 }
11139 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
11140 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
11141 {
11142 /* ADRP, followed by ADD. */
11143 *cost += COSTS_N_INSNS (1);
11144 if (speed)
11145 *cost += 2 * extra_cost->alu.arith;
11146 }
11147 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
11148 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11149 {
11150 /* ADR. */
11151 if (speed)
11152 *cost += extra_cost->alu.arith;
11153 }
11154
11155 if (flag_pic)
11156 {
11157 /* One extra load instruction, after accessing the GOT. */
11158 *cost += COSTS_N_INSNS (1);
11159 if (speed)
11160 *cost += extra_cost->ldst.load;
11161 }
11162 return true;
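/* For illustration (a hedged sketch, registers chosen arbitrarily): under
   the small code model a symbol address is typically materialized as

     adrp x0, sym
     add  x0, x0, :lo12:sym

   hence the extra COSTS_N_INSNS (1) plus two arith costs above; with PIC
   the address is additionally loaded from the GOT, hence the extra load
   cost when flag_pic is set.  */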
11163
11164 case HIGH:
11165 case LO_SUM:
11166 /* ADRP/ADD (immediate). */
11167 if (speed)
11168 *cost += extra_cost->alu.arith;
11169 return true;
11170
11171 case ZERO_EXTRACT:
11172 case SIGN_EXTRACT:
11173 /* UBFX/SBFX. */
11174 if (speed)
11175 {
11176 if (VECTOR_MODE_P (mode))
11177 *cost += extra_cost->vect.alu;
11178 else
11179 *cost += extra_cost->alu.bfx;
11180 }
11181
11182 /* We can trust that the immediates used will be correct (there
11183 are no by-register forms), so we need only cost op0. */
11184 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
11185 return true;
11186
11187 case MULT:
11188 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
11189 /* aarch64_rtx_mult_cost always handles recursion to its
11190 operands. */
11191 return true;
11192
11193 case MOD:
11194 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
11195 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same
11196 as that of an unconditional negate. This case should only ever be
11197 reached through the set_smod_pow2_cheap check in expmed.c. */
11198 if (CONST_INT_P (XEXP (x, 1))
11199 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
11200 && (mode == SImode || mode == DImode))
11201 {
11202 /* We expand to 4 instructions. Reset the baseline. */
11203 *cost = COSTS_N_INSNS (4);
11204
11205 if (speed)
11206 *cost += 2 * extra_cost->alu.logical
11207 + 2 * extra_cost->alu.arith;
11208
11209 return true;
11210 }
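/* For illustration (a sketch, assuming SImode and x % 8): the expansion
   described above is roughly

     negs  w1, w0             // w1 = -x, set flags on -x
     and   w0, w0, #7         // positive-dividend case: x & 7
     and   w1, w1, #7         // negated-dividend case: (-x) & 7
     csneg w0, w0, w1, mi     // x > 0 ? (x & 7) : -((-x) & 7)

   i.e. four instructions, matching the COSTS_N_INSNS (4) baseline.  */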
11211
11212 /* Fall-through. */
11213 case UMOD:
11214 if (speed)
11215 {
11216 /* Slightly prefer UMOD over SMOD. */
11217 if (VECTOR_MODE_P (mode))
11218 *cost += extra_cost->vect.alu;
11219 else if (GET_MODE_CLASS (mode) == MODE_INT)
11220 *cost += (extra_cost->mult[mode == DImode].add
11221 + extra_cost->mult[mode == DImode].idiv
11222 + (code == MOD ? 1 : 0));
11223 }
11224 return false; /* All arguments need to be in registers. */
11225
11226 case DIV:
11227 case UDIV:
11228 case SQRT:
11229 if (speed)
11230 {
11231 if (VECTOR_MODE_P (mode))
11232 *cost += extra_cost->vect.alu;
11233 else if (GET_MODE_CLASS (mode) == MODE_INT)
11234 /* There is no integer SQRT, so only DIV and UDIV can get
11235 here. */
11236 *cost += (extra_cost->mult[mode == DImode].idiv
11237 /* Slightly prefer UDIV over SDIV. */
11238 + (code == DIV ? 1 : 0));
11239 else
11240 *cost += extra_cost->fp[mode == DFmode].div;
11241 }
11242 return false; /* All arguments need to be in registers. */
11243
11244 case IF_THEN_ELSE:
11245 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
11246 XEXP (x, 2), cost, speed);
11247
11248 case EQ:
11249 case NE:
11250 case GT:
11251 case GTU:
11252 case LT:
11253 case LTU:
11254 case GE:
11255 case GEU:
11256 case LE:
11257 case LEU:
11258
11259 return false; /* All arguments must be in registers. */
11260
11261 case FMA:
11262 op0 = XEXP (x, 0);
11263 op1 = XEXP (x, 1);
11264 op2 = XEXP (x, 2);
11265
11266 if (speed)
11267 {
11268 if (VECTOR_MODE_P (mode))
11269 *cost += extra_cost->vect.alu;
11270 else
11271 *cost += extra_cost->fp[mode == DFmode].fma;
11272 }
11273
11274 /* FMSUB, FNMADD, and FNMSUB are free. */
11275 if (GET_CODE (op0) == NEG)
11276 op0 = XEXP (op0, 0);
11277
11278 if (GET_CODE (op2) == NEG)
11279 op2 = XEXP (op2, 0);
11280
11281 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11282 and the by-element operand as operand 0. */
11283 if (GET_CODE (op1) == NEG)
11284 op1 = XEXP (op1, 0);
11285
11286 /* Catch vector-by-element operations. The by-element operand can
11287 either be (vec_duplicate (vec_select (x))) or just
11288 (vec_select (x)), depending on whether we are multiplying by
11289 a vector or a scalar.
11290
11291 Canonicalization is not very good in these cases: FMA4 will put the
11292 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11293 if (GET_CODE (op0) == VEC_DUPLICATE)
11294 op0 = XEXP (op0, 0);
11295 else if (GET_CODE (op1) == VEC_DUPLICATE)
11296 op1 = XEXP (op1, 0);
11297
11298 if (GET_CODE (op0) == VEC_SELECT)
11299 op0 = XEXP (op0, 0);
11300 else if (GET_CODE (op1) == VEC_SELECT)
11301 op1 = XEXP (op1, 0);
11302
11303 /* If the remaining parameters are not registers,
11304 get the cost to put them into registers. */
11305 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11306 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11307 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11308 return true;
11309
11310 case FLOAT:
11311 case UNSIGNED_FLOAT:
11312 if (speed)
11313 *cost += extra_cost->fp[mode == DFmode].fromint;
11314 return false;
11315
11316 case FLOAT_EXTEND:
11317 if (speed)
11318 {
11319 if (VECTOR_MODE_P (mode))
11320 {
11321 /* Vector widening float conversion. */
11322 *cost += extra_cost->vect.alu;
11323 }
11324 else
11325 *cost += extra_cost->fp[mode == DFmode].widen;
11326 }
11327 return false;
11328
11329 case FLOAT_TRUNCATE:
11330 if (speed)
11331 {
11332 if (VECTOR_MODE_P (mode))
11333 {
11334 /* Vector narrowing float conversion. */
11335 *cost += extra_cost->vect.alu;
11336 }
11337 else
11338 *cost += extra_cost->fp[mode == DFmode].narrow;
11339 }
11340 return false;
11341
11342 case FIX:
11343 case UNSIGNED_FIX:
11344 x = XEXP (x, 0);
11345 /* Strip the rounding part. They will all be implemented
11346 by the fcvt* family of instructions anyway. */
11347 if (GET_CODE (x) == UNSPEC)
11348 {
11349 unsigned int uns_code = XINT (x, 1);
11350
11351 if (uns_code == UNSPEC_FRINTA
11352 || uns_code == UNSPEC_FRINTM
11353 || uns_code == UNSPEC_FRINTN
11354 || uns_code == UNSPEC_FRINTP
11355 || uns_code == UNSPEC_FRINTZ)
11356 x = XVECEXP (x, 0, 0);
11357 }
11358
11359 if (speed)
11360 {
11361 if (VECTOR_MODE_P (mode))
11362 *cost += extra_cost->vect.alu;
11363 else
11364 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11365 }
11366
11367 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11368 fixed-point fcvt. */
11369 if (GET_CODE (x) == MULT
11370 && ((VECTOR_MODE_P (mode)
11371 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11372 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11373 {
11374 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11375 0, speed);
11376 return true;
11377 }
11378
11379 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11380 return true;
11381
11382 case ABS:
11383 if (VECTOR_MODE_P (mode))
11384 {
11385 /* ABS (vector). */
11386 if (speed)
11387 *cost += extra_cost->vect.alu;
11388 }
11389 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11390 {
11391 op0 = XEXP (x, 0);
11392
11393 /* FABD, which is analogous to FADD. */
11394 if (GET_CODE (op0) == MINUS)
11395 {
11396 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11397 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11398 if (speed)
11399 *cost += extra_cost->fp[mode == DFmode].addsub;
11400
11401 return true;
11402 }
11403 /* Simple FABS is analogous to FNEG. */
11404 if (speed)
11405 *cost += extra_cost->fp[mode == DFmode].neg;
11406 }
11407 else
11408 {
11409 /* Integer ABS will either be split into
11410 two arithmetic instructions, or will be an ABS
11411 (scalar), which we don't model. */
11412 *cost = COSTS_N_INSNS (2);
11413 if (speed)
11414 *cost += 2 * extra_cost->alu.arith;
11415 }
11416 return false;
11417
11418 case SMAX:
11419 case SMIN:
11420 if (speed)
11421 {
11422 if (VECTOR_MODE_P (mode))
11423 *cost += extra_cost->vect.alu;
11424 else
11425 {
11426 /* FMAXNM/FMINNM/FMAX/FMIN.
11427 TODO: This may not be accurate for all implementations, but
11428 we do not model this in the cost tables. */
11429 *cost += extra_cost->fp[mode == DFmode].addsub;
11430 }
11431 }
11432 return false;
11433
11434 case UNSPEC:
11435 /* The floating point round to integer frint* instructions. */
11436 if (aarch64_frint_unspec_p (XINT (x, 1)))
11437 {
11438 if (speed)
11439 *cost += extra_cost->fp[mode == DFmode].roundint;
11440
11441 return false;
11442 }
11443
11444 if (XINT (x, 1) == UNSPEC_RBIT)
11445 {
11446 if (speed)
11447 *cost += extra_cost->alu.rev;
11448
11449 return false;
11450 }
11451 break;
11452
11453 case TRUNCATE:
11454
11455 /* Decompose <su>muldi3_highpart. */
11456 if (/* (truncate:DI */
11457 mode == DImode
11458 /* (lshiftrt:TI */
11459 && GET_MODE (XEXP (x, 0)) == TImode
11460 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11461 /* (mult:TI */
11462 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11463 /* (ANY_EXTEND:TI (reg:DI))
11464 (ANY_EXTEND:TI (reg:DI))) */
11465 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11466 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11467 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11468 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11469 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11470 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11471 /* (const_int 64) */
11472 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11473 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11474 {
11475 /* UMULH/SMULH. */
11476 if (speed)
11477 *cost += extra_cost->mult[mode == DImode].extend;
11478 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11479 mode, MULT, 0, speed);
11480 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11481 mode, MULT, 1, speed);
11482 return true;
11483 }
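/* For reference, assembling the piecewise checks above: the matched RTL has
   the shape

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (sign_extend:TI (reg:DI x))
                  (sign_extend:TI (reg:DI y)))
         (const_int 64)))

   (or the ZERO_EXTEND variant), i.e. the high 64 bits of a widening
   64x64->128-bit multiply, which maps to a single SMULH/UMULH.  */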
11484
11485 /* Fall through. */
11486 default:
11487 break;
11488 }
11489
11490 if (dump_file
11491 && flag_aarch64_verbose_cost)
11492 fprintf (dump_file,
11493 "\nFailed to cost RTX. Assuming default cost.\n");
11494
11495 return true;
11496 }
11497
11498 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
11499 calculated for X. This cost is stored in *COST. Returns true
11500 if the total cost of X was calculated. */
11501 static bool
11502 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11503 int param, int *cost, bool speed)
11504 {
11505 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11506
11507 if (dump_file
11508 && flag_aarch64_verbose_cost)
11509 {
11510 print_rtl_single (dump_file, x);
11511 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11512 speed ? "Hot" : "Cold",
11513 *cost, result ? "final" : "partial");
11514 }
11515
11516 return result;
11517 }
11518
11519 static int
11520 aarch64_register_move_cost (machine_mode mode,
11521 reg_class_t from_i, reg_class_t to_i)
11522 {
11523 enum reg_class from = (enum reg_class) from_i;
11524 enum reg_class to = (enum reg_class) to_i;
11525 const struct cpu_regmove_cost *regmove_cost
11526 = aarch64_tune_params.regmove_cost;
11527
11528 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11529 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11530 to = GENERAL_REGS;
11531
11532 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11533 from = GENERAL_REGS;
11534
11535 /* Moving between a GPR and the stack register costs the same as GP2GP. */
11536 if ((from == GENERAL_REGS && to == STACK_REG)
11537 || (to == GENERAL_REGS && from == STACK_REG))
11538 return regmove_cost->GP2GP;
11539
11540 /* To/From the stack register, we move via the gprs. */
11541 if (to == STACK_REG || from == STACK_REG)
11542 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11543 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11544
11545 if (known_eq (GET_MODE_SIZE (mode), 16))
11546 {
11547 /* 128-bit operations on general registers require 2 instructions. */
11548 if (from == GENERAL_REGS && to == GENERAL_REGS)
11549 return regmove_cost->GP2GP * 2;
11550 else if (from == GENERAL_REGS)
11551 return regmove_cost->GP2FP * 2;
11552 else if (to == GENERAL_REGS)
11553 return regmove_cost->FP2GP * 2;
11554
11555 /* When AdvSIMD instructions are disabled it is not possible to move
11556 a 128-bit value directly between Q registers. This is handled in
11557 secondary reload. A general register is used as a scratch to move
11558 the upper DI value and the lower DI value is moved directly,
11559 hence the cost is the sum of three moves. */
11560 if (! TARGET_SIMD)
11561 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11562
11563 return regmove_cost->FP2FP;
11564 }
11565
11566 if (from == GENERAL_REGS && to == GENERAL_REGS)
11567 return regmove_cost->GP2GP;
11568 else if (from == GENERAL_REGS)
11569 return regmove_cost->GP2FP;
11570 else if (to == GENERAL_REGS)
11571 return regmove_cost->FP2GP;
11572
11573 return regmove_cost->FP2FP;
11574 }
11575
11576 static int
11577 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11578 reg_class_t rclass ATTRIBUTE_UNUSED,
11579 bool in ATTRIBUTE_UNUSED)
11580 {
11581 return aarch64_tune_params.memmov_cost;
11582 }
11583
11584 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11585 to optimize 1.0/sqrt. */
11586
11587 static bool
11588 use_rsqrt_p (machine_mode mode)
11589 {
11590 return (!flag_trapping_math
11591 && flag_unsafe_math_optimizations
11592 && ((aarch64_tune_params.approx_modes->recip_sqrt
11593 & AARCH64_APPROX_MODE (mode))
11594 || flag_mrecip_low_precision_sqrt));
11595 }
11596
11597 /* Function to decide when to use the approximate reciprocal square root
11598 builtin. */
11599
11600 static tree
11601 aarch64_builtin_reciprocal (tree fndecl)
11602 {
11603 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11604
11605 if (!use_rsqrt_p (mode))
11606 return NULL_TREE;
11607 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11608 }
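/* For illustration (a sketch of the intended effect, not a verbatim
   transformation): with -funsafe-math-optimizations, no trapping math and a
   tuning (or -mlow-precision-recip-sqrt) that approves the mode, an
   expression such as 1.0f / __builtin_sqrtf (x) can be rewritten to use the
   rsqrt builtin returned here, which expands via FRSQRTE plus Newton-Raphson
   steps instead of an FSQRT followed by an FDIV.  */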
11609
11610 /* Emit instruction sequence to compute either the approximate square root
11611 or its approximate reciprocal, depending on the flag RECP, and return
11612 whether the sequence was emitted or not. */
11613
11614 bool
11615 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11616 {
11617 machine_mode mode = GET_MODE (dst);
11618
11619 if (GET_MODE_INNER (mode) == HFmode)
11620 {
11621 gcc_assert (!recp);
11622 return false;
11623 }
11624
11625 if (!recp)
11626 {
11627 if (!(flag_mlow_precision_sqrt
11628 || (aarch64_tune_params.approx_modes->sqrt
11629 & AARCH64_APPROX_MODE (mode))))
11630 return false;
11631
11632 if (flag_finite_math_only
11633 || flag_trapping_math
11634 || !flag_unsafe_math_optimizations
11635 || optimize_function_for_size_p (cfun))
11636 return false;
11637 }
11638 else
11639 /* Caller assumes we cannot fail. */
11640 gcc_assert (use_rsqrt_p (mode));
11641
11642 machine_mode mmsk = mode_for_int_vector (mode).require ();
11643 rtx xmsk = gen_reg_rtx (mmsk);
11644 if (!recp)
11645 /* When calculating the approximate square root, compare the
11646 argument with 0.0 and create a mask. */
11647 emit_insn (gen_rtx_SET (xmsk,
11648 gen_rtx_NEG (mmsk,
11649 gen_rtx_EQ (mmsk, src,
11650 CONST0_RTX (mode)))));
11651
11652 /* Estimate the approximate reciprocal square root. */
11653 rtx xdst = gen_reg_rtx (mode);
11654 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11655
11656 /* Iterate over the series twice for SF and thrice for DF. */
11657 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11658
11659 /* Optionally iterate over the series once less for faster performance,
11660 at the cost of some accuracy. */
11661 if ((recp && flag_mrecip_low_precision_sqrt)
11662 || (!recp && flag_mlow_precision_sqrt))
11663 iterations--;
11664
11665 /* Iterate over the series to calculate the approximate reciprocal square
11666 root. */
11667 rtx x1 = gen_reg_rtx (mode);
11668 while (iterations--)
11669 {
11670 rtx x2 = gen_reg_rtx (mode);
11671 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11672
11673 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11674
11675 if (iterations > 0)
11676 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11677 }
11678
11679 if (!recp)
11680 {
11681 /* Qualify the approximate reciprocal square root when the argument is
11682 0.0 by squashing the intermediate result to 0.0. */
11683 rtx xtmp = gen_reg_rtx (mmsk);
11684 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11685 gen_rtx_SUBREG (mmsk, xdst, 0)));
11686 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11687
11688 /* Calculate the approximate square root. */
11689 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11690 }
11691
11692 /* Finalize the approximation. */
11693 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11694
11695 return true;
11696 }
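/* For reference, a worked form of the iteration emitted above: FRSQRTE
   provides an initial estimate x0 ~= 1/sqrt(d) and each FRSQRTS step
   computes (3 - d * xn * xn) / 2, so the refinement is the Newton-Raphson
   iteration

     x(n+1) = xn * (3 - d * xn^2) / 2

   run twice for SF and three times for DF (one fewer with the low-precision
   flags).  For the square root itself the final estimate is also multiplied
   by d, since sqrt(d) = d * (1/sqrt(d)), with the zero-input case masked
   off as above.  */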
11697
11698 /* Emit the instruction sequence to compute the approximation for the division
11699 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11700
11701 bool
11702 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11703 {
11704 machine_mode mode = GET_MODE (quo);
11705
11706 if (GET_MODE_INNER (mode) == HFmode)
11707 return false;
11708
11709 bool use_approx_division_p = (flag_mlow_precision_div
11710 || (aarch64_tune_params.approx_modes->division
11711 & AARCH64_APPROX_MODE (mode)));
11712
11713 if (!flag_finite_math_only
11714 || flag_trapping_math
11715 || !flag_unsafe_math_optimizations
11716 || optimize_function_for_size_p (cfun)
11717 || !use_approx_division_p)
11718 return false;
11719
11720 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11721 return false;
11722
11723 /* Estimate the approximate reciprocal. */
11724 rtx xrcp = gen_reg_rtx (mode);
11725 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11726
11727 /* Iterate over the series twice for SF and thrice for DF. */
11728 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
11729
11730 /* Optionally iterate over the series once less for faster performance,
11731 at the cost of some accuracy. */
11732 if (flag_mlow_precision_div)
11733 iterations--;
11734
11735 /* Iterate over the series to calculate the approximate reciprocal. */
11736 rtx xtmp = gen_reg_rtx (mode);
11737 while (iterations--)
11738 {
11739 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11740
11741 if (iterations > 0)
11742 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11743 }
11744
11745 if (num != CONST1_RTX (mode))
11746 {
11747 /* As the approximate reciprocal of DEN is already calculated, only
11748 calculate the approximate division when NUM is not 1.0. */
11749 rtx xnum = force_reg (mode, num);
11750 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11751 }
11752
11753 /* Finalize the approximation. */
11754 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11755 return true;
11756 }
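/* For reference, a worked form of the iteration emitted above: FRECPE
   provides an initial estimate x0 ~= 1/den and each FRECPS step computes
   2 - den * xn, so the refinement is the Newton-Raphson iteration

     x(n+1) = xn * (2 - den * xn)

   run twice for SF and three times for DF (one fewer with
   -mlow-precision-div).  The quotient is then num * (1/den), with the final
   multiply by the last FRECPS result folded into the assignment to QUO.  */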
11757
11758 /* Return the number of instructions that can be issued per cycle. */
11759 static int
11760 aarch64_sched_issue_rate (void)
11761 {
11762 return aarch64_tune_params.issue_rate;
11763 }
11764
11765 static int
11766 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11767 {
11768 int issue_rate = aarch64_sched_issue_rate ();
11769
11770 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11771 }
11772
11773
11774 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11775 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11776 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11777
11778 static int
11779 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11780 int ready_index)
11781 {
11782 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11783 }
11784
11785
11786 /* Vectorizer cost model target hooks. */
11787
11788 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11789 static int
11790 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11791 tree vectype,
11792 int misalign ATTRIBUTE_UNUSED)
11793 {
11794 unsigned elements;
11795 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11796 bool fp = false;
11797
11798 if (vectype != NULL)
11799 fp = FLOAT_TYPE_P (vectype);
11800
11801 switch (type_of_cost)
11802 {
11803 case scalar_stmt:
11804 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11805
11806 case scalar_load:
11807 return costs->scalar_load_cost;
11808
11809 case scalar_store:
11810 return costs->scalar_store_cost;
11811
11812 case vector_stmt:
11813 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11814
11815 case vector_load:
11816 return costs->vec_align_load_cost;
11817
11818 case vector_store:
11819 return costs->vec_store_cost;
11820
11821 case vec_to_scalar:
11822 return costs->vec_to_scalar_cost;
11823
11824 case scalar_to_vec:
11825 return costs->scalar_to_vec_cost;
11826
11827 case unaligned_load:
11828 case vector_gather_load:
11829 return costs->vec_unalign_load_cost;
11830
11831 case unaligned_store:
11832 case vector_scatter_store:
11833 return costs->vec_unalign_store_cost;
11834
11835 case cond_branch_taken:
11836 return costs->cond_taken_branch_cost;
11837
11838 case cond_branch_not_taken:
11839 return costs->cond_not_taken_branch_cost;
11840
11841 case vec_perm:
11842 return costs->vec_permute_cost;
11843
11844 case vec_promote_demote:
11845 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11846
11847 case vec_construct:
11848 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11849 return elements / 2 + 1;
11850
11851 default:
11852 gcc_unreachable ();
11853 }
11854 }
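/* Worked example for the vec_construct case above: building a vector with
   four elements (TYPE_VECTOR_SUBPARTS == 4) returns a cost of 4 / 2 + 1 = 3;
   for variable-length SVE modes, estimated_poly_value first picks a
   representative element count.  */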
11855
11856 /* Implement targetm.vectorize.add_stmt_cost. */
11857 static unsigned
11858 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11859 struct _stmt_vec_info *stmt_info, int misalign,
11860 enum vect_cost_model_location where)
11861 {
11862 unsigned *cost = (unsigned *) data;
11863 unsigned retval = 0;
11864
11865 if (flag_vect_cost_model)
11866 {
11867 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11868 int stmt_cost =
11869 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11870
11871 /* Statements in an inner loop relative to the loop being
11872 vectorized are weighted more heavily. The value here is
11873 arbitrary and could potentially be improved with analysis. */
11874 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11875 count *= 50; /* FIXME */
11876
11877 retval = (unsigned) (count * stmt_cost);
11878 cost[where] += retval;
11879 }
11880
11881 return retval;
11882 }
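/* Worked example for the hook above: a vector_stmt of cost 1 occurring
   twice (COUNT == 2) in the body of an inner loop relative to the loop
   being vectorized is scaled by 50, so 2 * 50 * 1 = 100 is added to
   cost[vect_body] and returned.  */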
11883
11884 static void initialize_aarch64_code_model (struct gcc_options *);
11885
11886 /* Parse the TO_PARSE string and put the architecture struct that it
11887 selects into RES and the architectural features into ISA_FLAGS.
11888 Return an aarch64_parse_opt_result describing the parse result.
11889 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11890 When the TO_PARSE string contains an invalid extension,
11891 a copy of the string is created and stored to INVALID_EXTENSION. */
11892
11893 static enum aarch64_parse_opt_result
11894 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11895 uint64_t *isa_flags, std::string *invalid_extension)
11896 {
11897 const char *ext;
11898 const struct processor *arch;
11899 size_t len;
11900
11901 ext = strchr (to_parse, '+');
11902
11903 if (ext != NULL)
11904 len = ext - to_parse;
11905 else
11906 len = strlen (to_parse);
11907
11908 if (len == 0)
11909 return AARCH64_PARSE_MISSING_ARG;
11910
11911
11912 /* Loop through the list of supported ARCHes to find a match. */
11913 for (arch = all_architectures; arch->name != NULL; arch++)
11914 {
11915 if (strlen (arch->name) == len
11916 && strncmp (arch->name, to_parse, len) == 0)
11917 {
11918 uint64_t isa_temp = arch->flags;
11919
11920 if (ext != NULL)
11921 {
11922 /* TO_PARSE string contains at least one extension. */
11923 enum aarch64_parse_opt_result ext_res
11924 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11925
11926 if (ext_res != AARCH64_PARSE_OK)
11927 return ext_res;
11928 }
11929 /* Extension parsing was successful. Confirm the result
11930 arch and ISA flags. */
11931 *res = arch;
11932 *isa_flags = isa_temp;
11933 return AARCH64_PARSE_OK;
11934 }
11935 }
11936
11937 /* ARCH name not found in list. */
11938 return AARCH64_PARSE_INVALID_ARG;
11939 }
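/* For illustration (a hypothetical command line): given
   -march=armv8.2-a+crypto, TO_PARSE is "armv8.2-a+crypto"; LEN covers
   "armv8.2-a", which is looked up in all_architectures, and the remainder
   "+crypto" is handed to aarch64_parse_extension to adjust the ISA flags.  */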
11940
11941 /* Parse the TO_PARSE string and put the CPU that it selects into RES and the
11942 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11943 describing the parse result. If there is an error parsing, RES and
11944 ISA_FLAGS are left unchanged.
11945 When the TO_PARSE string contains an invalid extension,
11946 a copy of the string is created and stored to INVALID_EXTENSION. */
11947
11948 static enum aarch64_parse_opt_result
11949 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11950 uint64_t *isa_flags, std::string *invalid_extension)
11951 {
11952 const char *ext;
11953 const struct processor *cpu;
11954 size_t len;
11955
11956 ext = strchr (to_parse, '+');
11957
11958 if (ext != NULL)
11959 len = ext - to_parse;
11960 else
11961 len = strlen (to_parse);
11962
11963 if (len == 0)
11964 return AARCH64_PARSE_MISSING_ARG;
11965
11966
11967 /* Loop through the list of supported CPUs to find a match. */
11968 for (cpu = all_cores; cpu->name != NULL; cpu++)
11969 {
11970 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11971 {
11972 uint64_t isa_temp = cpu->flags;
11973
11974
11975 if (ext != NULL)
11976 {
11977 /* TO_PARSE string contains at least one extension. */
11978 enum aarch64_parse_opt_result ext_res
11979 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11980
11981 if (ext_res != AARCH64_PARSE_OK)
11982 return ext_res;
11983 }
11984 /* Extension parsing was successful. Confirm the result
11985 cpu and ISA flags. */
11986 *res = cpu;
11987 *isa_flags = isa_temp;
11988 return AARCH64_PARSE_OK;
11989 }
11990 }
11991
11992 /* CPU name not found in list. */
11993 return AARCH64_PARSE_INVALID_ARG;
11994 }
11995
11996 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11997 Return an aarch64_parse_opt_result describing the parse result.
11998 If the parsing fails, RES does not change. */
11999
12000 static enum aarch64_parse_opt_result
12001 aarch64_parse_tune (const char *to_parse, const struct processor **res)
12002 {
12003 const struct processor *cpu;
12004
12005 /* Loop through the list of supported CPUs to find a match. */
12006 for (cpu = all_cores; cpu->name != NULL; cpu++)
12007 {
12008 if (strcmp (cpu->name, to_parse) == 0)
12009 {
12010 *res = cpu;
12011 return AARCH64_PARSE_OK;
12012 }
12013 }
12014
12015 /* CPU name not found in list. */
12016 return AARCH64_PARSE_INVALID_ARG;
12017 }
12018
12019 /* Parse TOKEN, which has length LENGTH, to see if it is an option
12020 described in FLAG. If it is, return the index bit for that fusion type.
12021 If not, error (printing OPTION_NAME) and return zero. */
12022
12023 static unsigned int
12024 aarch64_parse_one_option_token (const char *token,
12025 size_t length,
12026 const struct aarch64_flag_desc *flag,
12027 const char *option_name)
12028 {
12029 for (; flag->name != NULL; flag++)
12030 {
12031 if (length == strlen (flag->name)
12032 && !strncmp (flag->name, token, length))
12033 return flag->flag;
12034 }
12035
12036 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
12037 return 0;
12038 }
12039
12040 /* Parse OPTION, which is a '.'-separated list of flags to enable.
12041 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
12042 default state we inherit from the CPU tuning structures. OPTION_NAME
12043 gives the top-level option we are parsing in the -moverride string,
12044 for use in error messages. */
12045
12046 static unsigned int
12047 aarch64_parse_boolean_options (const char *option,
12048 const struct aarch64_flag_desc *flags,
12049 unsigned int initial_state,
12050 const char *option_name)
12051 {
12052 const char separator = '.';
12053 const char* specs = option;
12054 const char* ntoken = option;
12055 unsigned int found_flags = initial_state;
12056
12057 while ((ntoken = strchr (specs, separator)))
12058 {
12059 size_t token_length = ntoken - specs;
12060 unsigned token_ops = aarch64_parse_one_option_token (specs,
12061 token_length,
12062 flags,
12063 option_name);
12064 /* If we find "none" (or, for simplicity's sake, an error) anywhere
12065 in the token stream, reset the supported operations. So:
12066
12067 adrp+add.cmp+branch.none.adrp+add
12068
12069 would have the result of turning on only adrp+add fusion. */
12070 if (!token_ops)
12071 found_flags = 0;
12072
12073 found_flags |= token_ops;
12074 specs = ++ntoken;
12075 }
12076
12077 /* The string ended with a separator; report the ill-formed option. */
12078 if (!(*specs))
12079 {
12080 error ("%s string ill-formed\n", option_name);
12081 return 0;
12082 }
12083
12084 /* We still have one more token to parse. */
12085 size_t token_length = strlen (specs);
12086 unsigned token_ops = aarch64_parse_one_option_token (specs,
12087 token_length,
12088 flags,
12089 option_name);
12090 if (!token_ops)
12091 found_flags = 0;
12092
12093 found_flags |= token_ops;
12094 return found_flags;
12095 }
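/* For illustration (a hypothetical -moverride value): fuse=adrp+add.cmp+branch
   is split at '.' into the tokens "adrp+add" and "cmp+branch", each mapped to
   its flag bit by aarch64_parse_one_option_token and ORed into the result; a
   "none" token along the way resets the accumulated flags, as in the
   adrp+add.cmp+branch.none.adrp+add example above.  */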
12096
12097 /* Support for overriding instruction fusion. */
12098
12099 static void
12100 aarch64_parse_fuse_string (const char *fuse_string,
12101 struct tune_params *tune)
12102 {
12103 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
12104 aarch64_fusible_pairs,
12105 tune->fusible_ops,
12106 "fuse=");
12107 }
12108
12109 /* Support for overriding other tuning flags. */
12110
12111 static void
12112 aarch64_parse_tune_string (const char *tune_string,
12113 struct tune_params *tune)
12114 {
12115 tune->extra_tuning_flags
12116 = aarch64_parse_boolean_options (tune_string,
12117 aarch64_tuning_flags,
12118 tune->extra_tuning_flags,
12119 "tune=");
12120 }
12121
12122 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
12123 Accept the valid SVE vector widths allowed by
12124 aarch64_sve_vector_bits_enum and use the value to override sve_width
12125 in TUNE. */
12126
12127 static void
12128 aarch64_parse_sve_width_string (const char *tune_string,
12129 struct tune_params *tune)
12130 {
12131 int width = -1;
12132
12133 int n = sscanf (tune_string, "%d", &width);
12134 if (n == EOF)
12135 {
12136 error ("invalid format for sve_width");
12137 return;
12138 }
12139 switch (width)
12140 {
12141 case SVE_128:
12142 case SVE_256:
12143 case SVE_512:
12144 case SVE_1024:
12145 case SVE_2048:
12146 break;
12147 default:
12148 error ("invalid sve_width value: %d", width);
12149 }
12150 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
12151 }
12152
12153 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
12154 we understand. If it is, extract the option string and hand it off to
12155 the appropriate function. */
12156
12157 void
12158 aarch64_parse_one_override_token (const char* token,
12159 size_t length,
12160 struct tune_params *tune)
12161 {
12162 const struct aarch64_tuning_override_function *fn
12163 = aarch64_tuning_override_functions;
12164
12165 const char *option_part = strchr (token, '=');
12166 if (!option_part)
12167 {
12168 error ("tuning string missing in option (%s)", token);
12169 return;
12170 }
12171
12172 /* Get the length of the option name. */
12173 length = option_part - token;
12174 /* Skip the '=' to get to the option string. */
12175 option_part++;
12176
12177 for (; fn->name != NULL; fn++)
12178 {
12179 if (!strncmp (fn->name, token, length))
12180 {
12181 fn->parse_override (option_part, tune);
12182 return;
12183 }
12184 }
12185
12186 error ("unknown tuning option (%s)",token);
12187 return;
12188 }
12189
12190 /* Validate and clamp the TLS size according to the code model in OPTS. */
12191
12192 static void
12193 initialize_aarch64_tls_size (struct gcc_options *opts)
12194 {
12195 if (aarch64_tls_size == 0)
12196 aarch64_tls_size = 24;
12197
12198 switch (opts->x_aarch64_cmodel_var)
12199 {
12200 case AARCH64_CMODEL_TINY:
12201 /* Both the default and maximum TLS size allowed under tiny are 1M, which
12202 needs two instructions to address, so we clamp the size to 24. */
12203 if (aarch64_tls_size > 24)
12204 aarch64_tls_size = 24;
12205 break;
12206 case AARCH64_CMODEL_SMALL:
12207 /* The maximum TLS size allowed under small is 4G. */
12208 if (aarch64_tls_size > 32)
12209 aarch64_tls_size = 32;
12210 break;
12211 case AARCH64_CMODEL_LARGE:
12212 /* The maximum TLS size allowed under large is 16E.
12213 FIXME: 16E should be 64bit, we only support 48bit offset now. */
12214 if (aarch64_tls_size > 48)
12215 aarch64_tls_size = 48;
12216 break;
12217 default:
12218 gcc_unreachable ();
12219 }
12220
12221 return;
12222 }
12223
12224 /* Parse STRING looking for options in the format:
12225 string :: option:string
12226 option :: name=substring
12227 name :: {a-z}
12228 substring :: defined by option. */
12229
12230 static void
12231 aarch64_parse_override_string (const char* input_string,
12232 struct tune_params* tune)
12233 {
12234 const char separator = ':';
12235 size_t string_length = strlen (input_string) + 1;
12236 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
12237 char *string = string_root;
12238 strncpy (string, input_string, string_length);
12239 string[string_length - 1] = '\0';
12240
12241 char* ntoken = string;
12242
12243 while ((ntoken = strchr (string, separator)))
12244 {
12245 size_t token_length = ntoken - string;
12246 /* Make this substring look like a string. */
12247 *ntoken = '\0';
12248 aarch64_parse_one_override_token (string, token_length, tune);
12249 string = ++ntoken;
12250 }
12251
12252 /* One last option to parse. */
12253 aarch64_parse_one_override_token (string, strlen (string), tune);
12254 free (string_root);
12255 }
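/* For illustration (a hypothetical command line):
   -moverride=fuse=adrp+add.cmp+branch:sve_width=256 is split at ':' into the
   tokens "fuse=adrp+add.cmp+branch" and "sve_width=256", each of which is
   passed to aarch64_parse_one_override_token and dispatched to the matching
   parse_override handler.  */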
12256
12257
12258 static void
12259 aarch64_override_options_after_change_1 (struct gcc_options *opts)
12260 {
12261 if (accepted_branch_protection_string)
12262 {
12263 opts->x_aarch64_branch_protection_string
12264 = xstrdup (accepted_branch_protection_string);
12265 }
12266
12267 /* PR 70044: We have to be careful about being called multiple times for the
12268 same function. This means all changes should be repeatable. */
12269
12270 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12271 Disable the frame pointer flag so the mid-end will not use a frame
12272 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12273 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12274 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12275 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12276 if (opts->x_flag_omit_frame_pointer == 0)
12277 opts->x_flag_omit_frame_pointer = 2;
12278
12279 /* If not optimizing for size, set the default
12280 alignment to what the target wants. */
12281 if (!opts->x_optimize_size)
12282 {
12283 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12284 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12285 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12286 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12287 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12288 opts->x_str_align_functions = aarch64_tune_params.function_align;
12289 }
12290
12291 /* We default to no pc-relative literal loads. */
12292
12293 aarch64_pcrelative_literal_loads = false;
12294
12295 /* If -mpc-relative-literal-loads is set on the command line, this
12296 implies that the user asked for PC relative literal loads. */
12297 if (opts->x_pcrelative_literal_loads == 1)
12298 aarch64_pcrelative_literal_loads = true;
12299
12300 /* In the tiny memory model it makes no sense to disallow PC relative
12301 literal pool loads. */
12302 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12303 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12304 aarch64_pcrelative_literal_loads = true;
12305
12306 /* When enabling the lower precision Newton series for the square root, also
12307 enable it for the reciprocal square root, since the latter is an
12308 intermediary step for the former. */
12309 if (flag_mlow_precision_sqrt)
12310 flag_mrecip_low_precision_sqrt = true;
12311 }
12312
12313 /* 'Unpack' the internal tuning structs and update the options
12314 in OPTS. The caller must have set up selected_tune and selected_arch
12315 as all the other target-specific codegen decisions are
12316 derived from them. */
12317
12318 void
12319 aarch64_override_options_internal (struct gcc_options *opts)
12320 {
12321 aarch64_tune_flags = selected_tune->flags;
12322 aarch64_tune = selected_tune->sched_core;
12323 /* Make a copy of the tuning parameters attached to the core, which
12324 we may later overwrite. */
12325 aarch64_tune_params = *(selected_tune->tune);
12326 aarch64_architecture_version = selected_arch->architecture_version;
12327
12328 if (opts->x_aarch64_override_tune_string)
12329 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12330 &aarch64_tune_params);
12331
12332 /* This target defaults to strict volatile bitfields. */
12333 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12334 opts->x_flag_strict_volatile_bitfields = 1;
12335
12336 if (aarch64_stack_protector_guard == SSP_GLOBAL
12337 && opts->x_aarch64_stack_protector_guard_offset_str)
12338 {
12339 error ("incompatible options %<-mstack-protector-guard=global%> and "
12340 "%<-mstack-protector-guard-offset=%s%>",
12341 aarch64_stack_protector_guard_offset_str);
12342 }
12343
12344 if (aarch64_stack_protector_guard == SSP_SYSREG
12345 && !(opts->x_aarch64_stack_protector_guard_offset_str
12346 && opts->x_aarch64_stack_protector_guard_reg_str))
12347 {
12348 error ("both %<-mstack-protector-guard-offset%> and "
12349 "%<-mstack-protector-guard-reg%> must be used "
12350 "with %<-mstack-protector-guard=sysreg%>");
12351 }
12352
12353 if (opts->x_aarch64_stack_protector_guard_reg_str)
12354 {
12355 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12356 error ("specify a system register with a small string length.");
12357 }
12358
12359 if (opts->x_aarch64_stack_protector_guard_offset_str)
12360 {
12361 char *end;
12362 const char *str = aarch64_stack_protector_guard_offset_str;
12363 errno = 0;
12364 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12365 if (!*str || *end || errno)
12366 error ("%qs is not a valid offset in %qs", str,
12367 "-mstack-protector-guard-offset=");
12368 aarch64_stack_protector_guard_offset = offs;
12369 }
12370
12371 initialize_aarch64_code_model (opts);
12372 initialize_aarch64_tls_size (opts);
12373
12374 int queue_depth = 0;
12375 switch (aarch64_tune_params.autoprefetcher_model)
12376 {
12377 case tune_params::AUTOPREFETCHER_OFF:
12378 queue_depth = -1;
12379 break;
12380 case tune_params::AUTOPREFETCHER_WEAK:
12381 queue_depth = 0;
12382 break;
12383 case tune_params::AUTOPREFETCHER_STRONG:
12384 queue_depth = max_insn_queue_index + 1;
12385 break;
12386 default:
12387 gcc_unreachable ();
12388 }
12389
12390 /* We don't mind passing in global_options_set here as we don't use
12391 the *options_set structs anyway. */
12392 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12393 queue_depth,
12394 opts->x_param_values,
12395 global_options_set.x_param_values);
12396
12397 /* Set up parameters to be used in prefetching algorithm. Do not
12398 override the defaults unless we are tuning for a core we have
12399 researched values for. */
12400 if (aarch64_tune_params.prefetch->num_slots > 0)
12401 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12402 aarch64_tune_params.prefetch->num_slots,
12403 opts->x_param_values,
12404 global_options_set.x_param_values);
12405 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12406 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12407 aarch64_tune_params.prefetch->l1_cache_size,
12408 opts->x_param_values,
12409 global_options_set.x_param_values);
12410 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12411 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12412 aarch64_tune_params.prefetch->l1_cache_line_size,
12413 opts->x_param_values,
12414 global_options_set.x_param_values);
12415 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12416 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12417 aarch64_tune_params.prefetch->l2_cache_size,
12418 opts->x_param_values,
12419 global_options_set.x_param_values);
12420 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12421 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12422 0,
12423 opts->x_param_values,
12424 global_options_set.x_param_values);
12425 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12426 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12427 aarch64_tune_params.prefetch->minimum_stride,
12428 opts->x_param_values,
12429 global_options_set.x_param_values);
12430
12431 /* Use the alternative scheduling-pressure algorithm by default. */
12432 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12433 opts->x_param_values,
12434 global_options_set.x_param_values);
12435
12436 /* If the user hasn't changed it via configure then set the default to 64 KB
12437 for the backend. */
12438 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12439 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12440 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12441 opts->x_param_values,
12442 global_options_set.x_param_values);
12443
12444 /* Validate the guard size. */
12445 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12446
12447 /* Enforce that interval is the same size as size so the mid-end does the
12448 right thing. */
12449 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12450 guard_size,
12451 opts->x_param_values,
12452 global_options_set.x_param_values);
12453
12454 /* The maybe_set calls won't update the value if the user has explicitly
12455 set one, which means we need to validate that the probing interval and
12456 guard size are equal. */
12457 int probe_interval
12458 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12459 if (guard_size != probe_interval)
12460 error ("stack clash guard size %<%d%> must be equal to probing interval "
12461 "%<%d%>", guard_size, probe_interval);
12462
12463 /* Enable software prefetching at the specified optimization level for
12464 CPUs that have prefetch tuning data, unless the user set
12465 -fprefetch-loop-arrays explicitly or we are optimizing for size. */
12466 if (opts->x_flag_prefetch_loop_arrays < 0
12467 && !opts->x_optimize_size
12468 && aarch64_tune_params.prefetch->default_opt_level >= 0
12469 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12470 opts->x_flag_prefetch_loop_arrays = 1;
12471
12472 if (opts->x_aarch64_arch_string == NULL)
12473 opts->x_aarch64_arch_string = selected_arch->name;
12474 if (opts->x_aarch64_cpu_string == NULL)
12475 opts->x_aarch64_cpu_string = selected_cpu->name;
12476 if (opts->x_aarch64_tune_string == NULL)
12477 opts->x_aarch64_tune_string = selected_tune->name;
12478
12479 aarch64_override_options_after_change_1 (opts);
12480 }
12481
12482 /* Print a hint with a suggestion for a core or architecture name that
12483 most closely resembles what the user passed in STR. ARCH is true if
12484 the user is asking for an architecture name. ARCH is false if the user
12485 is asking for a core name. */
12486
12487 static void
12488 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12489 {
12490 auto_vec<const char *> candidates;
12491 const struct processor *entry = arch ? all_architectures : all_cores;
12492 for (; entry->name != NULL; entry++)
12493 candidates.safe_push (entry->name);
12494
12495 #ifdef HAVE_LOCAL_CPU_DETECT
12496 /* Also add "native" as a possible value. */
12497 if (arch)
12498 candidates.safe_push ("native");
12499 #endif
12500
12501 char *s;
12502 const char *hint = candidates_list_and_hint (str, s, candidates);
12503 if (hint)
12504 inform (input_location, "valid arguments are: %s;"
12505 " did you mean %qs?", s, hint);
12506 else
12507 inform (input_location, "valid arguments are: %s", s);
12508
12509 XDELETEVEC (s);
12510 }
12511
12512 /* Print a hint with a suggestion for a core name that most closely resembles
12513 what the user passed in STR. */
12514
12515 inline static void
12516 aarch64_print_hint_for_core (const char *str)
12517 {
12518 aarch64_print_hint_for_core_or_arch (str, false);
12519 }
12520
12521 /* Print a hint with a suggestion for an architecture name that most closely
12522 resembles what the user passed in STR. */
12523
12524 inline static void
12525 aarch64_print_hint_for_arch (const char *str)
12526 {
12527 aarch64_print_hint_for_core_or_arch (str, true);
12528 }
12529
12530
12531 /* Print a hint with a suggestion for an extension name
12532 that most closely resembles what the user passed in STR. */
12533
12534 void
12535 aarch64_print_hint_for_extensions (const std::string &str)
12536 {
12537 auto_vec<const char *> candidates;
12538 aarch64_get_all_extension_candidates (&candidates);
12539 char *s;
12540 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12541 if (hint)
12542 inform (input_location, "valid arguments are: %s;"
12543 " did you mean %qs?", s, hint);
12544 else
12545 inform (input_location, "valid arguments are: %s", s);
12546
12547 XDELETEVEC (s);
12548 }
12549
12550 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12551 specified in STR and throw errors if appropriate. Put the results, if
12552 they are valid, in RES and ISA_FLAGS. Return whether the option is
12553 valid. */
12554
12555 static bool
12556 aarch64_validate_mcpu (const char *str, const struct processor **res,
12557 uint64_t *isa_flags)
12558 {
12559 std::string invalid_extension;
12560 enum aarch64_parse_opt_result parse_res
12561 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12562
12563 if (parse_res == AARCH64_PARSE_OK)
12564 return true;
12565
12566 switch (parse_res)
12567 {
12568 case AARCH64_PARSE_MISSING_ARG:
12569 error ("missing cpu name in %<-mcpu=%s%>", str);
12570 break;
12571 case AARCH64_PARSE_INVALID_ARG:
12572 error ("unknown value %qs for %<-mcpu%>", str);
12573 aarch64_print_hint_for_core (str);
12574 break;
12575 case AARCH64_PARSE_INVALID_FEATURE:
12576 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12577 invalid_extension.c_str (), str);
12578 aarch64_print_hint_for_extensions (invalid_extension);
12579 break;
12580 default:
12581 gcc_unreachable ();
12582 }
12583
12584 return false;
12585 }
12586
12587 /* Parses CONST_STR for the branch protection features specified in
12588 aarch64_branch_protect_types, and sets any global variables required.
12589 Returns the parsing result and assigns the last processed token from
12590 CONST_STR to LAST_STR so that it can be used for error reporting. */
12591
12592 static enum aarch64_parse_opt_result
12593 aarch64_parse_branch_protection (const char *const_str,
12594 char **last_str)
12595 {
12596 char *str_root = xstrdup (const_str);
12597 char* token_save = NULL;
12598 char *str = strtok_r (str_root, "+", &token_save);
12599 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12600 if (!str)
12601 res = AARCH64_PARSE_MISSING_ARG;
12602 else
12603 {
12604 char *next_str = strtok_r (NULL, "+", &token_save);
12605 /* Reset the branch protection features to their defaults. */
12606 aarch64_handle_no_branch_protection (NULL, NULL);
12607
12608 while (str && res == AARCH64_PARSE_OK)
12609 {
12610 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12611 bool found = false;
12612 /* Search for this type. */
12613 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12614 {
12615 if (strcmp (str, type->name) == 0)
12616 {
12617 found = true;
12618 res = type->handler (str, next_str);
12619 str = next_str;
12620 next_str = strtok_r (NULL, "+", &token_save);
12621 }
12622 else
12623 type++;
12624 }
12625 if (found && res == AARCH64_PARSE_OK)
12626 {
12627 bool found_subtype = true;
12628 /* Loop through each token until we find one that isn't a
12629 subtype. */
12630 while (found_subtype)
12631 {
12632 found_subtype = false;
12633 const aarch64_branch_protect_type *subtype = type->subtypes;
12634 /* Search for the subtype. */
12635 while (str && subtype && subtype->name && !found_subtype
12636 && res == AARCH64_PARSE_OK)
12637 {
12638 if (strcmp (str, subtype->name) == 0)
12639 {
12640 found_subtype = true;
12641 res = subtype->handler (str, next_str);
12642 str = next_str;
12643 next_str = strtok_r (NULL, "+", &token_save);
12644 }
12645 else
12646 subtype++;
12647 }
12648 }
12649 }
12650 else if (!found)
12651 res = AARCH64_PARSE_INVALID_ARG;
12652 }
12653 }
12654 /* Copy the last processed token into the argument to pass it back.
12655 Used by option and attribute validation to print the offending token. */
12656 if (last_str)
12657 {
12658 if (str) strcpy (*last_str, str);
12659 else *last_str = NULL;
12660 }
12661 if (res == AARCH64_PARSE_OK)
12662 {
12663 /* If needed, alloc the accepted string then copy in const_str.
12664 Used by override_option_after_change_1. */
12665 if (!accepted_branch_protection_string)
12666 accepted_branch_protection_string = (char *) xmalloc (
12667 BRANCH_PROTECT_STR_MAX
12668 + 1);
12669 strncpy (accepted_branch_protection_string, const_str,
12670 BRANCH_PROTECT_STR_MAX + 1);
12671 /* Forcibly null-terminate. */
12672 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12673 }
12674 return res;
12675 }
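/* Illustrative sketch (guarded out, not used by the parser above): shows how
a -mbranch-protection string such as "pac-ret+leaf+bti" is consumed.
"pac-ret" and "bti" are matched as top-level types and "leaf" as a subtype
of "pac-ret"; the feature names are taken from the documented
-mbranch-protection syntax and the buffer handling mirrors
aarch64_validate_mbranch_protection below. */
#if 0
static enum aarch64_parse_opt_result
aarch64_branch_protection_parse_example (void)
{
char buf[BRANCH_PROTECT_STR_MAX + 1];
char *last = buf;
/* Expected to return AARCH64_PARSE_OK and enable PAC-RET (including leaf
functions) together with BTI. */
return aarch64_parse_branch_protection ("pac-ret+leaf+bti", &last);
}
#endif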
12676
12677 static bool
12678 aarch64_validate_mbranch_protection (const char *const_str)
12679 {
12680 char *str = (char *) xmalloc (strlen (const_str) + 1);
12681 enum aarch64_parse_opt_result res =
12682 aarch64_parse_branch_protection (const_str, &str);
12683 if (res == AARCH64_PARSE_INVALID_ARG)
12684 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12685 else if (res == AARCH64_PARSE_MISSING_ARG)
12686 error ("missing argument for %<-mbranch-protection=%>");
12687 free (str);
12688 return res == AARCH64_PARSE_OK;
12689 }
12690
12691 /* Validate a command-line -march option. Parse the arch and extensions
12692 (if any) specified in STR and throw errors if appropriate. Put the
12693 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12694 option is valid. */
12695
12696 static bool
12697 aarch64_validate_march (const char *str, const struct processor **res,
12698 uint64_t *isa_flags)
12699 {
12700 std::string invalid_extension;
12701 enum aarch64_parse_opt_result parse_res
12702 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12703
12704 if (parse_res == AARCH64_PARSE_OK)
12705 return true;
12706
12707 switch (parse_res)
12708 {
12709 case AARCH64_PARSE_MISSING_ARG:
12710 error ("missing arch name in %<-march=%s%>", str);
12711 break;
12712 case AARCH64_PARSE_INVALID_ARG:
12713 error ("unknown value %qs for %<-march%>", str);
12714 aarch64_print_hint_for_arch (str);
12715 break;
12716 case AARCH64_PARSE_INVALID_FEATURE:
12717 error ("invalid feature modifier %qs in %<-march=%s%>",
12718 invalid_extension.c_str (), str);
12719 aarch64_print_hint_for_extensions (invalid_extension);
12720 break;
12721 default:
12722 gcc_unreachable ();
12723 }
12724
12725 return false;
12726 }
12727
12728 /* Validate a command-line -mtune option. Parse the cpu
12729 specified in STR and throw errors if appropriate. Put the
12730 result, if it is valid, in RES. Return whether the option is
12731 valid. */
12732
12733 static bool
12734 aarch64_validate_mtune (const char *str, const struct processor **res)
12735 {
12736 enum aarch64_parse_opt_result parse_res
12737 = aarch64_parse_tune (str, res);
12738
12739 if (parse_res == AARCH64_PARSE_OK)
12740 return true;
12741
12742 switch (parse_res)
12743 {
12744 case AARCH64_PARSE_MISSING_ARG:
12745 error ("missing cpu name in %<-mtune=%s%>", str);
12746 break;
12747 case AARCH64_PARSE_INVALID_ARG:
12748 error ("unknown value %qs for %<-mtune%>", str);
12749 aarch64_print_hint_for_core (str);
12750 break;
12751 default:
12752 gcc_unreachable ();
12753 }
12754 return false;
12755 }
12756
12757 /* Return the CPU corresponding to the enum CPU.
12758 If it doesn't specify a valid cpu, return the default. */
12759
12760 static const struct processor *
12761 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12762 {
12763 if (cpu != aarch64_none)
12764 return &all_cores[cpu];
12765
12766 /* The & 0x3f is to extract the bottom 6 bits that encode the
12767 default cpu as selected by the --with-cpu GCC configure option
12768 in config.gcc.
12769 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12770 flags mechanism should be reworked to make it more sane. */
12771 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12772 }
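/* Illustrative sketch (guarded out): TARGET_CPU_DEFAULT is assumed to pack
the configure-time core in its low 6 bits and the matching ISA flags in
the bits above them, which is why the function above masks with 0x3f and
aarch64_override_options shifts right by 6. A minimal split under that
assumption: */
#if 0
static void
aarch64_cpu_default_split_example (void)
{
uint64_t packed = TARGET_CPU_DEFAULT;
unsigned int core_index = packed & 0x3f; /* Index into all_cores. */
uint64_t default_isa_flags = packed >> 6; /* Initial aarch64_isa_flags. */
(void) core_index;
(void) default_isa_flags;
}
#endif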
12773
12774 /* Return the architecture corresponding to the enum ARCH.
12775 If it doesn't specify a valid architecture, return the default. */
12776
12777 static const struct processor *
12778 aarch64_get_arch (enum aarch64_arch arch)
12779 {
12780 if (arch != aarch64_no_arch)
12781 return &all_architectures[arch];
12782
12783 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12784
12785 return &all_architectures[cpu->arch];
12786 }
12787
12788 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12789
12790 static poly_uint16
12791 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12792 {
12793 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12794 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12795 deciding which .md file patterns to use and when deciding whether
12796 something is a legitimate address or constant. */
12797 if (value == SVE_SCALABLE || value == SVE_128)
12798 return poly_uint16 (2, 2);
12799 else
12800 return (int) value / 64;
12801 }
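/* Worked example (illustrative): -msve-vector-bits=256 yields 256 / 64 = 4
64-bit granules per vector and -msve-vector-bits=512 yields 8, while
SVE_SCALABLE and SVE_128 both map to the runtime-variable poly_uint16
(2, 2). The SVE_256/SVE_512 enumerator names are assumed from
aarch64-opts.h. */
#if 0
static void
aarch64_sve_vg_example (void)
{
poly_uint16 vg256 = aarch64_convert_sve_vector_bits (SVE_256);
poly_uint16 vg512 = aarch64_convert_sve_vector_bits (SVE_512);
gcc_assert (known_eq (vg256, 4) && known_eq (vg512, 8));
}
#endif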
12802
12803 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12804 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12805 tuning structs. In particular it must set selected_tune and
12806 aarch64_isa_flags that define the available ISA features and tuning
12807 decisions. It must also set selected_arch as this will be used to
12808 output the .arch asm tags for each function. */
12809
12810 static void
12811 aarch64_override_options (void)
12812 {
12813 uint64_t cpu_isa = 0;
12814 uint64_t arch_isa = 0;
12815 aarch64_isa_flags = 0;
12816
12817 bool valid_cpu = true;
12818 bool valid_tune = true;
12819 bool valid_arch = true;
12820
12821 selected_cpu = NULL;
12822 selected_arch = NULL;
12823 selected_tune = NULL;
12824
12825 if (aarch64_branch_protection_string)
12826 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12827
12828 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12829 If either of -march or -mtune is given, they override their
12830 respective component of -mcpu. */
12831 if (aarch64_cpu_string)
12832 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12833 &cpu_isa);
12834
12835 if (aarch64_arch_string)
12836 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12837 &arch_isa);
12838
12839 if (aarch64_tune_string)
12840 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12841
12842 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12843 SUBTARGET_OVERRIDE_OPTIONS;
12844 #endif
12845
12846 /* If the user did not specify a processor, choose the default
12847 one for them. This will be the CPU set during configuration using
12848 --with-cpu, otherwise it is "generic". */
12849 if (!selected_cpu)
12850 {
12851 if (selected_arch)
12852 {
12853 selected_cpu = &all_cores[selected_arch->ident];
12854 aarch64_isa_flags = arch_isa;
12855 explicit_arch = selected_arch->arch;
12856 }
12857 else
12858 {
12859 /* Get default configure-time CPU. */
12860 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12861 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12862 }
12863
12864 if (selected_tune)
12865 explicit_tune_core = selected_tune->ident;
12866 }
12867 /* If both -mcpu and -march are specified, check that they are architecturally
12868 compatible; warn if they're not and prefer the -march ISA flags. */
12869 else if (selected_arch)
12870 {
12871 if (selected_arch->arch != selected_cpu->arch)
12872 {
12873 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12874 all_architectures[selected_cpu->arch].name,
12875 selected_arch->name);
12876 }
12877 aarch64_isa_flags = arch_isa;
12878 explicit_arch = selected_arch->arch;
12879 explicit_tune_core = selected_tune ? selected_tune->ident
12880 : selected_cpu->ident;
12881 }
12882 else
12883 {
12884 /* -mcpu but no -march. */
12885 aarch64_isa_flags = cpu_isa;
12886 explicit_tune_core = selected_tune ? selected_tune->ident
12887 : selected_cpu->ident;
12888 gcc_assert (selected_cpu);
12889 selected_arch = &all_architectures[selected_cpu->arch];
12890 explicit_arch = selected_arch->arch;
12891 }
12892
12893 /* Set the arch as well, as we will need it when outputting
12894 the .arch directive in assembly. */
12895 if (!selected_arch)
12896 {
12897 gcc_assert (selected_cpu);
12898 selected_arch = &all_architectures[selected_cpu->arch];
12899 }
12900
12901 if (!selected_tune)
12902 selected_tune = selected_cpu;
12903
12904 if (aarch64_enable_bti == 2)
12905 {
12906 #ifdef TARGET_ENABLE_BTI
12907 aarch64_enable_bti = 1;
12908 #else
12909 aarch64_enable_bti = 0;
12910 #endif
12911 }
12912
12913 /* Return address signing is currently not supported for ILP32 targets. For
12914 LP64 targets use the configured option in the absence of a command-line
12915 option for -mbranch-protection. */
12916 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12917 {
12918 #ifdef TARGET_ENABLE_PAC_RET
12919 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12920 #else
12921 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12922 #endif
12923 }
12924
12925 #ifndef HAVE_AS_MABI_OPTION
12926 /* The compiler may have been configured with 2.23.* binutils, which does
12927 not have support for ILP32. */
12928 if (TARGET_ILP32)
12929 error ("assembler does not support %<-mabi=ilp32%>");
12930 #endif
12931
12932 /* Convert -msve-vector-bits to a VG count. */
12933 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12934
12935 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12936 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12937
12938 /* Make sure we properly set up the explicit options. */
12939 if ((aarch64_cpu_string && valid_cpu)
12940 || (aarch64_tune_string && valid_tune))
12941 gcc_assert (explicit_tune_core != aarch64_none);
12942
12943 if ((aarch64_cpu_string && valid_cpu)
12944 || (aarch64_arch_string && valid_arch))
12945 gcc_assert (explicit_arch != aarch64_no_arch);
12946
12947 /* The pass to insert speculation tracking runs before
12948 shrink-wrapping and the latter does not know how to update the
12949 tracking status. So disable it in this case. */
12950 if (aarch64_track_speculation)
12951 flag_shrink_wrap = 0;
12952
12953 aarch64_override_options_internal (&global_options);
12954
12955 /* Save these options as the default ones in case we push and pop them later
12956 while processing functions with potential target attributes. */
12957 target_option_default_node = target_option_current_node
12958 = build_target_option_node (&global_options);
12959 }
12960
12961 /* Implement targetm.override_options_after_change. */
12962
12963 static void
12964 aarch64_override_options_after_change (void)
12965 {
12966 aarch64_override_options_after_change_1 (&global_options);
12967 }
12968
12969 static struct machine_function *
12970 aarch64_init_machine_status (void)
12971 {
12972 struct machine_function *machine;
12973 machine = ggc_cleared_alloc<machine_function> ();
12974 return machine;
12975 }
12976
12977 void
12978 aarch64_init_expanders (void)
12979 {
12980 init_machine_status = aarch64_init_machine_status;
12981 }
12982
12983 /* Resolve aarch64_cmodel from the requested code model and PIC options. */
12984 static void
12985 initialize_aarch64_code_model (struct gcc_options *opts)
12986 {
12987 if (opts->x_flag_pic)
12988 {
12989 switch (opts->x_aarch64_cmodel_var)
12990 {
12991 case AARCH64_CMODEL_TINY:
12992 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12993 break;
12994 case AARCH64_CMODEL_SMALL:
12995 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12996 aarch64_cmodel = (flag_pic == 2
12997 ? AARCH64_CMODEL_SMALL_PIC
12998 : AARCH64_CMODEL_SMALL_SPIC);
12999 #else
13000 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
13001 #endif
13002 break;
13003 case AARCH64_CMODEL_LARGE:
13004 sorry ("code model %qs with %<-f%s%>", "large",
13005 opts->x_flag_pic > 1 ? "PIC" : "pic");
13006 break;
13007 default:
13008 gcc_unreachable ();
13009 }
13010 }
13011 else
13012 aarch64_cmodel = opts->x_aarch64_cmodel_var;
13013 }
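/* Worked example (illustrative): with the default small code model, -fPIC
(flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC, while -fpic selects
AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small GOT
relocations (HAVE_AS_SMALL_PIC_RELOCS); without -fpic/-fPIC the requested
code model is used unchanged. */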
13014
13015 /* Implement TARGET_OPTION_SAVE. */
13016
13017 static void
13018 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
13019 {
13020 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
13021 ptr->x_aarch64_branch_protection_string
13022 = opts->x_aarch64_branch_protection_string;
13023 }
13024
13025 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
13026 using the information saved in PTR. */
13027
13028 static void
13029 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
13030 {
13031 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
13032 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13033 opts->x_explicit_arch = ptr->x_explicit_arch;
13034 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
13035 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
13036 opts->x_aarch64_branch_protection_string
13037 = ptr->x_aarch64_branch_protection_string;
13038 if (opts->x_aarch64_branch_protection_string)
13039 {
13040 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
13041 NULL);
13042 }
13043
13044 aarch64_override_options_internal (opts);
13045 }
13046
13047 /* Implement TARGET_OPTION_PRINT. */
13048
13049 static void
13050 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
13051 {
13052 const struct processor *cpu
13053 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
13054 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
13055 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
13056 std::string extension
13057 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
13058
13059 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
13060 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
13061 arch->name, extension.c_str ());
13062 }
13063
13064 static GTY(()) tree aarch64_previous_fndecl;
13065
13066 void
13067 aarch64_reset_previous_fndecl (void)
13068 {
13069 aarch64_previous_fndecl = NULL;
13070 }
13071
13072 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
13073 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
13074 make sure optab availability predicates are recomputed when necessary. */
13075
13076 void
13077 aarch64_save_restore_target_globals (tree new_tree)
13078 {
13079 if (TREE_TARGET_GLOBALS (new_tree))
13080 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
13081 else if (new_tree == target_option_default_node)
13082 restore_target_globals (&default_target_globals);
13083 else
13084 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
13085 }
13086
13087 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
13088 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
13089 of the function, if it exists. This function may be called multiple
13090 times on a single function, so use aarch64_previous_fndecl to avoid
13091 setting up identical state. */
13092
13093 static void
13094 aarch64_set_current_function (tree fndecl)
13095 {
13096 if (!fndecl || fndecl == aarch64_previous_fndecl)
13097 return;
13098
13099 tree old_tree = (aarch64_previous_fndecl
13100 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
13101 : NULL_TREE);
13102
13103 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13104
13105 /* If the current function has no attributes but the previous one did,
13106 use the default node. */
13107 if (!new_tree && old_tree)
13108 new_tree = target_option_default_node;
13109
13110 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
13111 the default have been handled by aarch64_save_restore_target_globals from
13112 aarch64_pragma_target_parse. */
13113 if (old_tree == new_tree)
13114 return;
13115
13116 aarch64_previous_fndecl = fndecl;
13117
13118 /* First set the target options. */
13119 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
13120
13121 aarch64_save_restore_target_globals (new_tree);
13122 }
13123
13124 /* Enum describing the various ways we can handle attributes.
13125 In many cases we can reuse the generic option handling machinery. */
13126
13127 enum aarch64_attr_opt_type
13128 {
13129 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
13130 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
13131 aarch64_attr_enum, /* Attribute sets an enum variable. */
13132 aarch64_attr_custom /* Attribute requires a custom handling function. */
13133 };
13134
13135 /* All the information needed to handle a target attribute.
13136 NAME is the name of the attribute.
13137 ATTR_TYPE specifies the type of behavior of the attribute as described
13138 in the definition of enum aarch64_attr_opt_type.
13139 ALLOW_NEG is true if the attribute supports a "no-" form.
13140 HANDLER is the function that takes the attribute string as an argument.
13141 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
13142 OPT_NUM is the enum specifying the option that the attribute modifies.
13143 This is needed for attributes that mirror the behavior of a command-line
13144 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
13145 aarch64_attr_enum. */
13146
13147 struct aarch64_attribute_info
13148 {
13149 const char *name;
13150 enum aarch64_attr_opt_type attr_type;
13151 bool allow_neg;
13152 bool (*handler) (const char *);
13153 enum opt_code opt_num;
13154 };
13155
13156 /* Handle the ARCH_STR argument to the arch= target attribute. */
13157
13158 static bool
13159 aarch64_handle_attr_arch (const char *str)
13160 {
13161 const struct processor *tmp_arch = NULL;
13162 std::string invalid_extension;
13163 enum aarch64_parse_opt_result parse_res
13164 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
13165
13166 if (parse_res == AARCH64_PARSE_OK)
13167 {
13168 gcc_assert (tmp_arch);
13169 selected_arch = tmp_arch;
13170 explicit_arch = selected_arch->arch;
13171 return true;
13172 }
13173
13174 switch (parse_res)
13175 {
13176 case AARCH64_PARSE_MISSING_ARG:
13177 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
13178 break;
13179 case AARCH64_PARSE_INVALID_ARG:
13180 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
13181 aarch64_print_hint_for_arch (str);
13182 break;
13183 case AARCH64_PARSE_INVALID_FEATURE:
13184 error ("invalid feature modifier %s of value (\"%s\") in "
13185 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13186 aarch64_print_hint_for_extensions (invalid_extension);
13187 break;
13188 default:
13189 gcc_unreachable ();
13190 }
13191
13192 return false;
13193 }
13194
13195 /* Handle the argument CPU_STR to the cpu= target attribute. */
13196
13197 static bool
13198 aarch64_handle_attr_cpu (const char *str)
13199 {
13200 const struct processor *tmp_cpu = NULL;
13201 std::string invalid_extension;
13202 enum aarch64_parse_opt_result parse_res
13203 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
13204
13205 if (parse_res == AARCH64_PARSE_OK)
13206 {
13207 gcc_assert (tmp_cpu);
13208 selected_tune = tmp_cpu;
13209 explicit_tune_core = selected_tune->ident;
13210
13211 selected_arch = &all_architectures[tmp_cpu->arch];
13212 explicit_arch = selected_arch->arch;
13213 return true;
13214 }
13215
13216 switch (parse_res)
13217 {
13218 case AARCH64_PARSE_MISSING_ARG:
13219 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
13220 break;
13221 case AARCH64_PARSE_INVALID_ARG:
13222 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
13223 aarch64_print_hint_for_core (str);
13224 break;
13225 case AARCH64_PARSE_INVALID_FEATURE:
13226 error ("invalid feature modifier %s of value (\"%s\") in "
13227 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13228 aarch64_print_hint_for_extensions (invalid_extension);
13229 break;
13230 default:
13231 gcc_unreachable ();
13232 }
13233
13234 return false;
13235 }
13236
13237 /* Handle the argument STR to the branch-protection= attribute. */
13238
13239 static bool
13240 aarch64_handle_attr_branch_protection (const char* str)
13241 {
13242 char *err_str = (char *) xmalloc (strlen (str) + 1);
13243 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
13244 &err_str);
13245 bool success = false;
13246 switch (res)
13247 {
13248 case AARCH64_PARSE_MISSING_ARG:
13249 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
13250 " attribute");
13251 break;
13252 case AARCH64_PARSE_INVALID_ARG:
13253 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
13254 "=\")%> pragma or attribute", err_str);
13255 break;
13256 case AARCH64_PARSE_OK:
13257 success = true;
13258 /* Fall through. */
13259 case AARCH64_PARSE_INVALID_FEATURE:
13260 break;
13261 default:
13262 gcc_unreachable ();
13263 }
13264 free (err_str);
13265 return success;
13266 }
13267
13268 /* Handle the argument STR to the tune= target attribute. */
13269
13270 static bool
13271 aarch64_handle_attr_tune (const char *str)
13272 {
13273 const struct processor *tmp_tune = NULL;
13274 enum aarch64_parse_opt_result parse_res
13275 = aarch64_parse_tune (str, &tmp_tune);
13276
13277 if (parse_res == AARCH64_PARSE_OK)
13278 {
13279 gcc_assert (tmp_tune);
13280 selected_tune = tmp_tune;
13281 explicit_tune_core = selected_tune->ident;
13282 return true;
13283 }
13284
13285 switch (parse_res)
13286 {
13287 case AARCH64_PARSE_INVALID_ARG:
13288 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13289 aarch64_print_hint_for_core (str);
13290 break;
13291 default:
13292 gcc_unreachable ();
13293 }
13294
13295 return false;
13296 }
13297
13298 /* Parse an architecture extensions target attribute string specified in STR.
13299 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13300 if successful. Update aarch64_isa_flags to reflect the ISA features
13301 modified. */
13302
13303 static bool
13304 aarch64_handle_attr_isa_flags (char *str)
13305 {
13306 enum aarch64_parse_opt_result parse_res;
13307 uint64_t isa_flags = aarch64_isa_flags;
13308
13309 /* We allow "+nothing" in the beginning to clear out all architectural
13310 features if the user wants to handpick specific features. */
13311 if (strncmp ("+nothing", str, 8) == 0)
13312 {
13313 isa_flags = 0;
13314 str += 8;
13315 }
13316
13317 std::string invalid_extension;
13318 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13319
13320 if (parse_res == AARCH64_PARSE_OK)
13321 {
13322 aarch64_isa_flags = isa_flags;
13323 return true;
13324 }
13325
13326 switch (parse_res)
13327 {
13328 case AARCH64_PARSE_MISSING_ARG:
13329 error ("missing value in %<target()%> pragma or attribute");
13330 break;
13331
13332 case AARCH64_PARSE_INVALID_FEATURE:
13333 error ("invalid feature modifier %s of value (\"%s\") in "
13334 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13335 break;
13336
13337 default:
13338 gcc_unreachable ();
13339 }
13340
13341 return false;
13342 }
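/* Illustrative sketch (guarded out): an attribute string of "+nothing+simd"
first clears the ISA flags via the "+nothing" prefix and then re-enables
just SIMD (plus anything it implies) through aarch64_parse_extension.
The AARCH64_ISA_SIMD test and the literal string are assumptions made for
illustration only. */
#if 0
static void
aarch64_attr_isa_flags_example (void)
{
char buf[] = "+nothing+simd";
if (aarch64_handle_attr_isa_flags (buf))
gcc_assert (AARCH64_ISA_SIMD);
}
#endif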
13343
13344 /* The target attributes that we support. On top of these we also support just
13345 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13346 handled explicitly in aarch64_process_one_target_attr. */
13347
13348 static const struct aarch64_attribute_info aarch64_attributes[] =
13349 {
13350 { "general-regs-only", aarch64_attr_mask, false, NULL,
13351 OPT_mgeneral_regs_only },
13352 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13353 OPT_mfix_cortex_a53_835769 },
13354 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13355 OPT_mfix_cortex_a53_843419 },
13356 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13357 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13358 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13359 OPT_momit_leaf_frame_pointer },
13360 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13361 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13362 OPT_march_ },
13363 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13364 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13365 OPT_mtune_ },
13366 { "branch-protection", aarch64_attr_custom, false,
13367 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13368 { "sign-return-address", aarch64_attr_enum, false, NULL,
13369 OPT_msign_return_address_ },
13370 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13371 };
13372
13373 /* Parse ARG_STR which contains the definition of one target attribute.
13374 Show appropriate errors if any or return true if the attribute is valid. */
13375
13376 static bool
13377 aarch64_process_one_target_attr (char *arg_str)
13378 {
13379 bool invert = false;
13380
13381 size_t len = strlen (arg_str);
13382
13383 if (len == 0)
13384 {
13385 error ("malformed %<target()%> pragma or attribute");
13386 return false;
13387 }
13388
13389 char *str_to_check = (char *) alloca (len + 1);
13390 strcpy (str_to_check, arg_str);
13391
13392 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13393 It is easier to detect and handle it explicitly here rather than going
13394 through the machinery for the rest of the target attributes in this
13395 function. */
13396 if (*str_to_check == '+')
13397 return aarch64_handle_attr_isa_flags (str_to_check);
13398
13399 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13400 {
13401 invert = true;
13402 str_to_check += 3;
13403 }
13404 char *arg = strchr (str_to_check, '=');
13405
13406 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13407 and point ARG to "foo". */
13408 if (arg)
13409 {
13410 *arg = '\0';
13411 arg++;
13412 }
13413 const struct aarch64_attribute_info *p_attr;
13414 bool found = false;
13415 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13416 {
13417 /* If the names don't match up, or the user has given an argument
13418 to an attribute that doesn't accept one, or didn't give an argument
13419 to an attribute that expects one, fail to match. */
13420 if (strcmp (str_to_check, p_attr->name) != 0)
13421 continue;
13422
13423 found = true;
13424 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13425 || p_attr->attr_type == aarch64_attr_enum;
13426
13427 if (attr_need_arg_p ^ (arg != NULL))
13428 {
13429 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13430 return false;
13431 }
13432
13433 /* If the name matches but the attribute does not allow "no-" versions
13434 then we can't match. */
13435 if (invert && !p_attr->allow_neg)
13436 {
13437 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13438 return false;
13439 }
13440
13441 switch (p_attr->attr_type)
13442 {
13443 /* Has a custom handler registered.
13444 For example, cpu=, arch=, tune=. */
13445 case aarch64_attr_custom:
13446 gcc_assert (p_attr->handler);
13447 if (!p_attr->handler (arg))
13448 return false;
13449 break;
13450
13451 /* Either set or unset a boolean option. */
13452 case aarch64_attr_bool:
13453 {
13454 struct cl_decoded_option decoded;
13455
13456 generate_option (p_attr->opt_num, NULL, !invert,
13457 CL_TARGET, &decoded);
13458 aarch64_handle_option (&global_options, &global_options_set,
13459 &decoded, input_location);
13460 break;
13461 }
13462 /* Set or unset a bit in the target_flags. aarch64_handle_option
13463 should know what mask to apply given the option number. */
13464 case aarch64_attr_mask:
13465 {
13466 struct cl_decoded_option decoded;
13467 /* We only need to specify the option number.
13468 aarch64_handle_option will know which mask to apply. */
13469 decoded.opt_index = p_attr->opt_num;
13470 decoded.value = !invert;
13471 aarch64_handle_option (&global_options, &global_options_set,
13472 &decoded, input_location);
13473 break;
13474 }
13475 /* Use the option setting machinery to set an option to an enum. */
13476 case aarch64_attr_enum:
13477 {
13478 gcc_assert (arg);
13479 bool valid;
13480 int value;
13481 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13482 &value, CL_TARGET);
13483 if (valid)
13484 {
13485 set_option (&global_options, NULL, p_attr->opt_num, value,
13486 NULL, DK_UNSPECIFIED, input_location,
13487 global_dc);
13488 }
13489 else
13490 {
13491 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13492 }
13493 break;
13494 }
13495 default:
13496 gcc_unreachable ();
13497 }
13498 }
13499
13500 /* If we reached here we either have found an attribute and validated
13501 it or didn't match any. If we matched an attribute but its arguments
13502 were malformed we will have returned false already. */
13503 return found;
13504 }
13505
13506 /* Count how many times the character C appears in
13507 NULL-terminated string STR. */
13508
13509 static unsigned int
13510 num_occurrences_in_str (char c, char *str)
13511 {
13512 unsigned int res = 0;
13513 while (*str != '\0')
13514 {
13515 if (*str == c)
13516 res++;
13517
13518 str++;
13519 }
13520
13521 return res;
13522 }
13523
13524 /* Parse the tree in ARGS that contains the target attribute information
13525 and update the global target options space. */
13526
13527 bool
13528 aarch64_process_target_attr (tree args)
13529 {
13530 if (TREE_CODE (args) == TREE_LIST)
13531 {
13532 do
13533 {
13534 tree head = TREE_VALUE (args);
13535 if (head)
13536 {
13537 if (!aarch64_process_target_attr (head))
13538 return false;
13539 }
13540 args = TREE_CHAIN (args);
13541 } while (args);
13542
13543 return true;
13544 }
13545
13546 if (TREE_CODE (args) != STRING_CST)
13547 {
13548 error ("attribute %<target%> argument not a string");
13549 return false;
13550 }
13551
13552 size_t len = strlen (TREE_STRING_POINTER (args));
13553 char *str_to_check = (char *) alloca (len + 1);
13554 strcpy (str_to_check, TREE_STRING_POINTER (args));
13555
13556 if (len == 0)
13557 {
13558 error ("malformed %<target()%> pragma or attribute");
13559 return false;
13560 }
13561
13562 /* Used to catch empty strings between commas, i.e.
13563 attribute ((target ("attr1,,attr2"))). */
13564 unsigned int num_commas = num_occurrences_in_str (',', str_to_check);
13565
13566 /* Handle multiple target attributes separated by ','. */
13567 char *token = strtok_r (str_to_check, ",", &str_to_check);
13568
13569 unsigned int num_attrs = 0;
13570 while (token)
13571 {
13572 num_attrs++;
13573 if (!aarch64_process_one_target_attr (token))
13574 {
13575 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13576 return false;
13577 }
13578
13579 token = strtok_r (NULL, ",", &str_to_check);
13580 }
13581
13582 if (num_attrs != num_commas + 1)
13583 {
13584 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13585 return false;
13586 }
13587
13588 return true;
13589 }
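/* Illustrative usage (guarded out): a declaration like the sketch below
reaches aarch64_process_target_attr with the STRING_CST
"arch=armv8.2-a+crc,tune=cortex-a75", which is split on ',' into two
attributes; the first is then handled by aarch64_handle_attr_arch and the
second by aarch64_handle_attr_tune. The particular arch, extension and
core names are illustrative assumptions. */
#if 0
__attribute__ ((target ("arch=armv8.2-a+crc,tune=cortex-a75")))
static int
aarch64_target_attr_usage_example (int x)
{
return x + 1;
}
#endif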
13590
13591 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13592 process attribute ((target ("..."))). */
13593
13594 static bool
13595 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13596 {
13597 struct cl_target_option cur_target;
13598 bool ret;
13599 tree old_optimize;
13600 tree new_target, new_optimize;
13601 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13602
13603 /* If what we're processing is the current pragma string then the
13604 target option node is already stored in target_option_current_node
13605 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13606 having to re-parse the string. This is especially useful to keep
13607 arm_neon.h compile times down since that header contains a lot
13608 of intrinsics enclosed in pragmas. */
13609 if (!existing_target && args == current_target_pragma)
13610 {
13611 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13612 return true;
13613 }
13614 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13615
13616 old_optimize = build_optimization_node (&global_options);
13618
13619 /* If the function changed the optimization levels as well as setting
13620 target options, start with the optimizations specified. */
13621 if (func_optimize && func_optimize != old_optimize)
13622 cl_optimization_restore (&global_options,
13623 TREE_OPTIMIZATION (func_optimize));
13624
13625 /* Save the current target options to restore at the end. */
13626 cl_target_option_save (&cur_target, &global_options);
13627
13628 /* If fndecl already has some target attributes applied to it, unpack
13629 them so that we add this attribute on top of them, rather than
13630 overwriting them. */
13631 if (existing_target)
13632 {
13633 struct cl_target_option *existing_options
13634 = TREE_TARGET_OPTION (existing_target);
13635
13636 if (existing_options)
13637 cl_target_option_restore (&global_options, existing_options);
13638 }
13639 else
13640 cl_target_option_restore (&global_options,
13641 TREE_TARGET_OPTION (target_option_current_node));
13642
13643 ret = aarch64_process_target_attr (args);
13644
13645 /* Set up any additional state. */
13646 if (ret)
13647 {
13648 aarch64_override_options_internal (&global_options);
13649 /* Initialize SIMD builtins if we haven't already.
13650 Set current_target_pragma to NULL for the duration so that
13651 the builtin initialization code doesn't try to tag the functions
13652 being built with the attributes specified by any current pragma, thus
13653 going into an infinite recursion. */
13654 if (TARGET_SIMD)
13655 {
13656 tree saved_current_target_pragma = current_target_pragma;
13657 current_target_pragma = NULL;
13658 aarch64_init_simd_builtins ();
13659 current_target_pragma = saved_current_target_pragma;
13660 }
13661 new_target = build_target_option_node (&global_options);
13662 }
13663 else
13664 new_target = NULL;
13665
13666 new_optimize = build_optimization_node (&global_options);
13667
13668 if (fndecl && ret)
13669 {
13670 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13671
13672 if (old_optimize != new_optimize)
13673 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13674 }
13675
13676 cl_target_option_restore (&global_options, &cur_target);
13677
13678 if (old_optimize != new_optimize)
13679 cl_optimization_restore (&global_options,
13680 TREE_OPTIMIZATION (old_optimize));
13681 return ret;
13682 }
13683
13684 /* Helper for aarch64_can_inline_p. CALLER and CALLEE are tri-bool options
13685 (yes, no, don't care), DONT_CARE is the "don't care" value and DEF the
13686 default. Return true if inlining is allowed as far as this option goes. */
13687
13688 static bool
13689 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13690 int dont_care, int def)
13691 {
13692 /* If the callee doesn't care, always allow inlining. */
13693 if (callee == dont_care)
13694 return true;
13695
13696 /* If the caller doesn't care, always allow inlining. */
13697 if (caller == dont_care)
13698 return true;
13699
13700 /* Otherwise, allow inlining if either the callee and caller values
13701 agree, or if the callee is using the default value. */
13702 return (callee == caller || callee == def);
13703 }
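/* Worked example (illustrative): with DONT_CARE == 2, a callee that left the
option unspecified (callee == 2) never blocks inlining, whereas a callee
that explicitly chose a non-default value different from the caller's
does. A minimal sketch of both cases: */
#if 0
static void
aarch64_tribool_example (void)
{
/* Callee did not specify anything: inlining allowed. */
gcc_assert (aarch64_tribools_ok_for_inlining_p (1, 2, 2, 0));
/* Caller and callee disagree and the callee's choice (0) is not the
default (1): inlining rejected. */
gcc_assert (!aarch64_tribools_ok_for_inlining_p (1, 0, 2, 1));
}
#endif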
13704
13705 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13706 to inline CALLEE into CALLER based on target-specific info.
13707 Make sure that the caller and callee have compatible architectural
13708 features. Then go through the other possible target attributes
13709 and see if they can block inlining. Try not to reject always_inline
13710 callees unless they are incompatible architecturally. */
13711
13712 static bool
13713 aarch64_can_inline_p (tree caller, tree callee)
13714 {
13715 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13716 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13717
13718 struct cl_target_option *caller_opts
13719 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13720 : target_option_default_node);
13721
13722 struct cl_target_option *callee_opts
13723 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13724 : target_option_default_node);
13725
13726 /* Callee's ISA flags should be a subset of the caller's. */
13727 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13728 != callee_opts->x_aarch64_isa_flags)
13729 return false;
13730
13731 /* Allow non-strict aligned functions inlining into strict
13732 aligned ones. */
13733 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13734 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13735 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13736 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13737 return false;
13738
13739 bool always_inline = lookup_attribute ("always_inline",
13740 DECL_ATTRIBUTES (callee));
13741
13742 /* If the architectural features match up and the callee is always_inline
13743 then the other attributes don't matter. */
13744 if (always_inline)
13745 return true;
13746
13747 if (caller_opts->x_aarch64_cmodel_var
13748 != callee_opts->x_aarch64_cmodel_var)
13749 return false;
13750
13751 if (caller_opts->x_aarch64_tls_dialect
13752 != callee_opts->x_aarch64_tls_dialect)
13753 return false;
13754
13755 /* Honour explicit requests to work around errata. */
13756 if (!aarch64_tribools_ok_for_inlining_p (
13757 caller_opts->x_aarch64_fix_a53_err835769,
13758 callee_opts->x_aarch64_fix_a53_err835769,
13759 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13760 return false;
13761
13762 if (!aarch64_tribools_ok_for_inlining_p (
13763 caller_opts->x_aarch64_fix_a53_err843419,
13764 callee_opts->x_aarch64_fix_a53_err843419,
13765 2, TARGET_FIX_ERR_A53_843419))
13766 return false;
13767
13768 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13769 caller and callee and they don't match up, reject inlining. */
13770 if (!aarch64_tribools_ok_for_inlining_p (
13771 caller_opts->x_flag_omit_leaf_frame_pointer,
13772 callee_opts->x_flag_omit_leaf_frame_pointer,
13773 2, 1))
13774 return false;
13775
13776 /* If the callee has specific tuning overrides, respect them. */
13777 if (callee_opts->x_aarch64_override_tune_string != NULL
13778 && caller_opts->x_aarch64_override_tune_string == NULL)
13779 return false;
13780
13781 /* If the user specified tuning override strings for the
13782 caller and callee and they don't match up, reject inlining.
13783 We just do a string compare here; we don't analyze the meaning
13784 of the string, as it would be too costly for little gain. */
13785 if (callee_opts->x_aarch64_override_tune_string
13786 && caller_opts->x_aarch64_override_tune_string
13787 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13788 caller_opts->x_aarch64_override_tune_string) != 0))
13789 return false;
13790
13791 return true;
13792 }
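/* Worked example (illustrative): the ISA subset test in aarch64_can_inline_p
accepts a +simd callee into a +simd+crypto caller because
(caller & callee) == callee, but rejects the reverse combination since the
caller would lack the crypto bit. The AARCH64_FL_* masks are assumed from
aarch64.h. */
#if 0
static void
aarch64_isa_subset_example (void)
{
uint64_t caller_flags = AARCH64_FL_SIMD | AARCH64_FL_CRYPTO;
uint64_t callee_flags = AARCH64_FL_SIMD;
/* +simd callee into +simd+crypto caller: subset condition holds. */
gcc_assert ((caller_flags & callee_flags) == callee_flags);
/* Reverse direction: the subset condition fails, so inlining is refused. */
gcc_assert ((callee_flags & caller_flags) != caller_flags);
}
#endif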
13793
13794 /* Return true if SYMBOL_REF X binds locally. */
13795
13796 static bool
13797 aarch64_symbol_binds_local_p (const_rtx x)
13798 {
13799 return (SYMBOL_REF_DECL (x)
13800 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13801 : SYMBOL_REF_LOCAL_P (x));
13802 }
13803
13804 /* Return true if SYMBOL_REF X is thread-local. */
13805 static bool
13806 aarch64_tls_symbol_p (rtx x)
13807 {
13808 if (! TARGET_HAVE_TLS)
13809 return false;
13810
13811 if (GET_CODE (x) != SYMBOL_REF)
13812 return false;
13813
13814 return SYMBOL_REF_TLS_MODEL (x) != 0;
13815 }
13816
13817 /* Classify a TLS symbol into one of the TLS kinds. */
13818 enum aarch64_symbol_type
13819 aarch64_classify_tls_symbol (rtx x)
13820 {
13821 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13822
13823 switch (tls_kind)
13824 {
13825 case TLS_MODEL_GLOBAL_DYNAMIC:
13826 case TLS_MODEL_LOCAL_DYNAMIC:
13827 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13828
13829 case TLS_MODEL_INITIAL_EXEC:
13830 switch (aarch64_cmodel)
13831 {
13832 case AARCH64_CMODEL_TINY:
13833 case AARCH64_CMODEL_TINY_PIC:
13834 return SYMBOL_TINY_TLSIE;
13835 default:
13836 return SYMBOL_SMALL_TLSIE;
13837 }
13838
13839 case TLS_MODEL_LOCAL_EXEC:
13840 if (aarch64_tls_size == 12)
13841 return SYMBOL_TLSLE12;
13842 else if (aarch64_tls_size == 24)
13843 return SYMBOL_TLSLE24;
13844 else if (aarch64_tls_size == 32)
13845 return SYMBOL_TLSLE32;
13846 else if (aarch64_tls_size == 48)
13847 return SYMBOL_TLSLE48;
13848 else
13849 gcc_unreachable ();
13850
13851 case TLS_MODEL_EMULATED:
13852 case TLS_MODEL_NONE:
13853 return SYMBOL_FORCE_TO_MEM;
13854
13855 default:
13856 gcc_unreachable ();
13857 }
13858 }
13859
13860 /* Return the correct method for accessing X + OFFSET, where X is either
13861 a SYMBOL_REF or LABEL_REF. */
13862
13863 enum aarch64_symbol_type
13864 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13865 {
13866 if (GET_CODE (x) == LABEL_REF)
13867 {
13868 switch (aarch64_cmodel)
13869 {
13870 case AARCH64_CMODEL_LARGE:
13871 return SYMBOL_FORCE_TO_MEM;
13872
13873 case AARCH64_CMODEL_TINY_PIC:
13874 case AARCH64_CMODEL_TINY:
13875 return SYMBOL_TINY_ABSOLUTE;
13876
13877 case AARCH64_CMODEL_SMALL_SPIC:
13878 case AARCH64_CMODEL_SMALL_PIC:
13879 case AARCH64_CMODEL_SMALL:
13880 return SYMBOL_SMALL_ABSOLUTE;
13881
13882 default:
13883 gcc_unreachable ();
13884 }
13885 }
13886
13887 if (GET_CODE (x) == SYMBOL_REF)
13888 {
13889 if (aarch64_tls_symbol_p (x))
13890 return aarch64_classify_tls_symbol (x);
13891
13892 switch (aarch64_cmodel)
13893 {
13894 case AARCH64_CMODEL_TINY:
13895 /* When we retrieve a symbol + offset address, we have to make sure
13896 the offset does not cause overflow of the final address. But
13897 we have no way of knowing the address of the symbol at compile
13898 time, so we can't accurately say whether the distance between the
13899 PC and symbol + offset is outside the addressable range of +/-1M
13900 in the TINY code model. So we rely on images not being greater
13901 than 1M and cap the offset at 1M; anything beyond that will have
13902 to be loaded using an alternative mechanism. Furthermore, if the
13903 symbol is a weak reference to something that isn't known to
13904 resolve to a symbol in this module, then force it to memory. */
13905 if ((SYMBOL_REF_WEAK (x)
13906 && !aarch64_symbol_binds_local_p (x))
13907 || !IN_RANGE (offset, -1048575, 1048575))
13908 return SYMBOL_FORCE_TO_MEM;
13909 return SYMBOL_TINY_ABSOLUTE;
13910
13911 case AARCH64_CMODEL_SMALL:
13912 /* Same reasoning as the tiny code model, but the offset cap here is
13913 4G. */
13914 if ((SYMBOL_REF_WEAK (x)
13915 && !aarch64_symbol_binds_local_p (x))
13916 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13917 HOST_WIDE_INT_C (4294967264)))
13918 return SYMBOL_FORCE_TO_MEM;
13919 return SYMBOL_SMALL_ABSOLUTE;
13920
13921 case AARCH64_CMODEL_TINY_PIC:
13922 if (!aarch64_symbol_binds_local_p (x))
13923 return SYMBOL_TINY_GOT;
13924 return SYMBOL_TINY_ABSOLUTE;
13925
13926 case AARCH64_CMODEL_SMALL_SPIC:
13927 case AARCH64_CMODEL_SMALL_PIC:
13928 if (!aarch64_symbol_binds_local_p (x))
13929 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13930 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13931 return SYMBOL_SMALL_ABSOLUTE;
13932
13933 case AARCH64_CMODEL_LARGE:
13934 /* This is alright even in PIC code as the constant
13935 pool reference is always PC relative and within
13936 the same translation unit. */
13937 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13938 return SYMBOL_SMALL_ABSOLUTE;
13939 else
13940 return SYMBOL_FORCE_TO_MEM;
13941
13942 default:
13943 gcc_unreachable ();
13944 }
13945 }
13946
13947 /* By default push everything into the constant pool. */
13948 return SYMBOL_FORCE_TO_MEM;
13949 }
13950
13951 bool
13952 aarch64_constant_address_p (rtx x)
13953 {
13954 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13955 }
13956
13957 bool
13958 aarch64_legitimate_pic_operand_p (rtx x)
13959 {
13960 if (GET_CODE (x) == SYMBOL_REF
13961 || (GET_CODE (x) == CONST
13962 && GET_CODE (XEXP (x, 0)) == PLUS
13963 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13964 return false;
13965
13966 return true;
13967 }
13968
13969 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13970 that should be rematerialized rather than spilled. */
13971
13972 static bool
13973 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13974 {
13975 /* Support CSE and rematerialization of common constants. */
13976 if (CONST_INT_P (x)
13977 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13978 || GET_CODE (x) == CONST_VECTOR)
13979 return true;
13980
13981 /* Do not allow vector struct mode constants for Advanced SIMD.
13982 We could support 0 and -1 easily, but they need support in
13983 aarch64-simd.md. */
13984 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13985 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13986 return false;
13987
13988 /* Only accept variable-length vector constants if they can be
13989 handled directly.
13990
13991 ??? It would be possible to handle rematerialization of other
13992 constants via secondary reloads. */
13993 if (vec_flags & VEC_ANY_SVE)
13994 return aarch64_simd_valid_immediate (x, NULL);
13995
13996 if (GET_CODE (x) == HIGH)
13997 x = XEXP (x, 0);
13998
13999 /* Accept polynomial constants that can be calculated by using the
14000 destination of a move as the sole temporary. Constants that
14001 require a second temporary cannot be rematerialized (they can't be
14002 forced to memory and also aren't legitimate constants). */
14003 poly_int64 offset;
14004 if (poly_int_rtx_p (x, &offset))
14005 return aarch64_offset_temporaries (false, offset) <= 1;
14006
14007 /* If an offset is being added to something else, we need to allow the
14008 base to be moved into the destination register, meaning that there
14009 are no free temporaries for the offset. */
14010 x = strip_offset (x, &offset);
14011 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
14012 return false;
14013
14014 /* Do not allow const (plus (anchor_symbol, const_int)). */
14015 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
14016 return false;
14017
14018 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
14019 so spilling them is better than rematerialization. */
14020 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
14021 return true;
14022
14023 /* Label references are always constant. */
14024 if (GET_CODE (x) == LABEL_REF)
14025 return true;
14026
14027 return false;
14028 }
14029
14030 rtx
14031 aarch64_load_tp (rtx target)
14032 {
14033 if (!target
14034 || GET_MODE (target) != Pmode
14035 || !register_operand (target, Pmode))
14036 target = gen_reg_rtx (Pmode);
14037
14038 /* Can return in any reg. */
14039 emit_insn (gen_aarch64_load_tp_hard (target));
14040 return target;
14041 }
14042
14043 /* On AAPCS systems, this is the "struct __va_list". */
14044 static GTY(()) tree va_list_type;
14045
14046 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
14047 Return the type to use as __builtin_va_list.
14048
14049 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
14050
14051 struct __va_list
14052 {
14053 void *__stack;
14054 void *__gr_top;
14055 void *__vr_top;
14056 int __gr_offs;
14057 int __vr_offs;
14058 }; */
14059
14060 static tree
14061 aarch64_build_builtin_va_list (void)
14062 {
14063 tree va_list_name;
14064 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14065
14066 /* Create the type. */
14067 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
14068 /* Give it the required name. */
14069 va_list_name = build_decl (BUILTINS_LOCATION,
14070 TYPE_DECL,
14071 get_identifier ("__va_list"),
14072 va_list_type);
14073 DECL_ARTIFICIAL (va_list_name) = 1;
14074 TYPE_NAME (va_list_type) = va_list_name;
14075 TYPE_STUB_DECL (va_list_type) = va_list_name;
14076
14077 /* Create the fields. */
14078 f_stack = build_decl (BUILTINS_LOCATION,
14079 FIELD_DECL, get_identifier ("__stack"),
14080 ptr_type_node);
14081 f_grtop = build_decl (BUILTINS_LOCATION,
14082 FIELD_DECL, get_identifier ("__gr_top"),
14083 ptr_type_node);
14084 f_vrtop = build_decl (BUILTINS_LOCATION,
14085 FIELD_DECL, get_identifier ("__vr_top"),
14086 ptr_type_node);
14087 f_groff = build_decl (BUILTINS_LOCATION,
14088 FIELD_DECL, get_identifier ("__gr_offs"),
14089 integer_type_node);
14090 f_vroff = build_decl (BUILTINS_LOCATION,
14091 FIELD_DECL, get_identifier ("__vr_offs"),
14092 integer_type_node);
14093
14094 /* Tell the tree-stdarg pass about our internal offset fields.
14095 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
14096 purposes, to identify whether the code is updating the va_list internal
14097 offset fields in an irregular way. */
14098 va_list_gpr_counter_field = f_groff;
14099 va_list_fpr_counter_field = f_vroff;
14100
14101 DECL_ARTIFICIAL (f_stack) = 1;
14102 DECL_ARTIFICIAL (f_grtop) = 1;
14103 DECL_ARTIFICIAL (f_vrtop) = 1;
14104 DECL_ARTIFICIAL (f_groff) = 1;
14105 DECL_ARTIFICIAL (f_vroff) = 1;
14106
14107 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
14108 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
14109 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
14110 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
14111 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
14112
14113 TYPE_FIELDS (va_list_type) = f_stack;
14114 DECL_CHAIN (f_stack) = f_grtop;
14115 DECL_CHAIN (f_grtop) = f_vrtop;
14116 DECL_CHAIN (f_vrtop) = f_groff;
14117 DECL_CHAIN (f_groff) = f_vroff;
14118
14119 /* Compute its layout. */
14120 layout_type (va_list_type);
14121
14122 return va_list_type;
14123 }
14124
14125 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
14126 static void
14127 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
14128 {
14129 const CUMULATIVE_ARGS *cum;
14130 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14131 tree stack, grtop, vrtop, groff, vroff;
14132 tree t;
14133 int gr_save_area_size = cfun->va_list_gpr_size;
14134 int vr_save_area_size = cfun->va_list_fpr_size;
14135 int vr_offset;
14136
14137 cum = &crtl->args.info;
14138 if (cfun->va_list_gpr_size)
14139 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
14140 cfun->va_list_gpr_size);
14141 if (cfun->va_list_fpr_size)
14142 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
14143 * UNITS_PER_VREG, cfun->va_list_fpr_size);
14144
14145 if (!TARGET_FLOAT)
14146 {
14147 gcc_assert (cum->aapcs_nvrn == 0);
14148 vr_save_area_size = 0;
14149 }
14150
14151 f_stack = TYPE_FIELDS (va_list_type_node);
14152 f_grtop = DECL_CHAIN (f_stack);
14153 f_vrtop = DECL_CHAIN (f_grtop);
14154 f_groff = DECL_CHAIN (f_vrtop);
14155 f_vroff = DECL_CHAIN (f_groff);
14156
14157 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
14158 NULL_TREE);
14159 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
14160 NULL_TREE);
14161 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
14162 NULL_TREE);
14163 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
14164 NULL_TREE);
14165 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
14166 NULL_TREE);
14167
14168 /* Emit code to initialize STACK, which points to the next varargs stack
14169 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
14170 by named arguments. STACK is 8-byte aligned. */
14171 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
14172 if (cum->aapcs_stack_size > 0)
14173 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
14174 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
14175 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14176
14177 /* Emit code to initialize GRTOP, the top of the GR save area.
14178 virtual_incoming_args_rtx should have been 16 byte aligned. */
14179 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
14180 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
14181 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14182
14183 /* Emit code to initialize VRTOP, the top of the VR save area.
14184 This address is gr_save_area_bytes below GRTOP, rounded
14185 down to the next 16-byte boundary. */
14186 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
14187 vr_offset = ROUND_UP (gr_save_area_size,
14188 STACK_BOUNDARY / BITS_PER_UNIT);
14189
14190 if (vr_offset)
14191 t = fold_build_pointer_plus_hwi (t, -vr_offset);
14192 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
14193 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14194
14195 /* Emit code to initialize GROFF, the offset from GRTOP of the
14196 next GPR argument. */
14197 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
14198 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
14199 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14200
14201 /* Likewise emit code to initialize VROFF, the offset from VRTOP
14202 of the next VR argument. */
14203 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
14204 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
14205 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
14206 }
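/* A worked illustration with hypothetical values (not taken from the
   sources): for a variadic function with two named integer arguments and
   one named double, and no named arguments passed on the stack,
   cum->aapcs_ncrn is 2 and cum->aapcs_nvrn is 1, so assuming full
   va_list_gpr_size/va_list_fpr_size the code above produces:

     gr_save_area_size = (8 - 2) * 8  = 48    (x2..x7 saved below __gr_top)
     vr_save_area_size = (8 - 1) * 16 = 112   (q1..q7 saved below __vr_top)

     __stack   = incoming argument pointer
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - 48
     __gr_offs = -48
     __vr_offs = -112  */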
14207
14208 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
14209
14210 static tree
14211 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
14212 gimple_seq *post_p ATTRIBUTE_UNUSED)
14213 {
14214 tree addr;
14215 bool indirect_p;
14216 bool is_ha; /* is HFA or HVA. */
14217 bool dw_align; /* double-word align. */
14218 machine_mode ag_mode = VOIDmode;
14219 int nregs;
14220 machine_mode mode;
14221
14222 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
14223 tree stack, f_top, f_off, off, arg, roundup, on_stack;
14224 HOST_WIDE_INT size, rsize, adjust, align;
14225 tree t, u, cond1, cond2;
14226
14227 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
14228 if (indirect_p)
14229 type = build_pointer_type (type);
14230
14231 mode = TYPE_MODE (type);
14232
14233 f_stack = TYPE_FIELDS (va_list_type_node);
14234 f_grtop = DECL_CHAIN (f_stack);
14235 f_vrtop = DECL_CHAIN (f_grtop);
14236 f_groff = DECL_CHAIN (f_vrtop);
14237 f_vroff = DECL_CHAIN (f_groff);
14238
14239 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
14240 f_stack, NULL_TREE);
14241 size = int_size_in_bytes (type);
14242
14243 bool abi_break;
14244 align
14245 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
14246
14247 dw_align = false;
14248 adjust = 0;
14249 if (aarch64_vfp_is_call_or_return_candidate (mode,
14250 type,
14251 &ag_mode,
14252 &nregs,
14253 &is_ha))
14254 {
14255 /* No frontends can create types with variable-sized modes, so we
14256 shouldn't be asked to pass or return them. */
14257 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
14258
14259 /* TYPE passed in fp/simd registers. */
14260 if (!TARGET_FLOAT)
14261 aarch64_err_no_fpadvsimd (mode);
14262
14263 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14264 unshare_expr (valist), f_vrtop, NULL_TREE);
14265 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14266 unshare_expr (valist), f_vroff, NULL_TREE);
14267
14268 rsize = nregs * UNITS_PER_VREG;
14269
14270 if (is_ha)
14271 {
14272 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14273 adjust = UNITS_PER_VREG - ag_size;
14274 }
14275 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14276 && size < UNITS_PER_VREG)
14277 {
14278 adjust = UNITS_PER_VREG - size;
14279 }
14280 }
14281 else
14282 {
14283 /* TYPE passed in general registers. */
14284 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14285 unshare_expr (valist), f_grtop, NULL_TREE);
14286 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14287 unshare_expr (valist), f_groff, NULL_TREE);
14288 rsize = ROUND_UP (size, UNITS_PER_WORD);
14289 nregs = rsize / UNITS_PER_WORD;
14290
14291 if (align > 8)
14292 {
14293 if (abi_break && warn_psabi)
14294 inform (input_location, "parameter passing for argument of type "
14295 "%qT changed in GCC 9.1", type);
14296 dw_align = true;
14297 }
14298
14299 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14300 && size < UNITS_PER_WORD)
14301 {
14302 adjust = UNITS_PER_WORD - size;
14303 }
14304 }
14305
14306 /* Get a local temporary for the field value. */
14307 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14308
14309 /* Emit code to branch if off >= 0. */
14310 t = build2 (GE_EXPR, boolean_type_node, off,
14311 build_int_cst (TREE_TYPE (off), 0));
14312 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14313
14314 if (dw_align)
14315 {
14316 /* Emit: offs = (offs + 15) & -16. */
14317 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14318 build_int_cst (TREE_TYPE (off), 15));
14319 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14320 build_int_cst (TREE_TYPE (off), -16));
14321 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14322 }
14323 else
14324 roundup = NULL;
14325
14326 /* Update ap.__[g|v]r_offs */
14327 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14328 build_int_cst (TREE_TYPE (off), rsize));
14329 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14330
14331 /* String up. */
14332 if (roundup)
14333 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14334
14335 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14336 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14337 build_int_cst (TREE_TYPE (f_off), 0));
14338 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14339
14340 /* String up: make sure the assignment happens before the use. */
14341 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14342 COND_EXPR_ELSE (cond1) = t;
14343
14344 /* Prepare the trees handling the argument that is passed on the stack;
14345 the top-level node will be stored in ON_STACK. */
14346 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14347 if (align > 8)
14348 {
14349 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14350 t = fold_build_pointer_plus_hwi (arg, 15);
14351 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14352 build_int_cst (TREE_TYPE (t), -16));
14353 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14354 }
14355 else
14356 roundup = NULL;
14357 /* Advance ap.__stack */
14358 t = fold_build_pointer_plus_hwi (arg, size + 7);
14359 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14360 build_int_cst (TREE_TYPE (t), -8));
14361 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14362 /* String up roundup and advance. */
14363 if (roundup)
14364 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14365 /* String up with arg */
14366 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14367 /* Big-endianness related address adjustment. */
14368 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14369 && size < UNITS_PER_WORD)
14370 {
14371 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14372 size_int (UNITS_PER_WORD - size));
14373 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14374 }
14375
14376 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14377 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14378
14379 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14380 t = off;
14381 if (adjust)
14382 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14383 build_int_cst (TREE_TYPE (off), adjust));
14384
14385 t = fold_convert (sizetype, t);
14386 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14387
14388 if (is_ha)
14389 {
14390 /* type ha; // treat as "struct {ftype field[n];}"
14391 ... [computing offs]
14392 for (i = 0; i < nregs; ++i, offs += 16)
14393 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14394 return ha; */
14395 int i;
14396 tree tmp_ha, field_t, field_ptr_t;
14397
14398 /* Declare a local variable. */
14399 tmp_ha = create_tmp_var_raw (type, "ha");
14400 gimple_add_tmp_var (tmp_ha);
14401
14402 /* Establish the base type. */
14403 switch (ag_mode)
14404 {
14405 case E_SFmode:
14406 field_t = float_type_node;
14407 field_ptr_t = float_ptr_type_node;
14408 break;
14409 case E_DFmode:
14410 field_t = double_type_node;
14411 field_ptr_t = double_ptr_type_node;
14412 break;
14413 case E_TFmode:
14414 field_t = long_double_type_node;
14415 field_ptr_t = long_double_ptr_type_node;
14416 break;
14417 case E_HFmode:
14418 field_t = aarch64_fp16_type_node;
14419 field_ptr_t = aarch64_fp16_ptr_type_node;
14420 break;
14421 case E_V2SImode:
14422 case E_V4SImode:
14423 {
14424 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14425 field_t = build_vector_type_for_mode (innertype, ag_mode);
14426 field_ptr_t = build_pointer_type (field_t);
14427 }
14428 break;
14429 default:
14430 gcc_assert (0);
14431 }
14432
14433 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14434 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14435 addr = t;
14436 t = fold_convert (field_ptr_t, addr);
14437 t = build2 (MODIFY_EXPR, field_t,
14438 build1 (INDIRECT_REF, field_t, tmp_ha),
14439 build1 (INDIRECT_REF, field_t, t));
14440
14441 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14442 for (i = 1; i < nregs; ++i)
14443 {
14444 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14445 u = fold_convert (field_ptr_t, addr);
14446 u = build2 (MODIFY_EXPR, field_t,
14447 build2 (MEM_REF, field_t, tmp_ha,
14448 build_int_cst (field_ptr_t,
14449 (i *
14450 int_size_in_bytes (field_t)))),
14451 build1 (INDIRECT_REF, field_t, u));
14452 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14453 }
14454
14455 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14456 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14457 }
14458
14459 COND_EXPR_ELSE (cond2) = t;
14460 addr = fold_convert (build_pointer_type (type), cond1);
14461 addr = build_va_arg_indirect_ref (addr);
14462
14463 if (indirect_p)
14464 addr = build_va_arg_indirect_ref (addr);
14465
14466 return addr;
14467 }
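/* In rough pseudo-code (a paraphrase of the trees built above, not a
   literal transcription), the expansion for a general-register candidate
   is:

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     if (alignof (type) > 8)              // dw_align
       off = (off + 15) & -16;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off + adjust;   // adjust = big-endian padding
     goto done;

   on_stack:
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
     addr = arg;                          // plus big-endian padding, if any

   The __vr_* fields are used instead for FP/SIMD candidates, and the HA
   case additionally copies each homogeneous element into a local "ha"
   temporary.  */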
14468
14469 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14470
14471 static void
14472 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14473 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14474 int no_rtl)
14475 {
14476 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14477 CUMULATIVE_ARGS local_cum;
14478 int gr_saved = cfun->va_list_gpr_size;
14479 int vr_saved = cfun->va_list_fpr_size;
14480
14481 /* The caller has advanced CUM up to, but not beyond, the last named
14482 argument. Advance a local copy of CUM past the last "real" named
14483 argument, to find out how many registers are left over. */
14484 local_cum = *cum;
14485 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
14486
14487 /* Find out how many registers we need to save.
14488 Honor the tree-stdarg analysis results. */
14489 if (cfun->va_list_gpr_size)
14490 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14491 cfun->va_list_gpr_size / UNITS_PER_WORD);
14492 if (cfun->va_list_fpr_size)
14493 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14494 cfun->va_list_fpr_size / UNITS_PER_VREG);
14495
14496 if (!TARGET_FLOAT)
14497 {
14498 gcc_assert (local_cum.aapcs_nvrn == 0);
14499 vr_saved = 0;
14500 }
14501
14502 if (!no_rtl)
14503 {
14504 if (gr_saved > 0)
14505 {
14506 rtx ptr, mem;
14507
14508 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14509 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14510 - gr_saved * UNITS_PER_WORD);
14511 mem = gen_frame_mem (BLKmode, ptr);
14512 set_mem_alias_set (mem, get_varargs_alias_set ());
14513
14514 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14515 mem, gr_saved);
14516 }
14517 if (vr_saved > 0)
14518 {
14519 /* We can't use move_block_from_reg, because it will use
14520 the wrong mode, storing D regs only. */
14521 machine_mode mode = TImode;
14522 int off, i, vr_start;
14523
14524 /* Set OFF to the offset from virtual_incoming_args_rtx of
14525 the first vector register. The VR save area lies below
14526 the GR one, and is aligned to 16 bytes. */
14527 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14528 STACK_BOUNDARY / BITS_PER_UNIT);
14529 off -= vr_saved * UNITS_PER_VREG;
14530
14531 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14532 for (i = 0; i < vr_saved; ++i)
14533 {
14534 rtx ptr, mem;
14535
14536 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14537 mem = gen_frame_mem (mode, ptr);
14538 set_mem_alias_set (mem, get_varargs_alias_set ());
14539 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14540 off += UNITS_PER_VREG;
14541 }
14542 }
14543 }
14544
14545 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14546 any complication of having crtl->args.pretend_args_size changed. */
14547 cfun->machine->frame.saved_varargs_size
14548 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14549 STACK_BOUNDARY / BITS_PER_UNIT)
14550 + vr_saved * UNITS_PER_VREG);
14551 }
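/* Continuing the hypothetical example given after va_start above (two named
   integer arguments, one named double): gr_saved is 6 and vr_saved is 7, so
   the code stores x2..x7 in a 48-byte block ending at
   virtual_incoming_args_rtx, stores q1..q7 as TImode values in the 112
   bytes below that block, and records
   saved_varargs_size = ROUND_UP (48, 16) + 112 = 160.  */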
14552
14553 static void
14554 aarch64_conditional_register_usage (void)
14555 {
14556 int i;
14557 if (!TARGET_FLOAT)
14558 {
14559 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14560 {
14561 fixed_regs[i] = 1;
14562 call_used_regs[i] = 1;
14563 }
14564 }
14565 if (!TARGET_SVE)
14566 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14567 {
14568 fixed_regs[i] = 1;
14569 call_used_regs[i] = 1;
14570 }
14571
14572 /* When tracking speculation, we need a couple of call-clobbered registers
14573 to track the speculation state. It would be nice to just use
14574 IP0 and IP1, but currently there are numerous places that just
14575 assume these registers are free for other uses (eg pointer
14576 authentication). */
14577 if (aarch64_track_speculation)
14578 {
14579 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14580 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14581 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14582 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14583 }
14584 }
14585
14586 /* Walk down the type tree of TYPE counting consecutive base elements.
14587 If *MODEP is VOIDmode, then set it to the first valid floating point
14588 type. If a non-floating point type is found, or if a floating point
14589 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14590 otherwise return the count in the sub-tree. */
14591 static int
14592 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14593 {
14594 machine_mode mode;
14595 HOST_WIDE_INT size;
14596
14597 switch (TREE_CODE (type))
14598 {
14599 case REAL_TYPE:
14600 mode = TYPE_MODE (type);
14601 if (mode != DFmode && mode != SFmode
14602 && mode != TFmode && mode != HFmode)
14603 return -1;
14604
14605 if (*modep == VOIDmode)
14606 *modep = mode;
14607
14608 if (*modep == mode)
14609 return 1;
14610
14611 break;
14612
14613 case COMPLEX_TYPE:
14614 mode = TYPE_MODE (TREE_TYPE (type));
14615 if (mode != DFmode && mode != SFmode
14616 && mode != TFmode && mode != HFmode)
14617 return -1;
14618
14619 if (*modep == VOIDmode)
14620 *modep = mode;
14621
14622 if (*modep == mode)
14623 return 2;
14624
14625 break;
14626
14627 case VECTOR_TYPE:
14628 /* Use V2SImode and V4SImode as representatives of all 64-bit
14629 and 128-bit vector types. */
14630 size = int_size_in_bytes (type);
14631 switch (size)
14632 {
14633 case 8:
14634 mode = V2SImode;
14635 break;
14636 case 16:
14637 mode = V4SImode;
14638 break;
14639 default:
14640 return -1;
14641 }
14642
14643 if (*modep == VOIDmode)
14644 *modep = mode;
14645
14646 /* Vector modes are considered to be opaque: two vectors are
14647 equivalent for the purposes of being homogeneous aggregates
14648 if they are the same size. */
14649 if (*modep == mode)
14650 return 1;
14651
14652 break;
14653
14654 case ARRAY_TYPE:
14655 {
14656 int count;
14657 tree index = TYPE_DOMAIN (type);
14658
14659 /* Can't handle incomplete types nor sizes that are not
14660 fixed. */
14661 if (!COMPLETE_TYPE_P (type)
14662 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14663 return -1;
14664
14665 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14666 if (count == -1
14667 || !index
14668 || !TYPE_MAX_VALUE (index)
14669 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14670 || !TYPE_MIN_VALUE (index)
14671 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14672 || count < 0)
14673 return -1;
14674
14675 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14676 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14677
14678 /* There must be no padding. */
14679 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14680 count * GET_MODE_BITSIZE (*modep)))
14681 return -1;
14682
14683 return count;
14684 }
14685
14686 case RECORD_TYPE:
14687 {
14688 int count = 0;
14689 int sub_count;
14690 tree field;
14691
14692 /* Can't handle incomplete types nor sizes that are not
14693 fixed. */
14694 if (!COMPLETE_TYPE_P (type)
14695 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14696 return -1;
14697
14698 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14699 {
14700 if (TREE_CODE (field) != FIELD_DECL)
14701 continue;
14702
14703 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14704 if (sub_count < 0)
14705 return -1;
14706 count += sub_count;
14707 }
14708
14709 /* There must be no padding. */
14710 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14711 count * GET_MODE_BITSIZE (*modep)))
14712 return -1;
14713
14714 return count;
14715 }
14716
14717 case UNION_TYPE:
14718 case QUAL_UNION_TYPE:
14719 {
14720 /* These aren't very interesting except in a degenerate case. */
14721 int count = 0;
14722 int sub_count;
14723 tree field;
14724
14725 /* Can't handle incomplete types nor sizes that are not
14726 fixed. */
14727 if (!COMPLETE_TYPE_P (type)
14728 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14729 return -1;
14730
14731 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14732 {
14733 if (TREE_CODE (field) != FIELD_DECL)
14734 continue;
14735
14736 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14737 if (sub_count < 0)
14738 return -1;
14739 count = count > sub_count ? count : sub_count;
14740 }
14741
14742 /* There must be no padding. */
14743 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14744 count * GET_MODE_BITSIZE (*modep)))
14745 return -1;
14746
14747 return count;
14748 }
14749
14750 default:
14751 break;
14752 }
14753
14754 return -1;
14755 }
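/* Illustrative examples (the types are hypothetical, not from the sources):
   "struct { float x, y, z; }" returns 3 with *MODEP set to SFmode;
   "_Complex double" returns 2 with *MODEP set to DFmode;
   "struct { float f; double d; }" returns -1, because the second element's
   mode does not match the SFmode recorded for the first.  */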
14756
14757 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14758 type as described in AAPCS64 \S 4.1.2.
14759
14760 See the comment above aarch64_composite_type_p for the notes on MODE. */
14761
14762 static bool
14763 aarch64_short_vector_p (const_tree type,
14764 machine_mode mode)
14765 {
14766 poly_int64 size = -1;
14767
14768 if (type && TREE_CODE (type) == VECTOR_TYPE)
14769 size = int_size_in_bytes (type);
14770 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14771 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14772 size = GET_MODE_SIZE (mode);
14773
14774 return known_eq (size, 8) || known_eq (size, 16);
14775 }
14776
14777 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14778 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14779 array types. The C99 floating-point complex types are also considered
14780 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14781 types, which are GCC extensions and out of the scope of AAPCS64, are
14782 treated as composite types here as well.
14783
14784 Note that MODE itself is not sufficient in determining whether a type
14785 is such a composite type or not. This is because
14786 stor-layout.c:compute_record_mode may have already changed the MODE
14787 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14788 structure with only one field may have its MODE set to the mode of the
14789 field. Also an integer mode whose size matches the size of the
14790 RECORD_TYPE type may be used to substitute the original mode
14791 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14792 solely relied on. */
14793
14794 static bool
14795 aarch64_composite_type_p (const_tree type,
14796 machine_mode mode)
14797 {
14798 if (aarch64_short_vector_p (type, mode))
14799 return false;
14800
14801 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14802 return true;
14803
14804 if (mode == BLKmode
14805 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14806 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14807 return true;
14808
14809 return false;
14810 }
14811
14812 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14813 shall be passed or returned in simd/fp register(s) (providing these
14814 parameter passing registers are available).
14815
14816 Upon successful return, *COUNT returns the number of needed registers,
14817 *BASE_MODE returns the mode of the individual register, and when IS_HA
14818 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14819 floating-point aggregate or a homogeneous short-vector aggregate. */
14820
14821 static bool
14822 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14823 const_tree type,
14824 machine_mode *base_mode,
14825 int *count,
14826 bool *is_ha)
14827 {
14828 machine_mode new_mode = VOIDmode;
14829 bool composite_p = aarch64_composite_type_p (type, mode);
14830
14831 if (is_ha != NULL) *is_ha = false;
14832
14833 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14834 || aarch64_short_vector_p (type, mode))
14835 {
14836 *count = 1;
14837 new_mode = mode;
14838 }
14839 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14840 {
14841 if (is_ha != NULL) *is_ha = true;
14842 *count = 2;
14843 new_mode = GET_MODE_INNER (mode);
14844 }
14845 else if (type && composite_p)
14846 {
14847 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14848
14849 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14850 {
14851 if (is_ha != NULL) *is_ha = true;
14852 *count = ag_count;
14853 }
14854 else
14855 return false;
14856 }
14857 else
14858 return false;
14859
14860 *base_mode = new_mode;
14861 return true;
14862 }
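/* For instance (hypothetical argument type): "struct { double x, y; }" is a
   homogeneous floating-point aggregate, so this returns true with
   *COUNT = 2, *BASE_MODE = DFmode and *IS_HA = true, i.e. the argument is
   passed in two consecutive FP/SIMD registers when enough of them are
   available.  An aggregate with more than HA_MAX_NUM_FLDS (4) elements is
   rejected.  */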
14863
14864 /* Implement TARGET_STRUCT_VALUE_RTX. */
14865
14866 static rtx
14867 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14868 int incoming ATTRIBUTE_UNUSED)
14869 {
14870 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14871 }
14872
14873 /* Implements target hook vector_mode_supported_p. */
14874 static bool
14875 aarch64_vector_mode_supported_p (machine_mode mode)
14876 {
14877 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14878 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14879 }
14880
14881 /* Return the full-width SVE vector mode for element mode MODE, if one
14882 exists. */
14883 opt_machine_mode
14884 aarch64_full_sve_mode (scalar_mode mode)
14885 {
14886 switch (mode)
14887 {
14888 case E_DFmode:
14889 return VNx2DFmode;
14890 case E_SFmode:
14891 return VNx4SFmode;
14892 case E_HFmode:
14893 return VNx8HFmode;
14894 case E_DImode:
14895 return VNx2DImode;
14896 case E_SImode:
14897 return VNx4SImode;
14898 case E_HImode:
14899 return VNx8HImode;
14900 case E_QImode:
14901 return VNx16QImode;
14902 default:
14903 return opt_machine_mode ();
14904 }
14905 }
14906
14907 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14908 if it exists. */
14909 opt_machine_mode
14910 aarch64_vq_mode (scalar_mode mode)
14911 {
14912 switch (mode)
14913 {
14914 case E_DFmode:
14915 return V2DFmode;
14916 case E_SFmode:
14917 return V4SFmode;
14918 case E_HFmode:
14919 return V8HFmode;
14920 case E_SImode:
14921 return V4SImode;
14922 case E_HImode:
14923 return V8HImode;
14924 case E_QImode:
14925 return V16QImode;
14926 case E_DImode:
14927 return V2DImode;
14928 default:
14929 return opt_machine_mode ();
14930 }
14931 }
14932
14933 /* Return the appropriate SIMD container mode
14934 for MODE within a vector of WIDTH bits. */
14935 static machine_mode
14936 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14937 {
14938 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14939 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14940
14941 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14942 if (TARGET_SIMD)
14943 {
14944 if (known_eq (width, 128))
14945 return aarch64_vq_mode (mode).else_mode (word_mode);
14946 else
14947 switch (mode)
14948 {
14949 case E_SFmode:
14950 return V2SFmode;
14951 case E_HFmode:
14952 return V4HFmode;
14953 case E_SImode:
14954 return V2SImode;
14955 case E_HImode:
14956 return V4HImode;
14957 case E_QImode:
14958 return V8QImode;
14959 default:
14960 break;
14961 }
14962 }
14963 return word_mode;
14964 }
14965
14966 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14967 static machine_mode
14968 aarch64_preferred_simd_mode (scalar_mode mode)
14969 {
14970 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14971 return aarch64_simd_container_mode (mode, bits);
14972 }
14973
14974 /* Return a list of possible vector sizes for the vectorizer
14975 to iterate over. */
14976 static void
14977 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14978 {
14979 if (TARGET_SVE)
14980 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14981 sizes->safe_push (16);
14982 sizes->safe_push (8);
14983 }
14984
14985 /* Implement TARGET_MANGLE_TYPE. */
14986
14987 static const char *
14988 aarch64_mangle_type (const_tree type)
14989 {
14990 /* The AArch64 ABI documents say that "__va_list" has to be
14991 mangled as if it is in the "std" namespace. */
14992 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14993 return "St9__va_list";
14994
14995 /* Half-precision float. */
14996 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14997 return "Dh";
14998
14999 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
15000 builtin types. */
15001 if (TYPE_NAME (type) != NULL)
15002 return aarch64_mangle_builtin_type (type);
15003
15004 /* Use the default mangling. */
15005 return NULL;
15006 }
15007
15008 /* Find the first rtx_insn before insn that will generate an assembly
15009 instruction. */
15010
15011 static rtx_insn *
15012 aarch64_prev_real_insn (rtx_insn *insn)
15013 {
15014 if (!insn)
15015 return NULL;
15016
15017 do
15018 {
15019 insn = prev_real_insn (insn);
15020 }
15021 while (insn && recog_memoized (insn) < 0);
15022
15023 return insn;
15024 }
15025
15026 static bool
15027 is_madd_op (enum attr_type t1)
15028 {
15029 unsigned int i;
15030 /* A number of these may be AArch32 only. */
15031 enum attr_type mlatypes[] = {
15032 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
15033 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
15034 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
15035 };
15036
15037 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
15038 {
15039 if (t1 == mlatypes[i])
15040 return true;
15041 }
15042
15043 return false;
15044 }
15045
15046 /* Check if there is a register dependency between a load and the insn
15047 for which we hold recog_data. */
15048
15049 static bool
15050 dep_between_memop_and_curr (rtx memop)
15051 {
15052 rtx load_reg;
15053 int opno;
15054
15055 gcc_assert (GET_CODE (memop) == SET);
15056
15057 if (!REG_P (SET_DEST (memop)))
15058 return false;
15059
15060 load_reg = SET_DEST (memop);
15061 for (opno = 1; opno < recog_data.n_operands; opno++)
15062 {
15063 rtx operand = recog_data.operand[opno];
15064 if (REG_P (operand)
15065 && reg_overlap_mentioned_p (load_reg, operand))
15066 return true;
15067
15068 }
15069 return false;
15070 }
15071
15072
15073 /* When working around the Cortex-A53 erratum 835769,
15074 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
15075 instruction and has a preceding memory instruction such that a NOP
15076 should be inserted between them. */
15077
15078 bool
15079 aarch64_madd_needs_nop (rtx_insn* insn)
15080 {
15081 enum attr_type attr_type;
15082 rtx_insn *prev;
15083 rtx body;
15084
15085 if (!TARGET_FIX_ERR_A53_835769)
15086 return false;
15087
15088 if (!INSN_P (insn) || recog_memoized (insn) < 0)
15089 return false;
15090
15091 attr_type = get_attr_type (insn);
15092 if (!is_madd_op (attr_type))
15093 return false;
15094
15095 prev = aarch64_prev_real_insn (insn);
15096 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
15097 Restore recog state to INSN to avoid state corruption. */
15098 extract_constrain_insn_cached (insn);
15099
15100 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
15101 return false;
15102
15103 body = single_set (prev);
15104
15105 /* If the previous insn is a memory op and there is no dependency between
15106 it and the DImode madd, emit a NOP between them. If body is NULL then we
15107 have a complex memory operation, probably a load/store pair.
15108 Be conservative for now and emit a NOP. */
15109 if (GET_MODE (recog_data.operand[0]) == DImode
15110 && (!body || !dep_between_memop_and_curr (body)))
15111 return true;
15112
15113 return false;
15114
15115 }
15116
15117
15118 /* Implement FINAL_PRESCAN_INSN. */
15119
15120 void
15121 aarch64_final_prescan_insn (rtx_insn *insn)
15122 {
15123 if (aarch64_madd_needs_nop (insn))
15124 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
15125 }
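/* As an illustration (assembly sketched by hand, not taken from a real
   compilation), with the workaround enabled the output around an affected
   sequence would look like:

       ldr     x2, [x0]
       nop     // between mem op and mult-accumulate
       madd    x1, x3, x4, x1

   The NOP is emitted only when the multiply-accumulate is 64-bit and the
   preceding memory operation either cannot be analyzed (no single_set) or
   has no register dependency on the multiply-accumulate.  */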
15126
15127
15128 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
15129 instruction. */
15130
15131 bool
15132 aarch64_sve_index_immediate_p (rtx base_or_step)
15133 {
15134 return (CONST_INT_P (base_or_step)
15135 && IN_RANGE (INTVAL (base_or_step), -16, 15));
15136 }
15137
15138 /* Return true if X is a valid immediate for the SVE ADD and SUB
15139 instructions. Negate X first if NEGATE_P is true. */
15140
15141 bool
15142 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
15143 {
15144 rtx elt;
15145
15146 if (!const_vec_duplicate_p (x, &elt)
15147 || !CONST_INT_P (elt))
15148 return false;
15149
15150 HOST_WIDE_INT val = INTVAL (elt);
15151 if (negate_p)
15152 val = -val;
15153 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
15154
15155 if (val & 0xff)
15156 return IN_RANGE (val, 0, 0xff);
15157 return IN_RANGE (val, 0, 0xff00);
15158 }
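/* Restated with example values (illustrative only): the replicated element,
   after optional negation, must be either an unsigned byte (0..0xff) or an
   unsigned byte shifted left by 8 (0x100, 0x200, ..., 0xff00).  So #255 and
   #0x1200 are accepted, while #257 (0x101) is rejected.  */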
15159
15160 /* Return true if X is a valid immediate operand for an SVE logical
15161 instruction such as AND. */
15162
15163 bool
15164 aarch64_sve_bitmask_immediate_p (rtx x)
15165 {
15166 rtx elt;
15167
15168 return (const_vec_duplicate_p (x, &elt)
15169 && CONST_INT_P (elt)
15170 && aarch64_bitmask_imm (INTVAL (elt),
15171 GET_MODE_INNER (GET_MODE (x))));
15172 }
15173
15174 /* Return true if X is a valid immediate for the SVE DUP and CPY
15175 instructions. */
15176
15177 bool
15178 aarch64_sve_dup_immediate_p (rtx x)
15179 {
15180 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
15181 if (!CONST_INT_P (x))
15182 return false;
15183
15184 HOST_WIDE_INT val = INTVAL (x);
15185 if (val & 0xff)
15186 return IN_RANGE (val, -0x80, 0x7f);
15187 return IN_RANGE (val, -0x8000, 0x7f00);
15188 }
15189
15190 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
15191 SIGNED_P says whether the operand is signed rather than unsigned. */
15192
15193 bool
15194 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
15195 {
15196 rtx elt;
15197
15198 return (const_vec_duplicate_p (x, &elt)
15199 && CONST_INT_P (elt)
15200 && (signed_p
15201 ? IN_RANGE (INTVAL (elt), -16, 15)
15202 : IN_RANGE (INTVAL (elt), 0, 127)));
15203 }
15204
15205 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
15206 instruction. Negate X first if NEGATE_P is true. */
15207
15208 bool
15209 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
15210 {
15211 rtx elt;
15212 REAL_VALUE_TYPE r;
15213
15214 if (!const_vec_duplicate_p (x, &elt)
15215 || GET_CODE (elt) != CONST_DOUBLE)
15216 return false;
15217
15218 r = *CONST_DOUBLE_REAL_VALUE (elt);
15219
15220 if (negate_p)
15221 r = real_value_negate (&r);
15222
15223 if (real_equal (&r, &dconst1))
15224 return true;
15225 if (real_equal (&r, &dconsthalf))
15226 return true;
15227 return false;
15228 }
15229
15230 /* Return true if X is a valid immediate operand for an SVE FMUL
15231 instruction. */
15232
15233 bool
15234 aarch64_sve_float_mul_immediate_p (rtx x)
15235 {
15236 rtx elt;
15237
15238 return (const_vec_duplicate_p (x, &elt)
15239 && GET_CODE (elt) == CONST_DOUBLE
15240 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
15241 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
15242 }
15243
15244 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
15245 for the Advanced SIMD operation described by WHICH and INSN. If INFO
15246 is nonnull, use it to describe valid immediates. */
15247 static bool
15248 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
15249 simd_immediate_info *info,
15250 enum simd_immediate_check which,
15251 simd_immediate_info::insn_type insn)
15252 {
15253 /* Try a 4-byte immediate with LSL. */
15254 for (unsigned int shift = 0; shift < 32; shift += 8)
15255 if ((val32 & (0xff << shift)) == val32)
15256 {
15257 if (info)
15258 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15259 simd_immediate_info::LSL, shift);
15260 return true;
15261 }
15262
15263 /* Try a 2-byte immediate with LSL. */
15264 unsigned int imm16 = val32 & 0xffff;
15265 if (imm16 == (val32 >> 16))
15266 for (unsigned int shift = 0; shift < 16; shift += 8)
15267 if ((imm16 & (0xff << shift)) == imm16)
15268 {
15269 if (info)
15270 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15271 simd_immediate_info::LSL, shift);
15272 return true;
15273 }
15274
15275 /* Try a 4-byte immediate with MSL, except for cases that MVN
15276 can handle. */
15277 if (which == AARCH64_CHECK_MOV)
15278 for (unsigned int shift = 8; shift < 24; shift += 8)
15279 {
15280 unsigned int low = (1 << shift) - 1;
15281 if (((val32 & (0xff << shift)) | low) == val32)
15282 {
15283 if (info)
15284 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15285 simd_immediate_info::MSL, shift);
15286 return true;
15287 }
15288 }
15289
15290 return false;
15291 }
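/* Example encodings (values chosen for illustration): 0x00ab0000 is matched
   as a 4-byte immediate #0xab, LSL #16; 0xab00ab00 is matched as a 2-byte
   immediate #0xab, LSL #8; and for MOV checks 0x0000abff is matched as
   #0xab, MSL #8, because all bits below the shifted byte are ones.  */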
15292
15293 /* Return true if replicating VAL64 is a valid immediate for the
15294 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15295 use it to describe valid immediates. */
15296 static bool
15297 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15298 simd_immediate_info *info,
15299 enum simd_immediate_check which)
15300 {
15301 unsigned int val32 = val64 & 0xffffffff;
15302 unsigned int val16 = val64 & 0xffff;
15303 unsigned int val8 = val64 & 0xff;
15304
15305 if (val32 == (val64 >> 32))
15306 {
15307 if ((which & AARCH64_CHECK_ORR) != 0
15308 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15309 simd_immediate_info::MOV))
15310 return true;
15311
15312 if ((which & AARCH64_CHECK_BIC) != 0
15313 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15314 simd_immediate_info::MVN))
15315 return true;
15316
15317 /* Try using a replicated byte. */
15318 if (which == AARCH64_CHECK_MOV
15319 && val16 == (val32 >> 16)
15320 && val8 == (val16 >> 8))
15321 {
15322 if (info)
15323 *info = simd_immediate_info (QImode, val8);
15324 return true;
15325 }
15326 }
15327
15328 /* Try using a bit-to-bytemask. */
15329 if (which == AARCH64_CHECK_MOV)
15330 {
15331 unsigned int i;
15332 for (i = 0; i < 64; i += 8)
15333 {
15334 unsigned char byte = (val64 >> i) & 0xff;
15335 if (byte != 0 && byte != 0xff)
15336 break;
15337 }
15338 if (i == 64)
15339 {
15340 if (info)
15341 *info = simd_immediate_info (DImode, val64);
15342 return true;
15343 }
15344 }
15345 return false;
15346 }
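/* Two further illustrative cases (example values only):
   0x2a2a2a2a2a2a2a2a repeats a single byte and is therefore accepted as a
   QImode MOVI immediate; 0xff0000ff00ffff00 contains only 0x00 and 0xff
   bytes and is therefore accepted by the bit-to-bytemask check as a DImode
   MOVI immediate.  */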
15347
15348 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15349 instruction. If INFO is nonnull, use it to describe valid immediates. */
15350
15351 static bool
15352 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15353 simd_immediate_info *info)
15354 {
15355 scalar_int_mode mode = DImode;
15356 unsigned int val32 = val64 & 0xffffffff;
15357 if (val32 == (val64 >> 32))
15358 {
15359 mode = SImode;
15360 unsigned int val16 = val32 & 0xffff;
15361 if (val16 == (val32 >> 16))
15362 {
15363 mode = HImode;
15364 unsigned int val8 = val16 & 0xff;
15365 if (val8 == (val16 >> 8))
15366 mode = QImode;
15367 }
15368 }
15369 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15370 if (IN_RANGE (val, -0x80, 0x7f))
15371 {
15372 /* DUP with no shift. */
15373 if (info)
15374 *info = simd_immediate_info (mode, val);
15375 return true;
15376 }
15377 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15378 {
15379 /* DUP with LSL #8. */
15380 if (info)
15381 *info = simd_immediate_info (mode, val);
15382 return true;
15383 }
15384 if (aarch64_bitmask_imm (val64, mode))
15385 {
15386 /* DUPM. */
15387 if (info)
15388 *info = simd_immediate_info (mode, val);
15389 return true;
15390 }
15391 return false;
15392 }
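/* Examples (illustrative values only): replicating 0x01 in every byte gives
   QImode value 1, matched by the plain DUP case; replicating 0x1200 in
   every halfword gives HImode value 0x1200, matched by DUP with LSL #8; a
   pattern such as 0x00ff repeated in every halfword is outside the DUP
   ranges and falls through to the bitmask-immediate (DUPM) check.  */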
15393
15394 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15395 it to describe valid immediates. */
15396
15397 static bool
15398 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15399 {
15400 if (x == CONST0_RTX (GET_MODE (x)))
15401 {
15402 if (info)
15403 *info = simd_immediate_info (DImode, 0);
15404 return true;
15405 }
15406
15407 /* Analyze the value as a VNx16BImode constant. This should be relatively
15408 efficient, since rtx_vector_builder has enough built-in capacity
15409 to store all VLA predicate constants without needing the heap. */
15410 rtx_vector_builder builder;
15411 if (!aarch64_get_sve_pred_bits (builder, x))
15412 return false;
15413
15414 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15415 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15416 {
15417 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15418 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15419 if (pattern != AARCH64_NUM_SVPATTERNS)
15420 {
15421 if (info)
15422 {
15423 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15424 *info = simd_immediate_info (int_mode, pattern);
15425 }
15426 return true;
15427 }
15428 }
15429 return false;
15430 }
15431
15432 /* Return true if OP is a valid SIMD immediate for the operation
15433 described by WHICH. If INFO is nonnull, use it to describe valid
15434 immediates. */
15435 bool
15436 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15437 enum simd_immediate_check which)
15438 {
15439 machine_mode mode = GET_MODE (op);
15440 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15441 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15442 return false;
15443
15444 if (vec_flags & VEC_SVE_PRED)
15445 return aarch64_sve_pred_valid_immediate (op, info);
15446
15447 scalar_mode elt_mode = GET_MODE_INNER (mode);
15448 rtx base, step;
15449 unsigned int n_elts;
15450 if (GET_CODE (op) == CONST_VECTOR
15451 && CONST_VECTOR_DUPLICATE_P (op))
15452 n_elts = CONST_VECTOR_NPATTERNS (op);
15453 else if ((vec_flags & VEC_SVE_DATA)
15454 && const_vec_series_p (op, &base, &step))
15455 {
15456 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15457 if (!aarch64_sve_index_immediate_p (base)
15458 || !aarch64_sve_index_immediate_p (step))
15459 return false;
15460
15461 if (info)
15462 *info = simd_immediate_info (elt_mode, base, step);
15463 return true;
15464 }
15465 else if (GET_CODE (op) == CONST_VECTOR
15466 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15467 /* N_ELTS set above. */;
15468 else
15469 return false;
15470
15471 scalar_float_mode elt_float_mode;
15472 if (n_elts == 1
15473 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15474 {
15475 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15476 if (aarch64_float_const_zero_rtx_p (elt)
15477 || aarch64_float_const_representable_p (elt))
15478 {
15479 if (info)
15480 *info = simd_immediate_info (elt_float_mode, elt);
15481 return true;
15482 }
15483 }
15484
15485 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15486 if (elt_size > 8)
15487 return false;
15488
15489 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15490
15491 /* Expand the vector constant out into a byte vector, with the least
15492 significant byte of the register first. */
15493 auto_vec<unsigned char, 16> bytes;
15494 bytes.reserve (n_elts * elt_size);
15495 for (unsigned int i = 0; i < n_elts; i++)
15496 {
15497 /* The vector is provided in gcc endian-neutral fashion.
15498 For aarch64_be Advanced SIMD, it must be laid out in the vector
15499 register in reverse order. */
15500 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15501 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15502
15503 if (elt_mode != elt_int_mode)
15504 elt = gen_lowpart (elt_int_mode, elt);
15505
15506 if (!CONST_INT_P (elt))
15507 return false;
15508
15509 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15510 for (unsigned int byte = 0; byte < elt_size; byte++)
15511 {
15512 bytes.quick_push (elt_val & 0xff);
15513 elt_val >>= BITS_PER_UNIT;
15514 }
15515 }
15516
15517 /* The immediate must repeat every eight bytes. */
15518 unsigned int nbytes = bytes.length ();
15519 for (unsigned i = 8; i < nbytes; ++i)
15520 if (bytes[i] != bytes[i - 8])
15521 return false;
15522
15523 /* Get the repeating 8-byte value as an integer. No endian correction
15524 is needed here because bytes is already in lsb-first order. */
15525 unsigned HOST_WIDE_INT val64 = 0;
15526 for (unsigned int i = 0; i < 8; i++)
15527 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15528 << (i * BITS_PER_UNIT));
15529
15530 if (vec_flags & VEC_SVE_DATA)
15531 return aarch64_sve_valid_immediate (val64, info);
15532 else
15533 return aarch64_advsimd_valid_immediate (val64, info, which);
15534 }
15535
15536 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15537 has a step in the range of INDEX. Return the index expression if so,
15538 otherwise return null. */
15539 rtx
15540 aarch64_check_zero_based_sve_index_immediate (rtx x)
15541 {
15542 rtx base, step;
15543 if (const_vec_series_p (x, &base, &step)
15544 && base == const0_rtx
15545 && aarch64_sve_index_immediate_p (step))
15546 return step;
15547 return NULL_RTX;
15548 }
15549
15550 /* Check if immediate shift constants are within range. */
15551 bool
15552 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15553 {
15554 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15555 if (left)
15556 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15557 else
15558 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15559 }
15560
15561 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15562 operation of width WIDTH at bit position POS. */
15563
15564 rtx
15565 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15566 {
15567 gcc_assert (CONST_INT_P (width));
15568 gcc_assert (CONST_INT_P (pos));
15569
15570 unsigned HOST_WIDE_INT mask
15571 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15572 return GEN_INT (mask << UINTVAL (pos));
15573 }
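/* For example (values chosen for illustration): WIDTH = 8 and POS = 16 give
   ((1 << 8) - 1) << 16 = 0xff0000, i.e. the mask selecting bits 16..23.  */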
15574
15575 bool
15576 aarch64_mov_operand_p (rtx x, machine_mode mode)
15577 {
15578 if (GET_CODE (x) == HIGH
15579 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15580 return true;
15581
15582 if (CONST_INT_P (x))
15583 return true;
15584
15585 if (VECTOR_MODE_P (GET_MODE (x)))
15586 {
15587 /* Require predicate constants to be VNx16BI before RA, so that we
15588 force everything to have a canonical form. */
15589 if (!lra_in_progress
15590 && !reload_completed
15591 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15592 && GET_MODE (x) != VNx16BImode)
15593 return false;
15594
15595 return aarch64_simd_valid_immediate (x, NULL);
15596 }
15597
15598 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15599 return true;
15600
15601 if (aarch64_sve_cnt_immediate_p (x))
15602 return true;
15603
15604 return aarch64_classify_symbolic_expression (x)
15605 == SYMBOL_TINY_ABSOLUTE;
15606 }
15607
15608 /* Return a const_int vector of VAL. */
15609 rtx
15610 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15611 {
15612 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15613 return gen_const_vec_duplicate (mode, c);
15614 }
15615
15616 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15617
15618 bool
15619 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15620 {
15621 machine_mode vmode;
15622
15623 vmode = aarch64_simd_container_mode (mode, 64);
15624 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15625 return aarch64_simd_valid_immediate (op_v, NULL);
15626 }
15627
15628 /* Construct and return a PARALLEL RTX vector with elements numbering the
15629 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15630 the vector - from the perspective of the architecture. This does not
15631 line up with GCC's perspective on lane numbers, so we end up with
15632 different masks depending on our target endian-ness. The diagram
15633 below may help. We must draw the distinction when building masks
15634 which select one half of the vector. An instruction selecting
15635 architectural low-lanes for a big-endian target must be described using
15636 a mask selecting GCC high-lanes.
15637
15638 Big-Endian Little-Endian
15639
15640 GCC 0 1 2 3 3 2 1 0
15641 | x | x | x | x | | x | x | x | x |
15642 Architecture 3 2 1 0 3 2 1 0
15643
15644 Low Mask: { 2, 3 } { 0, 1 }
15645 High Mask: { 0, 1 } { 2, 3 }
15646
15647 MODE Is the mode of the vector and NUNITS is the number of units in it. */
15648
15649 rtx
15650 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15651 {
15652 rtvec v = rtvec_alloc (nunits / 2);
15653 int high_base = nunits / 2;
15654 int low_base = 0;
15655 int base;
15656 rtx t1;
15657 int i;
15658
15659 if (BYTES_BIG_ENDIAN)
15660 base = high ? low_base : high_base;
15661 else
15662 base = high ? high_base : low_base;
15663
15664 for (i = 0; i < nunits / 2; i++)
15665 RTVEC_ELT (v, i) = GEN_INT (base + i);
15666
15667 t1 = gen_rtx_PARALLEL (mode, v);
15668 return t1;
15669 }
15670
15671 /* Check OP for validity as a PARALLEL RTX vector with elements
15672 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15673 from the perspective of the architecture. See the diagram above
15674 aarch64_simd_vect_par_cnst_half for more details. */
15675
15676 bool
15677 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15678 bool high)
15679 {
15680 int nelts;
15681 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15682 return false;
15683
15684 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15685 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15686 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15687 int i = 0;
15688
15689 if (count_op != count_ideal)
15690 return false;
15691
15692 for (i = 0; i < count_ideal; i++)
15693 {
15694 rtx elt_op = XVECEXP (op, 0, i);
15695 rtx elt_ideal = XVECEXP (ideal, 0, i);
15696
15697 if (!CONST_INT_P (elt_op)
15698 || INTVAL (elt_ideal) != INTVAL (elt_op))
15699 return false;
15700 }
15701 return true;
15702 }
15703
15704 /* Return a PARALLEL containing NELTS elements, with element I equal
15705 to BASE + I * STEP. */
15706
15707 rtx
15708 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15709 {
15710 rtvec vec = rtvec_alloc (nelts);
15711 for (unsigned int i = 0; i < nelts; ++i)
15712 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15713 return gen_rtx_PARALLEL (VOIDmode, vec);
15714 }
15715
15716 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15717 series with step STEP. */
15718
15719 bool
15720 aarch64_stepped_int_parallel_p (rtx op, int step)
15721 {
15722 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15723 return false;
15724
15725 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15726 for (int i = 1; i < XVECLEN (op, 0); ++i)
15727 if (!CONST_INT_P (XVECEXP (op, 0, i))
15728 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15729 return false;
15730
15731 return true;
15732 }
15733
15734 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15735 HIGH (exclusive). */
15736 void
15737 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15738 const_tree exp)
15739 {
15740 HOST_WIDE_INT lane;
15741 gcc_assert (CONST_INT_P (operand));
15742 lane = INTVAL (operand);
15743
15744 if (lane < low || lane >= high)
15745 {
15746 if (exp)
15747 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15748 else
15749 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15750 }
15751 }
15752
15753 /* Perform endian correction on lane number N, which indexes a vector
15754 of mode MODE, and return the result as an SImode rtx. */
15755
15756 rtx
15757 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15758 {
15759 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15760 }
15761
15762 /* Return TRUE if OP is a valid vector addressing mode. */
15763
15764 bool
15765 aarch64_simd_mem_operand_p (rtx op)
15766 {
15767 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15768 || REG_P (XEXP (op, 0)));
15769 }
15770
15771 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15772
15773 bool
15774 aarch64_sve_ld1r_operand_p (rtx op)
15775 {
15776 struct aarch64_address_info addr;
15777 scalar_mode mode;
15778
15779 return (MEM_P (op)
15780 && is_a <scalar_mode> (GET_MODE (op), &mode)
15781 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15782 && addr.type == ADDRESS_REG_IMM
15783 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15784 }
15785
15786 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15787 bool
15788 aarch64_sve_ld1rq_operand_p (rtx op)
15789 {
15790 struct aarch64_address_info addr;
15791 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15792 if (!MEM_P (op)
15793 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15794 return false;
15795
15796 if (addr.type == ADDRESS_REG_IMM)
15797 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15798
15799 if (addr.type == ADDRESS_REG_REG)
15800 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15801
15802 return false;
15803 }
15804
15805 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15806 The conditions for STR are the same. */
15807 bool
15808 aarch64_sve_ldr_operand_p (rtx op)
15809 {
15810 struct aarch64_address_info addr;
15811
15812 return (MEM_P (op)
15813 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15814 false, ADDR_QUERY_ANY)
15815 && addr.type == ADDRESS_REG_IMM);
15816 }
15817
15818 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15819 We need to be able to access the individual pieces, so the range
15820 is different from LD[234] and ST[234]. */
15821 bool
15822 aarch64_sve_struct_memory_operand_p (rtx op)
15823 {
15824 if (!MEM_P (op))
15825 return false;
15826
15827 machine_mode mode = GET_MODE (op);
15828 struct aarch64_address_info addr;
15829 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15830 ADDR_QUERY_ANY)
15831 || addr.type != ADDRESS_REG_IMM)
15832 return false;
15833
15834 poly_int64 first = addr.const_offset;
15835 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15836 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15837 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15838 }
15839
15840 /* Emit a register copy from operand to operand, taking care not to
15841 early-clobber source registers in the process.
15842
15843 COUNT is the number of components into which the copy needs to be
15844 decomposed. */
15845 void
15846 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15847 unsigned int count)
15848 {
15849 unsigned int i;
15850 int rdest = REGNO (operands[0]);
15851 int rsrc = REGNO (operands[1]);
15852
15853 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15854 || rdest < rsrc)
15855 for (i = 0; i < count; i++)
15856 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15857 gen_rtx_REG (mode, rsrc + i));
15858 else
15859 for (i = 0; i < count; i++)
15860 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15861 gen_rtx_REG (mode, rsrc + count - i - 1));
15862 }
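/* For instance (an illustrative case, not from the sources): moving a
   two-register group from {v1, v2} to {v2, v3} overlaps with rdest > rsrc,
   so the loop above copies v2->v3 first and then v1->v2, avoiding an early
   clobber of v2.  Non-overlapping moves, or moves with rdest < rsrc, copy
   in ascending order.  */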
15863
15864 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15865 one of the VSTRUCT modes: OI, CI, or XI. */
15866 int
15867 aarch64_simd_attr_length_rglist (machine_mode mode)
15868 {
15869 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15870 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15871 }
15872
15873 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15874 alignment of a vector to 128 bits. SVE predicates have an alignment of
15875 16 bits. */
15876 static HOST_WIDE_INT
15877 aarch64_simd_vector_alignment (const_tree type)
15878 {
15879 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15880 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15881 be set for non-predicate vectors of booleans. Modes are the most
15882 direct way we have of identifying real SVE predicate types. */
15883 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15884 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15885 }
15886
15887 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15888 static poly_uint64
15889 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15890 {
15891 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15892 {
15893 /* If the length of the vector is fixed, try to align to that length,
15894 otherwise don't try to align at all. */
15895 HOST_WIDE_INT result;
15896 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15897 result = TYPE_ALIGN (TREE_TYPE (type));
15898 return result;
15899 }
15900 return TYPE_ALIGN (type);
15901 }
15902
15903 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15904 static bool
15905 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15906 {
15907 if (is_packed)
15908 return false;
15909
15910 /* For fixed-length vectors, check that the vectorizer will aim for
15911 full-vector alignment. This isn't true for generic GCC vectors
15912 that are wider than the ABI maximum of 128 bits. */
15913 poly_uint64 preferred_alignment =
15914 aarch64_vectorize_preferred_vector_alignment (type);
15915 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15916 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15917 preferred_alignment))
15918 return false;
15919
15920 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15921 return true;
15922 }
15923
15924 /* Return true if the vector misalignment factor is supported by the
15925 target. */
15926 static bool
15927 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15928 const_tree type, int misalignment,
15929 bool is_packed)
15930 {
15931 if (TARGET_SIMD && STRICT_ALIGNMENT)
15932 {
15933 /* Return false if the movmisalign pattern is not supported for this mode. */
15934 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15935 return false;
15936
15937 /* Misalignment factor is unknown at compile time. */
15938 if (misalignment == -1)
15939 return false;
15940 }
15941 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15942 is_packed);
15943 }
15944
15945 /* If VALS is a vector constant that can be loaded into a register
15946 using DUP, generate instructions to do so and return an RTX to
15947 assign to the register. Otherwise return NULL_RTX. */
15948 static rtx
15949 aarch64_simd_dup_constant (rtx vals)
15950 {
15951 machine_mode mode = GET_MODE (vals);
15952 machine_mode inner_mode = GET_MODE_INNER (mode);
15953 rtx x;
15954
15955 if (!const_vec_duplicate_p (vals, &x))
15956 return NULL_RTX;
15957
15958 /* We can load this constant by using DUP and a constant in a
15959 single general-purpose register. This will be cheaper than a vector
15960 load. */
15961 x = copy_to_mode_reg (inner_mode, x);
15962 return gen_vec_duplicate (mode, x);
15963 }
15964
15965
15966 /* Generate code to load VALS, which is a PARALLEL containing only
15967 constants (for vec_init) or CONST_VECTOR, efficiently into a
15968 register. Returns an RTX to copy into the register, or NULL_RTX
15969 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15970 static rtx
15971 aarch64_simd_make_constant (rtx vals)
15972 {
15973 machine_mode mode = GET_MODE (vals);
15974 rtx const_dup;
15975 rtx const_vec = NULL_RTX;
15976 int n_const = 0;
15977 int i;
15978
15979 if (GET_CODE (vals) == CONST_VECTOR)
15980 const_vec = vals;
15981 else if (GET_CODE (vals) == PARALLEL)
15982 {
15983 /* A CONST_VECTOR must contain only CONST_INTs and
15984 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15985 Only store valid constants in a CONST_VECTOR. */
15986 int n_elts = XVECLEN (vals, 0);
15987 for (i = 0; i < n_elts; ++i)
15988 {
15989 rtx x = XVECEXP (vals, 0, i);
15990 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15991 n_const++;
15992 }
15993 if (n_const == n_elts)
15994 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15995 }
15996 else
15997 gcc_unreachable ();
15998
15999 if (const_vec != NULL_RTX
16000 && aarch64_simd_valid_immediate (const_vec, NULL))
16001 /* Load using MOVI/MVNI. */
16002 return const_vec;
16003 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
16004 /* Loaded using DUP. */
16005 return const_dup;
16006 else if (const_vec != NULL_RTX)
16007 /* Load from constant pool. We cannot take advantage of single-cycle
16008 LD1 because we need a PC-relative addressing mode. */
16009 return const_vec;
16010 else
16011 /* A PARALLEL containing something not valid inside CONST_VECTOR.
16012 We cannot construct an initializer. */
16013 return NULL_RTX;
16014 }
16015
16016 /* Expand a vector initialisation sequence, such that TARGET is
16017 initialised to contain VALS. */
16018
16019 void
16020 aarch64_expand_vector_init (rtx target, rtx vals)
16021 {
16022 machine_mode mode = GET_MODE (target);
16023 scalar_mode inner_mode = GET_MODE_INNER (mode);
16024 /* The number of vector elements. */
16025 int n_elts = XVECLEN (vals, 0);
16026 /* The number of vector elements which are not constant. */
16027 int n_var = 0;
16028 rtx any_const = NULL_RTX;
16029 /* The first element of vals. */
16030 rtx v0 = XVECEXP (vals, 0, 0);
16031 bool all_same = true;
16032
16033 /* This is a special vec_init<M><N> where N is not an element mode but a
16034 vector mode with half the elements of M. We expect to find two entries
16035 of mode N in VALS and we must put their concatenation into TARGET. */
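/* For example (illustrative, assuming little-endian lane numbering):
a V4SImode TARGET can be built from two V2SImode entries LO and HI,
with LO filling lanes 0-1 and HI filling lanes 2-3:
TARGET = { LO[0], LO[1], HI[0], HI[1] }. */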
16036 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
16037 {
16038 gcc_assert (known_eq (GET_MODE_SIZE (mode),
16039 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
16040 rtx lo = XVECEXP (vals, 0, 0);
16041 rtx hi = XVECEXP (vals, 0, 1);
16042 machine_mode narrow_mode = GET_MODE (lo);
16043 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
16044 gcc_assert (narrow_mode == GET_MODE (hi));
16045
16046 /* When we want to concatenate a half-width vector with zeroes we can
16047 use the aarch64_combinez[_be] patterns. Just make sure that the
16048 zeroes are in the right half. */
16049 if (BYTES_BIG_ENDIAN
16050 && aarch64_simd_imm_zero (lo, narrow_mode)
16051 && general_operand (hi, narrow_mode))
16052 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
16053 else if (!BYTES_BIG_ENDIAN
16054 && aarch64_simd_imm_zero (hi, narrow_mode)
16055 && general_operand (lo, narrow_mode))
16056 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
16057 else
16058 {
16059 /* Else create the two half-width registers and combine them. */
16060 if (!REG_P (lo))
16061 lo = force_reg (GET_MODE (lo), lo);
16062 if (!REG_P (hi))
16063 hi = force_reg (GET_MODE (hi), hi);
16064
16065 if (BYTES_BIG_ENDIAN)
16066 std::swap (lo, hi);
16067 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
16068 }
16069 return;
16070 }
16071
16072 /* Count the number of variable elements to initialise. */
16073 for (int i = 0; i < n_elts; ++i)
16074 {
16075 rtx x = XVECEXP (vals, 0, i);
16076 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
16077 ++n_var;
16078 else
16079 any_const = x;
16080
16081 all_same &= rtx_equal_p (x, v0);
16082 }
16083
16084 /* No variable elements, hand off to aarch64_simd_make_constant which knows
16085 how best to handle this. */
16086 if (n_var == 0)
16087 {
16088 rtx constant = aarch64_simd_make_constant (vals);
16089 if (constant != NULL_RTX)
16090 {
16091 emit_move_insn (target, constant);
16092 return;
16093 }
16094 }
16095
16096 /* Splat a single non-constant element if we can. */
16097 if (all_same)
16098 {
16099 rtx x = copy_to_mode_reg (inner_mode, v0);
16100 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16101 return;
16102 }
16103
16104 enum insn_code icode = optab_handler (vec_set_optab, mode);
16105 gcc_assert (icode != CODE_FOR_nothing);
16106
16107 /* If there are only variable elements, try to optimize
16108 the insertion using dup for the most common element
16109 followed by insertions. */
16110
16111 /* The algorithm will fill matches[*][0] with the earliest matching element,
16112 and matches[X][1] with the count of duplicate elements (if X is the
16113 earliest element which has duplicates). */
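/* A rough worked example: for VALS = { x, y, x, x } the loops below give
matches[0] = { 0, 3 }, matches[1] = { 1, 1 } and matches[2] = matches[3]
= { 0, 0 }, so x is duplicated first and only y needs a separate
insertion. */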
16114
16115 if (n_var == n_elts && n_elts <= 16)
16116 {
16117 int matches[16][2] = {0};
16118 for (int i = 0; i < n_elts; i++)
16119 {
16120 for (int j = 0; j <= i; j++)
16121 {
16122 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
16123 {
16124 matches[i][0] = j;
16125 matches[j][1]++;
16126 break;
16127 }
16128 }
16129 }
16130 int maxelement = 0;
16131 int maxv = 0;
16132 for (int i = 0; i < n_elts; i++)
16133 if (matches[i][1] > maxv)
16134 {
16135 maxelement = i;
16136 maxv = matches[i][1];
16137 }
16138
16139 /* Create a duplicate of the most common element, unless all elements
16140 are equally useless to us, in which case just immediately set the
16141 vector register using the first element. */
16142
16143 if (maxv == 1)
16144 {
16145 /* For vectors of two 64-bit elements, we can do even better. */
16146 if (n_elts == 2
16147 && (inner_mode == E_DImode
16148 || inner_mode == E_DFmode))
16149
16150 {
16151 rtx x0 = XVECEXP (vals, 0, 0);
16152 rtx x1 = XVECEXP (vals, 0, 1);
16153 /* Combine can pick up this case, but handling it directly
16154 here leaves clearer RTL.
16155
16156 This is load_pair_lanes<mode>, and also gives us a clean-up
16157 for store_pair_lanes<mode>. */
16158 if (memory_operand (x0, inner_mode)
16159 && memory_operand (x1, inner_mode)
16160 && !STRICT_ALIGNMENT
16161 && rtx_equal_p (XEXP (x1, 0),
16162 plus_constant (Pmode,
16163 XEXP (x0, 0),
16164 GET_MODE_SIZE (inner_mode))))
16165 {
16166 rtx t;
16167 if (inner_mode == DFmode)
16168 t = gen_load_pair_lanesdf (target, x0, x1);
16169 else
16170 t = gen_load_pair_lanesdi (target, x0, x1);
16171 emit_insn (t);
16172 return;
16173 }
16174 }
16175 /* The subreg-move sequence below will move into lane zero of the
16176 vector register. For big-endian we want that position to hold
16177 the last element of VALS. */
16178 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
16179 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16180 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
16181 }
16182 else
16183 {
16184 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
16185 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
16186 }
16187
16188 /* Insert the rest. */
16189 for (int i = 0; i < n_elts; i++)
16190 {
16191 rtx x = XVECEXP (vals, 0, i);
16192 if (matches[i][0] == maxelement)
16193 continue;
16194 x = copy_to_mode_reg (inner_mode, x);
16195 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16196 }
16197 return;
16198 }
16199
16200 /* Initialise a vector which is part-variable. We want to first try
16201 to build those lanes which are constant in the most efficient way we
16202 can. */
16203 if (n_var != n_elts)
16204 {
16205 rtx copy = copy_rtx (vals);
16206
16207 /* Load constant part of vector. We really don't care what goes into the
16208 parts we will overwrite, but we're more likely to be able to load the
16209 constant efficiently if it has fewer, larger, repeating parts
16210 (see aarch64_simd_valid_immediate). */
16211 for (int i = 0; i < n_elts; i++)
16212 {
16213 rtx x = XVECEXP (vals, 0, i);
16214 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16215 continue;
16216 rtx subst = any_const;
16217 for (int bit = n_elts / 2; bit > 0; bit /= 2)
16218 {
16219 /* Look in the copied vector, as more elements are const. */
16220 rtx test = XVECEXP (copy, 0, i ^ bit);
16221 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
16222 {
16223 subst = test;
16224 break;
16225 }
16226 }
16227 XVECEXP (copy, 0, i) = subst;
16228 }
16229 aarch64_expand_vector_init (target, copy);
16230 }
16231
16232 /* Insert the variable lanes directly. */
16233 for (int i = 0; i < n_elts; i++)
16234 {
16235 rtx x = XVECEXP (vals, 0, i);
16236 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
16237 continue;
16238 x = copy_to_mode_reg (inner_mode, x);
16239 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
16240 }
16241 }
16242
16243 /* Emit RTL corresponding to:
16244 insr TARGET, ELEM. */
16245
16246 static void
16247 emit_insr (rtx target, rtx elem)
16248 {
16249 machine_mode mode = GET_MODE (target);
16250 scalar_mode elem_mode = GET_MODE_INNER (mode);
16251 elem = force_reg (elem_mode, elem);
16252
16253 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
16254 gcc_assert (icode != CODE_FOR_nothing);
16255 emit_insn (GEN_FCN (icode) (target, target, elem));
16256 }
16257
16258 /* Subroutine of aarch64_sve_expand_vector_init for handling
16259 trailing constants.
16260 This function works as follows:
16261 (a) Create a new vector consisting of trailing constants.
16262 (b) Initialize TARGET with the constant vector using emit_move_insn.
16263 (c) Insert remaining elements in TARGET using insr.
16264 NELTS is the total number of elements in the original vector, while
16265 NELTS_REQD is the number of elements that are actually
16266 significant.
16267
16268 ??? The heuristic used is to do the above only if the number of constants
16269 is at least half the total number of elements. May need fine-tuning. */
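/* A rough worked example: for { a, b, 1, 2 } (NELTS == NELTS_REQD == 4)
there are two trailing constants, which meets the threshold. TARGET is
first loaded with a constant vector whose low lanes are { 1, 2 }, then:
insr TARGET, b
insr TARGET, a
leaves TARGET = { a, b, 1, 2 }. */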
16270
16271 static bool
16272 aarch64_sve_expand_vector_init_handle_trailing_constants
16273 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16274 {
16275 machine_mode mode = GET_MODE (target);
16276 scalar_mode elem_mode = GET_MODE_INNER (mode);
16277 int n_trailing_constants = 0;
16278
16279 for (int i = nelts_reqd - 1;
16280 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16281 i--)
16282 n_trailing_constants++;
16283
16284 if (n_trailing_constants >= nelts_reqd / 2)
16285 {
16286 rtx_vector_builder v (mode, 1, nelts);
16287 for (int i = 0; i < nelts; i++)
16288 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16289 rtx const_vec = v.build ();
16290 emit_move_insn (target, const_vec);
16291
16292 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16293 emit_insr (target, builder.elt (i));
16294
16295 return true;
16296 }
16297
16298 return false;
16299 }
16300
16301 /* Subroutine of aarch64_sve_expand_vector_init.
16302 Works as follows:
16303 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16304 (b) Skip trailing elements from BUILDER, which are the same as
16305 element NELTS_REQD - 1.
16306 (c) Insert earlier elements in reverse order in TARGET using insr. */
16307
16308 static void
16309 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16310 const rtx_vector_builder &builder,
16311 int nelts_reqd)
16312 {
16313 machine_mode mode = GET_MODE (target);
16314 scalar_mode elem_mode = GET_MODE_INNER (mode);
16315
16316 struct expand_operand ops[2];
16317 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16318 gcc_assert (icode != CODE_FOR_nothing);
16319
16320 create_output_operand (&ops[0], target, mode);
16321 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16322 expand_insn (icode, 2, ops);
16323
16324 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16325 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16326 emit_insr (target, builder.elt (i));
16327 }
16328
16329 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
16330 in which all trailing elements of BUILDER are the same.
16331 This works as follows:
16332 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16333 (b) Insert remaining elements in TARGET using insr.
16334
16335 ??? The heuristic used is to do the above if the number of identical
16336 trailing elements is at least 3/4 of the total number of elements,
16337 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
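/* A rough worked example: for { a, b, x, x, x, x, x, x } (NELTS_REQD == 8)
the trailing element is repeated 6 times, which meets the 3/4 threshold,
so the emitted sequence is roughly:
TARGET = dup (x)
insr TARGET, b
insr TARGET, a
giving { a, b, x, x, x, x, x, x }. */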
16338
16339 static bool
16340 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16341 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16342 {
16343 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16344 if (ndups >= (3 * nelts_reqd) / 4)
16345 {
16346 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16347 nelts_reqd - ndups + 1);
16348 return true;
16349 }
16350
16351 return false;
16352 }
16353
16354 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16355 of elements in BUILDER.
16356
16357 The function tries to initialize TARGET from BUILDER if it fits one
16358 of the special cases outlined below.
16359
16360 Failing that, the function divides BUILDER into two sub-vectors:
16361 v_even = even elements of BUILDER;
16362 v_odd = odd elements of BUILDER;
16363
16364 and recursively calls itself with v_even and v_odd.
16365
16366 if (recursive call succeeded for v_even or v_odd)
16367 TARGET = zip (v_even, v_odd)
16368
16369 The function returns true if it managed to build TARGET from BUILDER
16370 with one of the special cases, false otherwise.
16371
16372 Example: {a, 1, b, 2, c, 3, d, 4}
16373
16374 The vector gets divided into:
16375 v_even = {a, b, c, d}
16376 v_odd = {1, 2, 3, 4}
16377
16378 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16379 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16380
16381 aarch64_sve_expand_vector_init(v_even) fails since v_even contains no
16382 constant or repeated trailing elements, so we construct tmp1 using insr:
16383 tmp1 = dup(d)
16384 insr tmp1, c
16385 insr tmp1, b
16386 insr tmp1, a
16387
16388 And finally:
16389 TARGET = zip (tmp1, tmp2)
16390 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16391
16392 static bool
16393 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16394 int nelts, int nelts_reqd)
16395 {
16396 machine_mode mode = GET_MODE (target);
16397
16398 /* Case 1: Vector contains trailing constants. */
16399
16400 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16401 (target, builder, nelts, nelts_reqd))
16402 return true;
16403
16404 /* Case 2: Vector contains leading constants. */
16405
16406 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16407 for (int i = 0; i < nelts_reqd; i++)
16408 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16409 rev_builder.finalize ();
16410
16411 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16412 (target, rev_builder, nelts, nelts_reqd))
16413 {
16414 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16415 return true;
16416 }
16417
16418 /* Case 3: Vector contains trailing same element. */
16419
16420 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16421 (target, builder, nelts_reqd))
16422 return true;
16423
16424 /* Case 4: Vector contains leading same element. */
16425
16426 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16427 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
16428 {
16429 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16430 return true;
16431 }
16432
16433 /* Avoid recursing below 4 elements.
16434 ??? The threshold 4 may need fine-tuning. */
16435
16436 if (nelts_reqd <= 4)
16437 return false;
16438
16439 rtx_vector_builder v_even (mode, 1, nelts);
16440 rtx_vector_builder v_odd (mode, 1, nelts);
16441
16442 for (int i = 0; i < nelts * 2; i += 2)
16443 {
16444 v_even.quick_push (builder.elt (i));
16445 v_odd.quick_push (builder.elt (i + 1));
16446 }
16447
16448 v_even.finalize ();
16449 v_odd.finalize ();
16450
16451 rtx tmp1 = gen_reg_rtx (mode);
16452 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16453 nelts, nelts_reqd / 2);
16454
16455 rtx tmp2 = gen_reg_rtx (mode);
16456 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16457 nelts, nelts_reqd / 2);
16458
16459 if (!did_even_p && !did_odd_p)
16460 return false;
16461
16462 /* Initialize whichever of v_even and v_odd did not match a special case
16463 using INSR, then zip v_even and v_odd. */
16464
16465 if (!did_even_p)
16466 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16467
16468 if (!did_odd_p)
16469 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16470
16471 rtvec v = gen_rtvec (2, tmp1, tmp2);
16472 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16473 return true;
16474 }
16475
16476 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16477
16478 void
16479 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16480 {
16481 machine_mode mode = GET_MODE (target);
16482 int nelts = XVECLEN (vals, 0);
16483
16484 rtx_vector_builder v (mode, 1, nelts);
16485 for (int i = 0; i < nelts; i++)
16486 v.quick_push (XVECEXP (vals, 0, i));
16487 v.finalize ();
16488
16489 /* If neither sub-vector of v could be initialized specially,
16490 then use INSR to insert all elements from v into TARGET.
16491 ??? This might not be optimal for vectors with large
16492 initializers of 16 elements or more.
16493 For nelts < 4, it probably isn't useful to handle specially. */
16494
16495 if (nelts < 4
16496 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16497 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16498 }
16499
16500 /* Check whether VALUE is a vector constant in which every element
16501 is either a power of 2 or a negated power of 2. If so, return
16502 a constant vector of log2s, and flip CODE between PLUS and MINUS
16503 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
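/* For example (illustrative): { 8, 8, 8, 8 } becomes { 3, 3, 3, 3 } with
CODE left unchanged, while { -4, -4, -4, -4 } becomes { 2, 2, 2, 2 } and
CODE is flipped between PLUS and MINUS. */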
16504
16505 static rtx
16506 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
16507 {
16508 if (GET_CODE (value) != CONST_VECTOR)
16509 return NULL_RTX;
16510
16511 rtx_vector_builder builder;
16512 if (!builder.new_unary_operation (GET_MODE (value), value, false))
16513 return NULL_RTX;
16514
16515 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
16516 /* 1 if the result of the multiplication must be negated,
16517 0 if it mustn't, or -1 if we don't yet care. */
16518 int negate = -1;
16519 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
16520 for (unsigned int i = 0; i < encoded_nelts; ++i)
16521 {
16522 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
16523 if (!CONST_SCALAR_INT_P (elt))
16524 return NULL_RTX;
16525 rtx_mode_t val (elt, int_mode);
16526 wide_int pow2 = wi::neg (val);
16527 if (val != pow2)
16528 {
16529 /* It matters whether we negate or not. Make that choice,
16530 and make sure that it's consistent with previous elements. */
16531 if (negate == !wi::neg_p (val))
16532 return NULL_RTX;
16533 negate = wi::neg_p (val);
16534 if (!negate)
16535 pow2 = val;
16536 }
16537 /* POW2 is now the value that we want to be a power of 2. */
16538 int shift = wi::exact_log2 (pow2);
16539 if (shift < 0)
16540 return NULL_RTX;
16541 builder.quick_push (gen_int_mode (shift, int_mode));
16542 }
16543 if (negate == -1)
16544 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
16545 code = PLUS;
16546 else if (negate == 1)
16547 code = code == PLUS ? MINUS : PLUS;
16548 return builder.build ();
16549 }
16550
16551 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
16552 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
16553 operands array, in the same order as for fma_optab. Return true if
16554 the function emitted all the necessary instructions, false if the caller
16555 should generate the pattern normally with the new OPERANDS array. */
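/* For example (illustrative): with CODE == PLUS and OPERANDS[2] a constant
vector of 4s, the multiply-add OPERANDS[1] * 4 + OPERANDS[3] is emitted
here as a shift followed by an add:
product = OPERANDS[1] << 2
OPERANDS[0] = OPERANDS[3] + product
rather than by keeping the multiplication. */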
16556
16557 bool
16558 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
16559 {
16560 machine_mode mode = GET_MODE (operands[0]);
16561 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
16562 {
16563 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
16564 NULL_RTX, true, OPTAB_DIRECT);
16565 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
16566 operands[3], product, operands[0], true,
16567 OPTAB_DIRECT);
16568 return true;
16569 }
16570 operands[2] = force_reg (mode, operands[2]);
16571 return false;
16572 }
16573
16574 /* Likewise, but for a conditional pattern. */
16575
16576 bool
16577 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
16578 {
16579 machine_mode mode = GET_MODE (operands[0]);
16580 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
16581 {
16582 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
16583 NULL_RTX, true, OPTAB_DIRECT);
16584 emit_insn (gen_cond (code, mode, operands[0], operands[1],
16585 operands[4], product, operands[5]));
16586 return true;
16587 }
16588 operands[3] = force_reg (mode, operands[3]);
16589 return false;
16590 }
16591
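/* Implement TARGET_SHIFT_TRUNCATION_MASK. Return 0 (no implicit
truncation of shift counts) for vector data modes or when
SHIFT_COUNT_TRUNCATED is false, otherwise the element width minus 1. */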
16592 static unsigned HOST_WIDE_INT
16593 aarch64_shift_truncation_mask (machine_mode mode)
16594 {
16595 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16596 return 0;
16597 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16598 }
16599
16600 /* Select a format to encode pointers in exception handling data. */
16601 int
16602 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16603 {
16604 int type;
16605 switch (aarch64_cmodel)
16606 {
16607 case AARCH64_CMODEL_TINY:
16608 case AARCH64_CMODEL_TINY_PIC:
16609 case AARCH64_CMODEL_SMALL:
16610 case AARCH64_CMODEL_SMALL_PIC:
16611 case AARCH64_CMODEL_SMALL_SPIC:
16612 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16613 for everything. */
16614 type = DW_EH_PE_sdata4;
16615 break;
16616 default:
16617 /* No assumptions here. 8-byte relocs required. */
16618 type = DW_EH_PE_sdata8;
16619 break;
16620 }
16621 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16622 }
16623
16624 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16625
16626 static void
16627 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16628 {
16629 if (aarch64_simd_decl_p (decl))
16630 {
16631 fprintf (stream, "\t.variant_pcs\t");
16632 assemble_name (stream, name);
16633 fprintf (stream, "\n");
16634 }
16635 }
16636
16637 /* The last .arch and .tune assembly strings that we printed. */
16638 static std::string aarch64_last_printed_arch_string;
16639 static std::string aarch64_last_printed_tune_string;
16640
16641 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16642 by the function fndecl. */
16643
16644 void
16645 aarch64_declare_function_name (FILE *stream, const char* name,
16646 tree fndecl)
16647 {
16648 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16649
16650 struct cl_target_option *targ_options;
16651 if (target_parts)
16652 targ_options = TREE_TARGET_OPTION (target_parts);
16653 else
16654 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16655 gcc_assert (targ_options);
16656
16657 const struct processor *this_arch
16658 = aarch64_get_arch (targ_options->x_explicit_arch);
16659
16660 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16661 std::string extension
16662 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16663 this_arch->flags);
16664 /* Only update the assembler .arch string if it is distinct from the last
16665 such string we printed. */
16666 std::string to_print = this_arch->name + extension;
16667 if (to_print != aarch64_last_printed_arch_string)
16668 {
16669 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16670 aarch64_last_printed_arch_string = to_print;
16671 }
16672
16673 /* Print the cpu name we're tuning for in the comments; it might be
16674 useful to readers of the generated asm. Do it only when it changes
16675 from function to function and verbose assembly is requested. */
16676 const struct processor *this_tune
16677 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16678
16679 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16680 {
16681 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16682 this_tune->name);
16683 aarch64_last_printed_tune_string = this_tune->name;
16684 }
16685
16686 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16687
16688 /* Don't forget the type directive for ELF. */
16689 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16690 ASM_OUTPUT_LABEL (stream, name);
16691 }
16692
16693 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16694
16695 void
16696 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16697 {
16698 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16699 const char *value = IDENTIFIER_POINTER (target);
16700 aarch64_asm_output_variant_pcs (stream, decl, name);
16701 ASM_OUTPUT_DEF (stream, name, value);
16702 }
16703
16704 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16705 function symbol references. */
16706
16707 void
16708 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16709 {
16710 default_elf_asm_output_external (stream, decl, name);
16711 aarch64_asm_output_variant_pcs (stream, decl, name);
16712 }
16713
16714 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16715 Used to output the .cfi_b_key_frame directive when signing the current
16716 function with the B key. */
16717
16718 void
16719 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16720 {
16721 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16722 && aarch64_ra_sign_key == AARCH64_KEY_B)
16723 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16724 }
16725
16726 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16727
16728 static void
16729 aarch64_start_file (void)
16730 {
16731 struct cl_target_option *default_options
16732 = TREE_TARGET_OPTION (target_option_default_node);
16733
16734 const struct processor *default_arch
16735 = aarch64_get_arch (default_options->x_explicit_arch);
16736 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16737 std::string extension
16738 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16739 default_arch->flags);
16740
16741 aarch64_last_printed_arch_string = default_arch->name + extension;
16742 aarch64_last_printed_tune_string = "";
16743 asm_fprintf (asm_out_file, "\t.arch %s\n",
16744 aarch64_last_printed_arch_string.c_str ());
16745
16746 default_file_start ();
16747 }
16748
16749 /* Emit load exclusive. */
16750
16751 static void
16752 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16753 rtx mem, rtx model_rtx)
16754 {
16755 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16756 }
16757
16758 /* Emit store exclusive. */
16759
16760 static void
16761 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16762 rtx mem, rtx rval, rtx model_rtx)
16763 {
16764 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16765 }
16766
16767 /* Mark the previous jump instruction as unlikely. */
16768
16769 static void
16770 aarch64_emit_unlikely_jump (rtx insn)
16771 {
16772 rtx_insn *jump = emit_jump_insn (insn);
16773 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16774 }
16775
16776 /* Expand a compare and swap pattern. */
16777
16778 void
16779 aarch64_expand_compare_and_swap (rtx operands[])
16780 {
16781 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16782 machine_mode mode, r_mode;
16783
16784 bval = operands[0];
16785 rval = operands[1];
16786 mem = operands[2];
16787 oldval = operands[3];
16788 newval = operands[4];
16789 is_weak = operands[5];
16790 mod_s = operands[6];
16791 mod_f = operands[7];
16792 mode = GET_MODE (mem);
16793
16794 /* Normally the succ memory model must be stronger than fail, but in the
16795 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16796 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16797 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16798 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16799 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16800
16801 r_mode = mode;
16802 if (mode == QImode || mode == HImode)
16803 {
16804 r_mode = SImode;
16805 rval = gen_reg_rtx (r_mode);
16806 }
16807
16808 if (TARGET_LSE)
16809 {
16810 /* The CAS insn requires oldval and rval overlap, but we need to
16811 have a copy of oldval saved across the operation to tell if
16812 the operation is successful. */
16813 if (reg_overlap_mentioned_p (rval, oldval))
16814 rval = copy_to_mode_reg (r_mode, oldval);
16815 else
16816 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16817
16818 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16819 newval, mod_s));
16820 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16821 }
16822 else
16823 {
16824 /* The oldval predicate varies by mode. Test it and force to reg. */
16825 insn_code code = code_for_aarch64_compare_and_swap (mode);
16826 if (!insn_data[code].operand[2].predicate (oldval, mode))
16827 oldval = force_reg (mode, oldval);
16828
16829 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16830 is_weak, mod_s, mod_f));
16831 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16832 }
16833
16834 if (r_mode != mode)
16835 rval = gen_lowpart (mode, rval);
16836 emit_move_insn (operands[1], rval);
16837
16838 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16839 emit_insn (gen_rtx_SET (bval, x));
16840 }
16841
16842 /* Emit a barrier appropriate for memory model MODEL at the end of a
16843 sequence implementing an atomic operation. */
16844
16845 static void
16846 aarch64_emit_post_barrier (enum memmodel model)
16847 {
16848 const enum memmodel base_model = memmodel_base (model);
16849
16850 if (is_mm_sync (model)
16851 && (base_model == MEMMODEL_ACQUIRE
16852 || base_model == MEMMODEL_ACQ_REL
16853 || base_model == MEMMODEL_SEQ_CST))
16854 {
16855 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16856 }
16857 }
16858
16859 /* Split a compare and swap pattern. */
16860
16861 void
16862 aarch64_split_compare_and_swap (rtx operands[])
16863 {
16864 rtx rval, mem, oldval, newval, scratch;
16865 machine_mode mode;
16866 bool is_weak;
16867 rtx_code_label *label1, *label2;
16868 rtx x, cond;
16869 enum memmodel model;
16870 rtx model_rtx;
16871
16872 rval = operands[0];
16873 mem = operands[1];
16874 oldval = operands[2];
16875 newval = operands[3];
16876 is_weak = (operands[4] != const0_rtx);
16877 model_rtx = operands[5];
16878 scratch = operands[7];
16879 mode = GET_MODE (mem);
16880 model = memmodel_from_int (INTVAL (model_rtx));
16881
16882 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16883 loop:
16884 .label1:
16885 LD[A]XR rval, [mem]
16886 CBNZ rval, .label2
16887 ST[L]XR scratch, newval, [mem]
16888 CBNZ scratch, .label1
16889 .label2:
16890 CMP rval, 0. */
16891 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16892
16893 label1 = NULL;
16894 if (!is_weak)
16895 {
16896 label1 = gen_label_rtx ();
16897 emit_label (label1);
16898 }
16899 label2 = gen_label_rtx ();
16900
16901 /* The initial load can be relaxed for a __sync operation since a final
16902 barrier will be emitted to stop code hoisting. */
16903 if (is_mm_sync (model))
16904 aarch64_emit_load_exclusive (mode, rval, mem,
16905 GEN_INT (MEMMODEL_RELAXED));
16906 else
16907 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16908
16909 if (strong_zero_p)
16910 {
16911 if (aarch64_track_speculation)
16912 {
16913 /* Emit an explicit compare instruction, so that we can correctly
16914 track the condition codes. */
16915 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16916 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16917 }
16918 else
16919 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16920
16921 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16922 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16923 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16924 }
16925 else
16926 {
16927 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16928 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16929 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16930 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16931 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16932 }
16933
16934 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16935
16936 if (!is_weak)
16937 {
16938 if (aarch64_track_speculation)
16939 {
16940 /* Emit an explicit compare instruction, so that we can correctly
16941 track the condition codes. */
16942 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16943 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16944 }
16945 else
16946 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16947
16948 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16949 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16950 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16951 }
16952 else
16953 {
16954 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16955 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16956 emit_insn (gen_rtx_SET (cond, x));
16957 }
16958
16959 emit_label (label2);
16960 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16961 to set the condition flags. If this is not used it will be removed by
16962 later passes. */
16963 if (strong_zero_p)
16964 {
16965 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16966 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16967 emit_insn (gen_rtx_SET (cond, x));
16968 }
16969 /* Emit any final barrier needed for a __sync operation. */
16970 if (is_mm_sync (model))
16971 aarch64_emit_post_barrier (model);
16972 }
16973
16974 /* Split an atomic operation. */
16975
16976 void
16977 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16978 rtx value, rtx model_rtx, rtx cond)
16979 {
16980 machine_mode mode = GET_MODE (mem);
16981 machine_mode wmode = (mode == DImode ? DImode : SImode);
16982 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16983 const bool is_sync = is_mm_sync (model);
16984 rtx_code_label *label;
16985 rtx x;
16986
16987 /* Split the atomic operation into a sequence. */
16988 label = gen_label_rtx ();
16989 emit_label (label);
16990
16991 if (new_out)
16992 new_out = gen_lowpart (wmode, new_out);
16993 if (old_out)
16994 old_out = gen_lowpart (wmode, old_out);
16995 else
16996 old_out = new_out;
16997 value = simplify_gen_subreg (wmode, value, mode, 0);
16998
16999 /* The initial load can be relaxed for a __sync operation since a final
17000 barrier will be emitted to stop code hoisting. */
17001 if (is_sync)
17002 aarch64_emit_load_exclusive (mode, old_out, mem,
17003 GEN_INT (MEMMODEL_RELAXED));
17004 else
17005 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
17006
17007 switch (code)
17008 {
17009 case SET:
17010 new_out = value;
17011 break;
17012
17013 case NOT:
17014 x = gen_rtx_AND (wmode, old_out, value);
17015 emit_insn (gen_rtx_SET (new_out, x));
17016 x = gen_rtx_NOT (wmode, new_out);
17017 emit_insn (gen_rtx_SET (new_out, x));
17018 break;
17019
17020 case MINUS:
17021 if (CONST_INT_P (value))
17022 {
17023 value = GEN_INT (-INTVAL (value));
17024 code = PLUS;
17025 }
17026 /* Fall through. */
17027
17028 default:
17029 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
17030 emit_insn (gen_rtx_SET (new_out, x));
17031 break;
17032 }
17033
17034 aarch64_emit_store_exclusive (mode, cond, mem,
17035 gen_lowpart (mode, new_out), model_rtx);
17036
17037 if (aarch64_track_speculation)
17038 {
17039 /* Emit an explicit compare instruction, so that we can correctly
17040 track the condition codes. */
17041 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
17042 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
17043 }
17044 else
17045 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
17046
17047 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17048 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
17049 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17050
17051 /* Emit any final barrier needed for a __sync operation. */
17052 if (is_sync)
17053 aarch64_emit_post_barrier (model);
17054 }
17055
17056 static void
17057 aarch64_init_libfuncs (void)
17058 {
17059 /* Half-precision float operations. The compiler handles all operations
17060 with NULL libfuncs by converting to SFmode. */
17061
17062 /* Conversions. */
17063 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
17064 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
17065
17066 /* Arithmetic. */
17067 set_optab_libfunc (add_optab, HFmode, NULL);
17068 set_optab_libfunc (sdiv_optab, HFmode, NULL);
17069 set_optab_libfunc (smul_optab, HFmode, NULL);
17070 set_optab_libfunc (neg_optab, HFmode, NULL);
17071 set_optab_libfunc (sub_optab, HFmode, NULL);
17072
17073 /* Comparisons. */
17074 set_optab_libfunc (eq_optab, HFmode, NULL);
17075 set_optab_libfunc (ne_optab, HFmode, NULL);
17076 set_optab_libfunc (lt_optab, HFmode, NULL);
17077 set_optab_libfunc (le_optab, HFmode, NULL);
17078 set_optab_libfunc (ge_optab, HFmode, NULL);
17079 set_optab_libfunc (gt_optab, HFmode, NULL);
17080 set_optab_libfunc (unord_optab, HFmode, NULL);
17081 }
17082
17083 /* Target hook for c_mode_for_suffix. */
17084 static machine_mode
17085 aarch64_c_mode_for_suffix (char suffix)
17086 {
17087 if (suffix == 'q')
17088 return TFmode;
17089
17090 return VOIDmode;
17091 }
17092
17093 /* We can only represent floating point constants which will fit in
17094 "quarter-precision" values. These values are characterised by
17095 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
17096 by:
17097
17098 (-1)^s * (n/16) * 2^r
17099
17100 Where:
17101 's' is the sign bit.
17102 'n' is an integer in the range 16 <= n <= 31.
17103 'r' is an integer in the range -3 <= r <= 4. */
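/* Some representable examples (illustrative):
1.0 = (-1)^0 * (16/16) * 2^0
-0.625 = (-1)^1 * (20/16) * 2^-1
31.0 = (-1)^0 * (31/16) * 2^4 (the largest magnitude)
0.125 = (-1)^0 * (16/16) * 2^-3 (the smallest nonzero magnitude). */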
17104
17105 /* Return true iff X can be represented as a quarter-precision
17106 floating point immediate operand. Note, we cannot represent 0.0. */
17107 bool
17108 aarch64_float_const_representable_p (rtx x)
17109 {
17110 /* This represents our current view of how many bits
17111 make up the mantissa. */
17112 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
17113 int exponent;
17114 unsigned HOST_WIDE_INT mantissa, mask;
17115 REAL_VALUE_TYPE r, m;
17116 bool fail;
17117
17118 x = unwrap_const_vec_duplicate (x);
17119 if (!CONST_DOUBLE_P (x))
17120 return false;
17121
17122 if (GET_MODE (x) == VOIDmode
17123 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
17124 return false;
17125
17126 r = *CONST_DOUBLE_REAL_VALUE (x);
17127
17128 /* We cannot represent infinities, NaNs or +/-zero. We won't
17129 know if we have +zero until we analyse the mantissa, but we
17130 can reject the other invalid values. */
17131 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
17132 || REAL_VALUE_MINUS_ZERO (r))
17133 return false;
17134
17135 /* Extract exponent. */
17136 r = real_value_abs (&r);
17137 exponent = REAL_EXP (&r);
17138
17139 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
17140 highest (sign) bit, with a fixed binary point at bit point_pos.
17141 Element 0 of w holds the low part of the mantissa, element 1 the high part.
17142 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
17143 bits for the mantissa, this can fail (low bits will be lost). */
17144 real_ldexp (&m, &r, point_pos - exponent);
17145 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
17146
17147 /* If the low part of the mantissa has bits set we cannot represent
17148 the value. */
17149 if (w.ulow () != 0)
17150 return false;
17151 /* We have rejected the lower HOST_WIDE_INT, so update our
17152 understanding of how many bits lie in the mantissa and
17153 look only at the high HOST_WIDE_INT. */
17154 mantissa = w.elt (1);
17155 point_pos -= HOST_BITS_PER_WIDE_INT;
17156
17157 /* We can only represent values with a mantissa of the form 1.xxxx. */
17158 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
17159 if ((mantissa & mask) != 0)
17160 return false;
17161
17162 /* Having filtered unrepresentable values, we may now remove all
17163 but the highest 5 bits. */
17164 mantissa >>= point_pos - 5;
17165
17166 /* We cannot represent the value 0.0, so reject it. This is handled
17167 elsewhere. */
17168 if (mantissa == 0)
17169 return false;
17170
17171 /* Then, as bit 4 is always set, we can mask it off, leaving
17172 the mantissa in the range [0, 15]. */
17173 mantissa &= ~(1 << 4);
17174 gcc_assert (mantissa <= 15);
17175
17176 /* GCC internally does not use IEEE754-like encoding (where normalized
17177 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
17178 Our mantissa values are shifted 4 places to the left relative to
17179 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
17180 by 5 places to correct for GCC's representation. */
17181 exponent = 5 - exponent;
17182
17183 return (exponent >= 0 && exponent <= 7);
17184 }
17185
17186 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
17187 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
17188 output MOVI/MVNI, ORR or BIC immediate. */
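/* For example (illustrative): a V4SImode vector with every element equal
to 256 would typically be output for AARCH64_CHECK_MOV as
movi %0.4s, 0x1, lsl 8
before operand substitution. */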
17189 char*
17190 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
17191 enum simd_immediate_check which)
17192 {
17193 bool is_valid;
17194 static char templ[40];
17195 const char *mnemonic;
17196 const char *shift_op;
17197 unsigned int lane_count = 0;
17198 char element_char;
17199
17200 struct simd_immediate_info info;
17201
17202 /* This will return true to show const_vector is legal for use as either
17203 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
17204 It will also update INFO to show how the immediate should be generated.
17205 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
17206 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
17207 gcc_assert (is_valid);
17208
17209 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17210 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
17211
17212 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17213 {
17214 gcc_assert (info.insn == simd_immediate_info::MOV
17215 && info.u.mov.shift == 0);
17216 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
17217 move immediate path. */
17218 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17219 info.u.mov.value = GEN_INT (0);
17220 else
17221 {
17222 const unsigned int buf_size = 20;
17223 char float_buf[buf_size] = {'\0'};
17224 real_to_decimal_for_mode (float_buf,
17225 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17226 buf_size, buf_size, 1, info.elt_mode);
17227
17228 if (lane_count == 1)
17229 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
17230 else
17231 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
17232 lane_count, element_char, float_buf);
17233 return templ;
17234 }
17235 }
17236
17237 gcc_assert (CONST_INT_P (info.u.mov.value));
17238
17239 if (which == AARCH64_CHECK_MOV)
17240 {
17241 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
17242 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
17243 ? "msl" : "lsl");
17244 if (lane_count == 1)
17245 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
17246 mnemonic, UINTVAL (info.u.mov.value));
17247 else if (info.u.mov.shift)
17248 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17249 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
17250 element_char, UINTVAL (info.u.mov.value), shift_op,
17251 info.u.mov.shift);
17252 else
17253 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
17254 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
17255 element_char, UINTVAL (info.u.mov.value));
17256 }
17257 else
17258 {
17259 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
17260 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
17261 if (info.u.mov.shift)
17262 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17263 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
17264 element_char, UINTVAL (info.u.mov.value), "lsl",
17265 info.u.mov.shift);
17266 else
17267 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
17268 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
17269 element_char, UINTVAL (info.u.mov.value));
17270 }
17271 return templ;
17272 }
17273
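/* Return the asm template for moving the scalar immediate IMMEDIATE, of
scalar integer mode MODE, into an Advanced SIMD register, by treating it
as a vector duplicate of the value. */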
17274 char*
17275 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
17276 {
17277
17278 /* If a floating point number was passed and we desire to use it in an
17279 integer mode, do the conversion to integer. */
17280 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
17281 {
17282 unsigned HOST_WIDE_INT ival;
17283 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
17284 gcc_unreachable ();
17285 immediate = gen_int_mode (ival, mode);
17286 }
17287
17288 machine_mode vmode;
17289 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
17290 a 128-bit vector mode. */
17291 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
17292
17293 vmode = aarch64_simd_container_mode (mode, width);
17294 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
17295 return aarch64_output_simd_mov_immediate (v_op, width);
17296 }
17297
17298 /* Return the output string to use for moving immediate CONST_VECTOR
17299 into an SVE register. */
17300
17301 char *
17302 aarch64_output_sve_mov_immediate (rtx const_vector)
17303 {
17304 static char templ[40];
17305 struct simd_immediate_info info;
17306 char element_char;
17307
17308 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
17309 gcc_assert (is_valid);
17310
17311 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
17312
17313 machine_mode vec_mode = GET_MODE (const_vector);
17314 if (aarch64_sve_pred_mode_p (vec_mode))
17315 {
17316 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
17317 if (info.insn == simd_immediate_info::MOV)
17318 {
17319 gcc_assert (info.u.mov.value == const0_rtx);
17320 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
17321 }
17322 else
17323 {
17324 gcc_assert (info.insn == simd_immediate_info::PTRUE);
17325 unsigned int total_bytes;
17326 if (info.u.pattern == AARCH64_SV_ALL
17327 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
17328 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
17329 total_bytes / GET_MODE_SIZE (info.elt_mode));
17330 else
17331 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
17332 svpattern_token (info.u.pattern));
17333 }
17334 return buf;
17335 }
17336
17337 if (info.insn == simd_immediate_info::INDEX)
17338 {
17339 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
17340 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
17341 element_char, INTVAL (info.u.index.base),
17342 INTVAL (info.u.index.step));
17343 return templ;
17344 }
17345
17346 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
17347 {
17348 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
17349 info.u.mov.value = GEN_INT (0);
17350 else
17351 {
17352 const int buf_size = 20;
17353 char float_buf[buf_size] = {};
17354 real_to_decimal_for_mode (float_buf,
17355 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17356 buf_size, buf_size, 1, info.elt_mode);
17357
17358 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17359 element_char, float_buf);
17360 return templ;
17361 }
17362 }
17363
17364 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17365 element_char, INTVAL (info.u.mov.value));
17366 return templ;
17367 }
17368
17369 /* Split operands into moves from op[1] + op[2] into op[0]. */
17370
17371 void
17372 aarch64_split_combinev16qi (rtx operands[3])
17373 {
17374 unsigned int dest = REGNO (operands[0]);
17375 unsigned int src1 = REGNO (operands[1]);
17376 unsigned int src2 = REGNO (operands[2]);
17377 machine_mode halfmode = GET_MODE (operands[1]);
17378 unsigned int halfregs = REG_NREGS (operands[1]);
17379 rtx destlo, desthi;
17380
17381 gcc_assert (halfmode == V16QImode);
17382
17383 if (src1 == dest && src2 == dest + halfregs)
17384 {
17385 /* No-op move. Can't split to nothing; emit something. */
17386 emit_note (NOTE_INSN_DELETED);
17387 return;
17388 }
17389
17390 /* Preserve register attributes for variable tracking. */
17391 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17392 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17393 GET_MODE_SIZE (halfmode));
17394
17395 /* Special case of reversed high/low parts. */
17396 if (reg_overlap_mentioned_p (operands[2], destlo)
17397 && reg_overlap_mentioned_p (operands[1], desthi))
17398 {
17399 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17400 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17401 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17402 }
17403 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17404 {
17405 /* Try to avoid unnecessary moves if part of the result
17406 is in the right place already. */
17407 if (src1 != dest)
17408 emit_move_insn (destlo, operands[1]);
17409 if (src2 != dest + halfregs)
17410 emit_move_insn (desthi, operands[2]);
17411 }
17412 else
17413 {
17414 if (src2 != dest + halfregs)
17415 emit_move_insn (desthi, operands[2]);
17416 if (src1 != dest)
17417 emit_move_insn (destlo, operands[1]);
17418 }
17419 }
17420
17421 /* vec_perm support. */
17422
17423 struct expand_vec_perm_d
17424 {
17425 rtx target, op0, op1;
17426 vec_perm_indices perm;
17427 machine_mode vmode;
17428 unsigned int vec_flags;
17429 bool one_vector_p;
17430 bool testing_p;
17431 };
17432
17433 /* Generate a variable permutation. */
17434
17435 static void
17436 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17437 {
17438 machine_mode vmode = GET_MODE (target);
17439 bool one_vector_p = rtx_equal_p (op0, op1);
17440
17441 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17442 gcc_checking_assert (GET_MODE (op0) == vmode);
17443 gcc_checking_assert (GET_MODE (op1) == vmode);
17444 gcc_checking_assert (GET_MODE (sel) == vmode);
17445 gcc_checking_assert (TARGET_SIMD);
17446
17447 if (one_vector_p)
17448 {
17449 if (vmode == V8QImode)
17450 {
17451 /* Expand the argument to a V16QI mode by duplicating it. */
17452 rtx pair = gen_reg_rtx (V16QImode);
17453 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17454 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17455 }
17456 else
17457 {
17458 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17459 }
17460 }
17461 else
17462 {
17463 rtx pair;
17464
17465 if (vmode == V8QImode)
17466 {
17467 pair = gen_reg_rtx (V16QImode);
17468 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17469 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17470 }
17471 else
17472 {
17473 pair = gen_reg_rtx (OImode);
17474 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17475 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17476 }
17477 }
17478 }
17479
17480 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17481 NELT is the number of elements in the vector. */
17482
17483 void
17484 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17485 unsigned int nelt)
17486 {
17487 machine_mode vmode = GET_MODE (target);
17488 bool one_vector_p = rtx_equal_p (op0, op1);
17489 rtx mask;
17490
17491 /* The TBL instruction does not use a modulo index, so we must take care
17492 of that ourselves. */
17493 mask = aarch64_simd_gen_const_vector_dup (vmode,
17494 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17495 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17496
17497 /* For big-endian, we also need to reverse the index within the vector
17498 (but not which vector). */
17499 if (BYTES_BIG_ENDIAN)
17500 {
17501 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17502 if (!one_vector_p)
17503 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17504 sel = expand_simple_binop (vmode, XOR, sel, mask,
17505 NULL, 0, OPTAB_LIB_WIDEN);
17506 }
17507 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17508 }
17509
17510 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17511
17512 static void
17513 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17514 {
17515 emit_insn (gen_rtx_SET (target,
17516 gen_rtx_UNSPEC (GET_MODE (target),
17517 gen_rtvec (2, op0, op1), code)));
17518 }
17519
17520 /* Expand an SVE vec_perm with the given operands. */
17521
17522 void
17523 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17524 {
17525 machine_mode data_mode = GET_MODE (target);
17526 machine_mode sel_mode = GET_MODE (sel);
17527 /* Enforced by the pattern condition. */
17528 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17529
17530 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17531 size of the two value vectors, i.e. the upper bits of the indices
17532 are effectively ignored. SVE TBL instead produces 0 for any
17533 out-of-range indices, so we need to wrap all the vec_perm indices
17534 ourselves to ensure they are all in range. */
17535 rtx sel_reg = force_reg (sel_mode, sel);
17536
17537 /* Check if the sel only references the first values vector. */
17538 if (GET_CODE (sel) == CONST_VECTOR
17539 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17540 {
17541 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17542 return;
17543 }
17544
17545 /* Check if the two values vectors are the same. */
17546 if (rtx_equal_p (op0, op1))
17547 {
17548 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17549 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17550 NULL, 0, OPTAB_DIRECT);
17551 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17552 return;
17553 }
17554
17555 /* Run TBL on each value vector and combine the results. */
17556
17557 rtx res0 = gen_reg_rtx (data_mode);
17558 rtx res1 = gen_reg_rtx (data_mode);
17559 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17560 if (GET_CODE (sel) != CONST_VECTOR
17561 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17562 {
17563 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17564 2 * nunits - 1);
17565 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17566 NULL, 0, OPTAB_DIRECT);
17567 }
17568 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17569 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17570 NULL, 0, OPTAB_DIRECT);
17571 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17572 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17573 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17574 else
17575 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17576 }
17577
17578 /* Recognize patterns suitable for the TRN instructions. */
17579 static bool
17580 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17581 {
17582 HOST_WIDE_INT odd;
17583 poly_uint64 nelt = d->perm.length ();
17584 rtx out, in0, in1, x;
17585 machine_mode vmode = d->vmode;
17586
17587 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17588 return false;
17589
17590 /* Note that these are little-endian tests.
17591 We correct for big-endian later. */
17592 if (!d->perm[0].is_constant (&odd)
17593 || (odd != 0 && odd != 1)
17594 || !d->perm.series_p (0, 2, odd, 2)
17595 || !d->perm.series_p (1, 2, nelt + odd, 2))
17596 return false;
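/* For example, with nelt == 4, odd == 0 matches the permutation
   { 0, 4, 2, 6 } (TRN1) and odd == 1 matches { 1, 5, 3, 7 } (TRN2),
   before any big-endian correction.  */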
17597
17598 /* Success! */
17599 if (d->testing_p)
17600 return true;
17601
17602 in0 = d->op0;
17603 in1 = d->op1;
17604 /* We don't need a big-endian lane correction for SVE; see the comment
17605 at the head of aarch64-sve.md for details. */
17606 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17607 {
17608 x = in0, in0 = in1, in1 = x;
17609 odd = !odd;
17610 }
17611 out = d->target;
17612
17613 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17614 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17615 return true;
17616 }
17617
17618 /* Recognize patterns suitable for the UZP instructions. */
17619 static bool
17620 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17621 {
17622 HOST_WIDE_INT odd;
17623 rtx out, in0, in1, x;
17624 machine_mode vmode = d->vmode;
17625
17626 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17627 return false;
17628
17629 /* Note that these are little-endian tests.
17630 We correct for big-endian later. */
17631 if (!d->perm[0].is_constant (&odd)
17632 || (odd != 0 && odd != 1)
17633 || !d->perm.series_p (0, 1, odd, 2))
17634 return false;
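/* For example, with nelt == 4, odd == 0 matches { 0, 2, 4, 6 } (UZP1)
   and odd == 1 matches { 1, 3, 5, 7 } (UZP2), before any big-endian
   correction.  */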
17635
17636 /* Success! */
17637 if (d->testing_p)
17638 return true;
17639
17640 in0 = d->op0;
17641 in1 = d->op1;
17642 /* We don't need a big-endian lane correction for SVE; see the comment
17643 at the head of aarch64-sve.md for details. */
17644 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17645 {
17646 x = in0, in0 = in1, in1 = x;
17647 odd = !odd;
17648 }
17649 out = d->target;
17650
17651 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17652 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17653 return true;
17654 }
17655
17656 /* Recognize patterns suitable for the ZIP instructions. */
17657 static bool
17658 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17659 {
17660 unsigned int high;
17661 poly_uint64 nelt = d->perm.length ();
17662 rtx out, in0, in1, x;
17663 machine_mode vmode = d->vmode;
17664
17665 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17666 return false;
17667
17668 /* Note that these are little-endian tests.
17669 We correct for big-endian later. */
17670 poly_uint64 first = d->perm[0];
17671 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17672 || !d->perm.series_p (0, 2, first, 1)
17673 || !d->perm.series_p (1, 2, first + nelt, 1))
17674 return false;
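/* For example, with nelt == 4, first == 0 matches { 0, 4, 1, 5 } (ZIP1)
   and first == 2 matches { 2, 6, 3, 7 } (ZIP2), before any big-endian
   correction.  */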
17675 high = maybe_ne (first, 0U);
17676
17677 /* Success! */
17678 if (d->testing_p)
17679 return true;
17680
17681 in0 = d->op0;
17682 in1 = d->op1;
17683 /* We don't need a big-endian lane correction for SVE; see the comment
17684 at the head of aarch64-sve.md for details. */
17685 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17686 {
17687 x = in0, in0 = in1, in1 = x;
17688 high = !high;
17689 }
17690 out = d->target;
17691
17692 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17693 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17694 return true;
17695 }
17696
17697 /* Recognize patterns for the EXT insn. */
17698
17699 static bool
17700 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17701 {
17702 HOST_WIDE_INT location;
17703 rtx offset;
17704
17705 /* The first element always refers to the first vector.
17706 Check if the extracted indices are increasing by one. */
17707 if (d->vec_flags == VEC_SVE_PRED
17708 || !d->perm[0].is_constant (&location)
17709 || !d->perm.series_p (0, 1, location, 1))
17710 return false;
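/* For example, with nelt == 4 the permutation { 1, 2, 3, 4 } matches with
   LOCATION == 1: the last three elements of OP0 followed by the first
   element of OP1.  */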
17711
17712 /* Success! */
17713 if (d->testing_p)
17714 return true;
17715
17716 /* The case where (location == 0) is a no-op for both big- and little-endian,
17717 and is removed by the mid-end at optimization levels -O1 and higher.
17718
17719 We don't need a big-endian lane correction for SVE; see the comment
17720 at the head of aarch64-sve.md for details. */
17721 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17722 {
17723 /* After setup, we want the high elements of the first vector (stored
17724 at the LSB end of the register), and the low elements of the second
17725 vector (stored at the MSB end of the register). So swap. */
17726 std::swap (d->op0, d->op1);
17727 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17728 to_constant () is safe since this is restricted to Advanced SIMD
17729 vectors. */
17730 location = d->perm.length ().to_constant () - location;
17731 }
17732
17733 offset = GEN_INT (location);
17734 emit_set_insn (d->target,
17735 gen_rtx_UNSPEC (d->vmode,
17736 gen_rtvec (3, d->op0, d->op1, offset),
17737 UNSPEC_EXT));
17738 return true;
17739 }
17740
17741 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17742 within each 64-bit, 32-bit or 16-bit granule. */
17743
17744 static bool
17745 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17746 {
17747 HOST_WIDE_INT diff;
17748 unsigned int i, size, unspec;
17749 machine_mode pred_mode;
17750
17751 if (d->vec_flags == VEC_SVE_PRED
17752 || !d->one_vector_p
17753 || !d->perm[0].is_constant (&diff))
17754 return false;
17755
17756 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17757 if (size == 8)
17758 {
17759 unspec = UNSPEC_REV64;
17760 pred_mode = VNx2BImode;
17761 }
17762 else if (size == 4)
17763 {
17764 unspec = UNSPEC_REV32;
17765 pred_mode = VNx4BImode;
17766 }
17767 else if (size == 2)
17768 {
17769 unspec = UNSPEC_REV16;
17770 pred_mode = VNx8BImode;
17771 }
17772 else
17773 return false;
17774
17775 unsigned int step = diff + 1;
17776 for (i = 0; i < step; ++i)
17777 if (!d->perm.series_p (i, step, diff - i, step))
17778 return false;
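/* For example, for a vector of 4-byte elements, diff == 1 gives size == 8
   and hence REV64, matching permutations of the form { 1, 0, 3, 2, ... };
   for 1-byte elements, diff == 1 gives size == 2 and hence REV16.  */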
17779
17780 /* Success! */
17781 if (d->testing_p)
17782 return true;
17783
17784 if (d->vec_flags == VEC_SVE_DATA)
17785 {
17786 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
17787 rtx target = gen_reg_rtx (int_mode);
17788 if (BYTES_BIG_ENDIAN)
17789 /* The act of taking a subreg between INT_MODE and d->vmode
17790 is itself a reversing operation on big-endian targets;
17791 see the comment at the head of aarch64-sve.md for details.
17792 First reinterpret OP0 as INT_MODE without using a subreg
17793 and without changing the contents. */
17794 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
17795 else
17796 {
17797 /* For SVE we use REV[BHW] unspecs derived from the element size
17798 of d->vmode and vector modes whose elements have SIZE bytes.
17799 This ensures that the vector modes match the predicate modes. */
17800 int unspec = aarch64_sve_rev_unspec (d->vmode);
17801 rtx pred = aarch64_ptrue_reg (pred_mode);
17802 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
17803 gen_lowpart (int_mode, d->op0)));
17804 }
17805 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
17806 return true;
17807 }
17808 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17809 emit_set_insn (d->target, src);
17810 return true;
17811 }
17812
17813 /* Recognize patterns for the REV insn, which reverses elements within
17814 a full vector. */
17815
17816 static bool
17817 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17818 {
17819 poly_uint64 nelt = d->perm.length ();
17820
17821 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17822 return false;
17823
17824 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17825 return false;
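/* For example, with 8 elements this matches { 7, 6, 5, 4, 3, 2, 1, 0 }.  */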
17826
17827 /* Success! */
17828 if (d->testing_p)
17829 return true;
17830
17831 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17832 emit_set_insn (d->target, src);
17833 return true;
17834 }
17835
17836 static bool
17837 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17838 {
17839 rtx out = d->target;
17840 rtx in0;
17841 HOST_WIDE_INT elt;
17842 machine_mode vmode = d->vmode;
17843 rtx lane;
17844
17845 if (d->vec_flags == VEC_SVE_PRED
17846 || d->perm.encoding ().encoded_nelts () != 1
17847 || !d->perm[0].is_constant (&elt))
17848 return false;
17849
17850 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17851 return false;
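/* For example, the constant permutation { 2, 2, 2, ... } (a single
   repeated index) becomes a duplicate of lane 2 of OP0.  */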
17852
17853 /* Success! */
17854 if (d->testing_p)
17855 return true;
17856
17857 /* The generic preparation in aarch64_expand_vec_perm_const_1
17858 swaps the operand order and the permute indices if it finds
17859 d->perm[0] to be in the second operand. Thus, we can always
17860 use d->op0 and need not do any extra arithmetic to get the
17861 correct lane number. */
17862 in0 = d->op0;
17863 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17864
17865 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17866 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17867 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17868 return true;
17869 }
17870
17871 static bool
17872 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17873 {
17874 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17875 machine_mode vmode = d->vmode;
17876
17877 /* Make sure that the indices are constant. */
17878 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17879 for (unsigned int i = 0; i < encoded_nelts; ++i)
17880 if (!d->perm[i].is_constant ())
17881 return false;
17882
17883 if (d->testing_p)
17884 return true;
17885
17886 /* Generic code will try constant permutation twice. Once with the
17887 original mode and again with the elements lowered to QImode.
17888 So wait and don't do the selector expansion ourselves. */
17889 if (vmode != V8QImode && vmode != V16QImode)
17890 return false;
17891
17892 /* to_constant is safe since this routine is specific to Advanced SIMD
17893 vectors. */
17894 unsigned int nelt = d->perm.length ().to_constant ();
17895 for (unsigned int i = 0; i < nelt; ++i)
17896 /* If big-endian and two vectors we end up with a weird mixed-endian
17897 mode on NEON. Reverse the index within each word but not the word
17898 itself. to_constant is safe because we checked is_constant above. */
17899 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17900 ? d->perm[i].to_constant () ^ (nelt - 1)
17901 : d->perm[i].to_constant ());
17902
17903 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17904 sel = force_reg (vmode, sel);
17905
17906 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17907 return true;
17908 }
17909
17910 /* Try to implement D using an SVE TBL instruction. */
17911
17912 static bool
17913 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17914 {
17915 unsigned HOST_WIDE_INT nelt;
17916
17917 /* Permuting two variable-length vectors could overflow the
17918 index range. */
17919 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17920 return false;
17921
17922 if (d->testing_p)
17923 return true;
17924
17925 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17926 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17927 if (d->one_vector_p)
17928 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17929 else
17930 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17931 return true;
17932 }
17933
17934 static bool
17935 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17936 {
17937 /* The pattern matching functions above are written to look for a small
17938 number to begin the sequence (0, 1, N/2). If we begin with an index
17939 from the second operand, we can swap the operands. */
17940 poly_int64 nelt = d->perm.length ();
17941 if (known_ge (d->perm[0], nelt))
17942 {
17943 d->perm.rotate_inputs (1);
17944 std::swap (d->op0, d->op1);
17945 }
17946
17947 if ((d->vec_flags == VEC_ADVSIMD
17948 || d->vec_flags == VEC_SVE_DATA
17949 || d->vec_flags == VEC_SVE_PRED)
17950 && known_gt (nelt, 1))
17951 {
17952 if (aarch64_evpc_rev_local (d))
17953 return true;
17954 else if (aarch64_evpc_rev_global (d))
17955 return true;
17956 else if (aarch64_evpc_ext (d))
17957 return true;
17958 else if (aarch64_evpc_dup (d))
17959 return true;
17960 else if (aarch64_evpc_zip (d))
17961 return true;
17962 else if (aarch64_evpc_uzp (d))
17963 return true;
17964 else if (aarch64_evpc_trn (d))
17965 return true;
17966 if (d->vec_flags == VEC_SVE_DATA)
17967 return aarch64_evpc_sve_tbl (d);
17968 else if (d->vec_flags == VEC_ADVSIMD)
17969 return aarch64_evpc_tbl (d);
17970 }
17971 return false;
17972 }
17973
17974 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17975
17976 static bool
17977 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17978 rtx op1, const vec_perm_indices &sel)
17979 {
17980 struct expand_vec_perm_d d;
17981
17982 /* Check whether the mask can be applied to a single vector. */
17983 if (sel.ninputs () == 1
17984 || (op0 && rtx_equal_p (op0, op1)))
17985 d.one_vector_p = true;
17986 else if (sel.all_from_input_p (0))
17987 {
17988 d.one_vector_p = true;
17989 op1 = op0;
17990 }
17991 else if (sel.all_from_input_p (1))
17992 {
17993 d.one_vector_p = true;
17994 op0 = op1;
17995 }
17996 else
17997 d.one_vector_p = false;
17998
17999 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
18000 sel.nelts_per_input ());
18001 d.vmode = vmode;
18002 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
18003 d.target = target;
18004 d.op0 = op0;
18005 d.op1 = op1;
18006 d.testing_p = !target;
18007
18008 if (!d.testing_p)
18009 return aarch64_expand_vec_perm_const_1 (&d);
18010
18011 rtx_insn *last = get_last_insn ();
18012 bool ret = aarch64_expand_vec_perm_const_1 (&d);
18013 gcc_assert (last == get_last_insn ());
18014
18015 return ret;
18016 }
18017
18018 /* Generate a byte permute mask for a register of mode MODE,
18019 which has NUNITS units. */
18020
18021 rtx
18022 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
18023 {
18024 /* We have to reverse each vector because we don't have
18025 a permuted load that can reverse-load according to ABI rules. */
18026 rtx mask;
18027 rtvec v = rtvec_alloc (16);
18028 unsigned int i, j;
18029 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
18030
18031 gcc_assert (BYTES_BIG_ENDIAN);
18032 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
18033
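/* For example, for V8HImode (NUNITS == 8, USIZE == 2) this builds the
   byte mask { 1, 0, 3, 2, 5, 4, ... }, reversing the bytes within each
   element while keeping the element order.  */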
18034 for (i = 0; i < nunits; i++)
18035 for (j = 0; j < usize; j++)
18036 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
18037 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
18038 return force_reg (V16QImode, mask);
18039 }
18040
18041 /* Expand an SVE integer comparison using the SVE equivalent of:
18042
18043 (set TARGET (CODE OP0 OP1)). */
18044
18045 void
18046 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
18047 {
18048 machine_mode pred_mode = GET_MODE (target);
18049 machine_mode data_mode = GET_MODE (op0);
18050 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
18051 op0, op1);
18052 if (!rtx_equal_p (target, res))
18053 emit_move_insn (target, res);
18054 }
18055
18056 /* Return the UNSPEC_COND_* code for comparison CODE. */
18057
18058 static unsigned int
18059 aarch64_unspec_cond_code (rtx_code code)
18060 {
18061 switch (code)
18062 {
18063 case NE:
18064 return UNSPEC_COND_FCMNE;
18065 case EQ:
18066 return UNSPEC_COND_FCMEQ;
18067 case LT:
18068 return UNSPEC_COND_FCMLT;
18069 case GT:
18070 return UNSPEC_COND_FCMGT;
18071 case LE:
18072 return UNSPEC_COND_FCMLE;
18073 case GE:
18074 return UNSPEC_COND_FCMGE;
18075 case UNORDERED:
18076 return UNSPEC_COND_FCMUO;
18077 default:
18078 gcc_unreachable ();
18079 }
18080 }
18081
18082 /* Emit:
18083
18084 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18085
18086 where <X> is the operation associated with comparison CODE.
18087 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18088
18089 static void
18090 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
18091 bool known_ptrue_p, rtx op0, rtx op1)
18092 {
18093 rtx flag = gen_int_mode (known_ptrue_p, SImode);
18094 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
18095 gen_rtvec (4, pred, flag, op0, op1),
18096 aarch64_unspec_cond_code (code));
18097 emit_set_insn (target, unspec);
18098 }
18099
18100 /* Emit the SVE equivalent of:
18101
18102 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
18103 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
18104 (set TARGET (ior:PRED_MODE TMP1 TMP2))
18105
18106 where <Xi> is the operation associated with comparison CODEi.
18107 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18108
18109 static void
18110 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
18111 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
18112 {
18113 machine_mode pred_mode = GET_MODE (pred);
18114 rtx tmp1 = gen_reg_rtx (pred_mode);
18115 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
18116 rtx tmp2 = gen_reg_rtx (pred_mode);
18117 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
18118 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
18119 }
18120
18121 /* Emit the SVE equivalent of:
18122
18123 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
18124 (set TARGET (not TMP))
18125
18126 where <X> is the operation associated with comparison CODE.
18127 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
18128
18129 static void
18130 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
18131 bool known_ptrue_p, rtx op0, rtx op1)
18132 {
18133 machine_mode pred_mode = GET_MODE (pred);
18134 rtx tmp = gen_reg_rtx (pred_mode);
18135 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
18136 aarch64_emit_unop (target, one_cmpl_optab, tmp);
18137 }
18138
18139 /* Expand an SVE floating-point comparison using the SVE equivalent of:
18140
18141 (set TARGET (CODE OP0 OP1))
18142
18143 If CAN_INVERT_P is true, the caller can also handle inverted results;
18144 return true if the result is in fact inverted. */
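/* For example, with !flag_trapping_math an UNGE comparison falls through
   to the code at the bottom of the function: the reverse comparison LT is
   emitted and the result is either reported as inverted (when CAN_INVERT_P)
   or explicitly inverted.  */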
18145
18146 bool
18147 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
18148 rtx op0, rtx op1, bool can_invert_p)
18149 {
18150 machine_mode pred_mode = GET_MODE (target);
18151 machine_mode data_mode = GET_MODE (op0);
18152
18153 rtx ptrue = aarch64_ptrue_reg (pred_mode);
18154 switch (code)
18155 {
18156 case UNORDERED:
18157 /* UNORDERED has no immediate form. */
18158 op1 = force_reg (data_mode, op1);
18159 /* fall through */
18160 case LT:
18161 case LE:
18162 case GT:
18163 case GE:
18164 case EQ:
18165 case NE:
18166 {
18167 /* There is native support for the comparison. */
18168 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18169 return false;
18170 }
18171
18172 case LTGT:
18173 /* This is a trapping operation (LT or GT). */
18174 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
18175 return false;
18176
18177 case UNEQ:
18178 if (!flag_trapping_math)
18179 {
18180 /* This would trap for signaling NaNs. */
18181 op1 = force_reg (data_mode, op1);
18182 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
18183 ptrue, true, op0, op1);
18184 return false;
18185 }
18186 /* fall through */
18187 case UNLT:
18188 case UNLE:
18189 case UNGT:
18190 case UNGE:
18191 if (flag_trapping_math)
18192 {
18193 /* Work out which elements are ordered. */
18194 rtx ordered = gen_reg_rtx (pred_mode);
18195 op1 = force_reg (data_mode, op1);
18196 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
18197 ptrue, true, op0, op1);
18198
18199 /* Test the opposite condition for the ordered elements,
18200 then invert the result. */
18201 if (code == UNEQ)
18202 code = NE;
18203 else
18204 code = reverse_condition_maybe_unordered (code);
18205 if (can_invert_p)
18206 {
18207 aarch64_emit_sve_fp_cond (target, code,
18208 ordered, false, op0, op1);
18209 return true;
18210 }
18211 aarch64_emit_sve_invert_fp_cond (target, code,
18212 ordered, false, op0, op1);
18213 return false;
18214 }
18215 break;
18216
18217 case ORDERED:
18218 /* ORDERED has no immediate form. */
18219 op1 = force_reg (data_mode, op1);
18220 break;
18221
18222 default:
18223 gcc_unreachable ();
18224 }
18225
18226 /* There is native support for the inverse comparison. */
18227 code = reverse_condition_maybe_unordered (code);
18228 if (can_invert_p)
18229 {
18230 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
18231 return true;
18232 }
18233 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
18234 return false;
18235 }
18236
18237 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
18238 of the data being selected and CMP_MODE is the mode of the values being
18239 compared. */
18240
18241 void
18242 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
18243 rtx *ops)
18244 {
18245 machine_mode pred_mode
18246 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
18247 GET_MODE_SIZE (cmp_mode)).require ();
18248 rtx pred = gen_reg_rtx (pred_mode);
18249 if (FLOAT_MODE_P (cmp_mode))
18250 {
18251 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
18252 ops[4], ops[5], true))
18253 std::swap (ops[1], ops[2]);
18254 }
18255 else
18256 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
18257
18258 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
18259 ops[1] = force_reg (data_mode, ops[1]);
18260 /* The "false" value can only be zero if the "true" value is a constant. */
18261 if (register_operand (ops[1], data_mode)
18262 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
18263 ops[2] = force_reg (data_mode, ops[2]);
18264
18265 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
18266 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
18267 }
18268
18269 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
18270 true. However, due to issues with register allocation it is preferable
18271 to avoid tying integer scalar and FP scalar modes. Executing integer
18272 operations in general registers is better than treating them as scalar
18273 vector operations. This reduces latency and avoids redundant int<->FP
18274 moves. So tie modes if they are either the same class, or vector modes
18275 with other vector modes, vector structs or any scalar mode. */
18276
18277 static bool
18278 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
18279 {
18280 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
18281 return true;
18282
18283 /* We specifically want to allow elements of "structure" modes to
18284 be tieable to the structure. This more general condition allows
18285 other rarer situations too. The reason we don't extend this to
18286 predicate modes is that there are no predicate structure modes
18287 nor any specific instructions for extracting part of a predicate
18288 register. */
18289 if (aarch64_vector_data_mode_p (mode1)
18290 && aarch64_vector_data_mode_p (mode2))
18291 return true;
18292
18293 /* Also allow any scalar modes with vectors. */
18294 if (aarch64_vector_mode_supported_p (mode1)
18295 || aarch64_vector_mode_supported_p (mode2))
18296 return true;
18297
18298 return false;
18299 }
18300
18301 /* Return a new RTX holding the result of moving POINTER forward by
18302 AMOUNT bytes. */
18303
18304 static rtx
18305 aarch64_move_pointer (rtx pointer, poly_int64 amount)
18306 {
18307 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
18308
18309 return adjust_automodify_address (pointer, GET_MODE (pointer),
18310 next, amount);
18311 }
18312
18313 /* Return a new RTX holding the result of moving POINTER forward by the
18314 size of the mode it points to. */
18315
18316 static rtx
18317 aarch64_progress_pointer (rtx pointer)
18318 {
18319 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18320 }
18321
18322 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18323 MODE bytes. */
18324
18325 static void
18326 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18327 machine_mode mode)
18328 {
18329 rtx reg = gen_reg_rtx (mode);
18330
18331 /* "Cast" the pointers to the correct mode. */
18332 *src = adjust_address (*src, mode, 0);
18333 *dst = adjust_address (*dst, mode, 0);
18334 /* Emit the memcpy. */
18335 emit_move_insn (reg, *src);
18336 emit_move_insn (*dst, reg);
18337 /* Move the pointers forward. */
18338 *src = aarch64_progress_pointer (*src);
18339 *dst = aarch64_progress_pointer (*dst);
18340 }
18341
18342 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18343 we succeed, otherwise return false. */
18344
18345 bool
18346 aarch64_expand_cpymem (rtx *operands)
18347 {
18348 int n, mode_bits;
18349 rtx dst = operands[0];
18350 rtx src = operands[1];
18351 rtx base;
18352 machine_mode cur_mode = BLKmode, next_mode;
18353 bool speed_p = !optimize_function_for_size_p (cfun);
18354
18355 /* When optimizing for size, give a better estimate of the length of a
18356 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18357 will now always require an even number of instructions. And each
18358 operation requires both a load and a store, so divide the max number by 2.
18359 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18360
18361 /* We can't do anything smart if the amount to copy is not constant. */
18362 if (!CONST_INT_P (operands[2]))
18363 return false;
18364
18365 n = INTVAL (operands[2]);
18366
18367 /* Try to keep the number of instructions low. For all cases we will do at
18368 most two moves for the residual amount, since we'll always overlap the
18369 remainder. */
18370 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18371 return false;
18372
18373 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18374 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18375
18376 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18377 src = adjust_automodify_address (src, VOIDmode, base, 0);
18378
18379 /* Convert n to bits to make the rest of the code simpler. */
18380 n = n * BITS_PER_UNIT;
18381
18382 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18383 larger than TImode, but we should not use them for loads/stores here. */
18384 const int copy_limit = GET_MODE_BITSIZE (TImode);
18385
18386 while (n > 0)
18387 {
18388 /* Find the largest mode in which to do the copy without over-reading
18389 or over-writing. */
18390 opt_scalar_int_mode mode_iter;
18391 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18392 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18393 cur_mode = mode_iter.require ();
18394
18395 gcc_assert (cur_mode != BLKmode);
18396
18397 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18398 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18399
18400 n -= mode_bits;
18401
18402 /* Do certain trailing copies as overlapping if it's going to be
18403 cheaper, i.e. fewer instructions. For instance, for a 15 byte copy
18404 it's more efficient to do two overlapping 8 byte copies than
18405 8 + 6 + 1. */
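/* Concretely, for a 15 byte copy this loop emits a DImode copy at offset 0,
   then moves both pointers back by one byte and emits a second DImode copy
   covering bytes 7-14.  */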
18406 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18407 {
18408 next_mode = smallest_mode_for_size (n, MODE_INT);
18409 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18410 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18411 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18412 n = n_bits;
18413 }
18414 }
18415
18416 return true;
18417 }
18418
18419 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18420 SImode stores. Handle the case when the constant has identical
18421 bottom and top halves. This is beneficial when the two stores can be
18422 merged into an STP and we avoid synthesising potentially expensive
18423 immediates twice. Return true if such a split is possible. */
18424
18425 bool
18426 aarch64_split_dimode_const_store (rtx dst, rtx src)
18427 {
18428 rtx lo = gen_lowpart (SImode, src);
18429 rtx hi = gen_highpart_mode (SImode, DImode, src);
18430
18431 bool size_p = optimize_function_for_size_p (cfun);
18432
18433 if (!rtx_equal_p (lo, hi))
18434 return false;
18435
18436 unsigned int orig_cost
18437 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18438 unsigned int lo_cost
18439 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18440
18441 /* We want to transform:
18442 MOV x1, 49370
18443 MOVK x1, 0x140, lsl 16
18444 MOVK x1, 0xc0da, lsl 32
18445 MOVK x1, 0x140, lsl 48
18446 STR x1, [x0]
18447 into:
18448 MOV w1, 49370
18449 MOVK w1, 0x140, lsl 16
18450 STP w1, w1, [x0]
18451 So we want to perform this only when we save two instructions
18452 or more. When optimizing for size, however, accept any code size
18453 savings we can. */
18454 if (size_p && orig_cost <= lo_cost)
18455 return false;
18456
18457 if (!size_p
18458 && (orig_cost <= lo_cost + 1))
18459 return false;
18460
18461 rtx mem_lo = adjust_address (dst, SImode, 0);
18462 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18463 return false;
18464
18465 rtx tmp_reg = gen_reg_rtx (SImode);
18466 aarch64_expand_mov_immediate (tmp_reg, lo);
18467 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18468 /* Don't emit an explicit store pair as this may not always be profitable.
18469 Let the sched-fusion logic decide whether to merge them. */
18470 emit_move_insn (mem_lo, tmp_reg);
18471 emit_move_insn (mem_hi, tmp_reg);
18472
18473 return true;
18474 }
18475
18476 /* Generate RTL for a conditional branch with rtx comparison CODE in
18477 mode CC_MODE. The destination of the unlikely conditional branch
18478 is LABEL_REF. */
18479
18480 void
18481 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18482 rtx label_ref)
18483 {
18484 rtx x;
18485 x = gen_rtx_fmt_ee (code, VOIDmode,
18486 gen_rtx_REG (cc_mode, CC_REGNUM),
18487 const0_rtx);
18488
18489 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18490 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18491 pc_rtx);
18492 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18493 }
18494
18495 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18496
18497 OP1 represents the TImode destination operand 1
18498 OP2 represents the TImode destination operand 2
18499 LOW_DEST represents the low half (DImode) of TImode operand 0
18500 LOW_IN1 represents the low half (DImode) of TImode operand 1
18501 LOW_IN2 represents the low half (DImode) of TImode operand 2
18502 HIGH_DEST represents the high half (DImode) of TImode operand 0
18503 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18504 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18505
18506 void
18507 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18508 rtx *low_in1, rtx *low_in2,
18509 rtx *high_dest, rtx *high_in1,
18510 rtx *high_in2)
18511 {
18512 *low_dest = gen_reg_rtx (DImode);
18513 *low_in1 = gen_lowpart (DImode, op1);
18514 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18515 subreg_lowpart_offset (DImode, TImode));
18516 *high_dest = gen_reg_rtx (DImode);
18517 *high_in1 = gen_highpart (DImode, op1);
18518 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18519 subreg_highpart_offset (DImode, TImode));
18520 }
18521
18522 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18523
18524 This function differs from 'aarch64_addti_scratch_regs' in that
18525 OP1 can be an immediate constant (zero). We must call
18526 subreg_highpart_offset with DImode and TImode arguments, otherwise
18527 VOIDmode will be used for the const_int, which generates an internal
18528 error from subreg_size_highpart_offset, which does not expect a size of zero.
18529
18530 OP1 represents the TImode destination operand 1
18531 OP2 represents the TImode destination operand 2
18532 LOW_DEST represents the low half (DImode) of TImode operand 0
18533 LOW_IN1 represents the low half (DImode) of TImode operand 1
18534 LOW_IN2 represents the low half (DImode) of TImode operand 2
18535 HIGH_DEST represents the high half (DImode) of TImode operand 0
18536 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18537 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18538
18539
18540 void
18541 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18542 rtx *low_in1, rtx *low_in2,
18543 rtx *high_dest, rtx *high_in1,
18544 rtx *high_in2)
18545 {
18546 *low_dest = gen_reg_rtx (DImode);
18547 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18548 subreg_lowpart_offset (DImode, TImode));
18549
18550 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18551 subreg_lowpart_offset (DImode, TImode));
18552 *high_dest = gen_reg_rtx (DImode);
18553
18554 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18555 subreg_highpart_offset (DImode, TImode));
18556 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18557 subreg_highpart_offset (DImode, TImode));
18558 }
18559
18560 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18561
18562 OP0 represents the TImode destination operand 0
18563 LOW_DEST represents the low half (DImode) of TImode operand 0
18564 LOW_IN1 represents the low half (DImode) of TImode operand 1
18565 LOW_IN2 represents the low half (DImode) of TImode operand 2
18566 HIGH_DEST represents the high half (DImode) of TImode operand 0
18567 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18568 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18569 UNSIGNED_P is true if the operation is being performed on unsigned
18570 values. */
18571 void
18572 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18573 rtx low_in2, rtx high_dest, rtx high_in1,
18574 rtx high_in2, bool unsigned_p)
18575 {
18576 if (low_in2 == const0_rtx)
18577 {
18578 low_dest = low_in1;
18579 high_in2 = force_reg (DImode, high_in2);
18580 if (unsigned_p)
18581 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18582 else
18583 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18584 }
18585 else
18586 {
18587 if (CONST_INT_P (low_in2))
18588 {
18589 high_in2 = force_reg (DImode, high_in2);
18590 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18591 GEN_INT (-INTVAL (low_in2))));
18592 }
18593 else
18594 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18595
18596 if (unsigned_p)
18597 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18598 else
18599 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18600 }
18601
18602 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18603 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18604
18605 }
18606
18607 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18608
18609 static unsigned HOST_WIDE_INT
18610 aarch64_asan_shadow_offset (void)
18611 {
18612 if (TARGET_ILP32)
18613 return (HOST_WIDE_INT_1 << 29);
18614 else
18615 return (HOST_WIDE_INT_1 << 36);
18616 }
18617
18618 static rtx
18619 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18620 int code, tree treeop0, tree treeop1)
18621 {
18622 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18623 rtx op0, op1;
18624 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18625 insn_code icode;
18626 struct expand_operand ops[4];
18627
18628 start_sequence ();
18629 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18630
18631 op_mode = GET_MODE (op0);
18632 if (op_mode == VOIDmode)
18633 op_mode = GET_MODE (op1);
18634
18635 switch (op_mode)
18636 {
18637 case E_QImode:
18638 case E_HImode:
18639 case E_SImode:
18640 cmp_mode = SImode;
18641 icode = CODE_FOR_cmpsi;
18642 break;
18643
18644 case E_DImode:
18645 cmp_mode = DImode;
18646 icode = CODE_FOR_cmpdi;
18647 break;
18648
18649 case E_SFmode:
18650 cmp_mode = SFmode;
18651 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18652 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18653 break;
18654
18655 case E_DFmode:
18656 cmp_mode = DFmode;
18657 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18658 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18659 break;
18660
18661 default:
18662 end_sequence ();
18663 return NULL_RTX;
18664 }
18665
18666 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18667 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18668 if (!op0 || !op1)
18669 {
18670 end_sequence ();
18671 return NULL_RTX;
18672 }
18673 *prep_seq = get_insns ();
18674 end_sequence ();
18675
18676 create_fixed_operand (&ops[0], op0);
18677 create_fixed_operand (&ops[1], op1);
18678
18679 start_sequence ();
18680 if (!maybe_expand_insn (icode, 2, ops))
18681 {
18682 end_sequence ();
18683 return NULL_RTX;
18684 }
18685 *gen_seq = get_insns ();
18686 end_sequence ();
18687
18688 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18689 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18690 }
18691
18692 static rtx
18693 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18694 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18695 {
18696 rtx op0, op1, target;
18697 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18698 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18699 insn_code icode;
18700 struct expand_operand ops[6];
18701 int aarch64_cond;
18702
18703 push_to_sequence (*prep_seq);
18704 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18705
18706 op_mode = GET_MODE (op0);
18707 if (op_mode == VOIDmode)
18708 op_mode = GET_MODE (op1);
18709
18710 switch (op_mode)
18711 {
18712 case E_QImode:
18713 case E_HImode:
18714 case E_SImode:
18715 cmp_mode = SImode;
18716 icode = CODE_FOR_ccmpsi;
18717 break;
18718
18719 case E_DImode:
18720 cmp_mode = DImode;
18721 icode = CODE_FOR_ccmpdi;
18722 break;
18723
18724 case E_SFmode:
18725 cmp_mode = SFmode;
18726 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18727 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18728 break;
18729
18730 case E_DFmode:
18731 cmp_mode = DFmode;
18732 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18733 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18734 break;
18735
18736 default:
18737 end_sequence ();
18738 return NULL_RTX;
18739 }
18740
18741 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18742 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18743 if (!op0 || !op1)
18744 {
18745 end_sequence ();
18746 return NULL_RTX;
18747 }
18748 *prep_seq = get_insns ();
18749 end_sequence ();
18750
18751 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18752 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18753
18754 if (bit_code != AND)
18755 {
18756 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18757 GET_MODE (XEXP (prev, 0))),
18758 VOIDmode, XEXP (prev, 0), const0_rtx);
18759 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18760 }
18761
18762 create_fixed_operand (&ops[0], XEXP (prev, 0));
18763 create_fixed_operand (&ops[1], target);
18764 create_fixed_operand (&ops[2], op0);
18765 create_fixed_operand (&ops[3], op1);
18766 create_fixed_operand (&ops[4], prev);
18767 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18768
18769 push_to_sequence (*gen_seq);
18770 if (!maybe_expand_insn (icode, 6, ops))
18771 {
18772 end_sequence ();
18773 return NULL_RTX;
18774 }
18775
18776 *gen_seq = get_insns ();
18777 end_sequence ();
18778
18779 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18780 }
18781
18782 #undef TARGET_GEN_CCMP_FIRST
18783 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18784
18785 #undef TARGET_GEN_CCMP_NEXT
18786 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18787
18788 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18789 instruction fusion of some sort. */
18790
18791 static bool
18792 aarch64_macro_fusion_p (void)
18793 {
18794 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18795 }
18796
18797
18798 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18799 should be kept together during scheduling. */
18800
18801 static bool
18802 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18803 {
18804 rtx set_dest;
18805 rtx prev_set = single_set (prev);
18806 rtx curr_set = single_set (curr);
18807 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18808 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18809
18810 if (!aarch64_macro_fusion_p ())
18811 return false;
18812
18813 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18814 {
18815 /* We are trying to match:
18816 prev (mov) == (set (reg r0) (const_int imm16))
18817 curr (movk) == (set (zero_extract (reg r0)
18818 (const_int 16)
18819 (const_int 16))
18820 (const_int imm16_1)) */
18821
18822 set_dest = SET_DEST (curr_set);
18823
18824 if (GET_CODE (set_dest) == ZERO_EXTRACT
18825 && CONST_INT_P (SET_SRC (curr_set))
18826 && CONST_INT_P (SET_SRC (prev_set))
18827 && CONST_INT_P (XEXP (set_dest, 2))
18828 && INTVAL (XEXP (set_dest, 2)) == 16
18829 && REG_P (XEXP (set_dest, 0))
18830 && REG_P (SET_DEST (prev_set))
18831 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18832 {
18833 return true;
18834 }
18835 }
18836
18837 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18838 {
18839
18840 /* We're trying to match:
18841 prev (adrp) == (set (reg r1)
18842 (high (symbol_ref ("SYM"))))
18843 curr (add) == (set (reg r0)
18844 (lo_sum (reg r1)
18845 (symbol_ref ("SYM"))))
18846 Note that r0 need not necessarily be the same as r1, especially
18847 during pre-regalloc scheduling. */
18848
18849 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18850 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18851 {
18852 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18853 && REG_P (XEXP (SET_SRC (curr_set), 0))
18854 && REGNO (XEXP (SET_SRC (curr_set), 0))
18855 == REGNO (SET_DEST (prev_set))
18856 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18857 XEXP (SET_SRC (curr_set), 1)))
18858 return true;
18859 }
18860 }
18861
18862 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18863 {
18864
18865 /* We're trying to match:
18866 prev (movk) == (set (zero_extract (reg r0)
18867 (const_int 16)
18868 (const_int 32))
18869 (const_int imm16_1))
18870 curr (movk) == (set (zero_extract (reg r0)
18871 (const_int 16)
18872 (const_int 48))
18873 (const_int imm16_2)) */
18874
18875 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18876 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18877 && REG_P (XEXP (SET_DEST (prev_set), 0))
18878 && REG_P (XEXP (SET_DEST (curr_set), 0))
18879 && REGNO (XEXP (SET_DEST (prev_set), 0))
18880 == REGNO (XEXP (SET_DEST (curr_set), 0))
18881 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18882 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18883 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18884 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18885 && CONST_INT_P (SET_SRC (prev_set))
18886 && CONST_INT_P (SET_SRC (curr_set)))
18887 return true;
18888
18889 }
18890 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18891 {
18892 /* We're trying to match:
18893 prev (adrp) == (set (reg r0)
18894 (high (symbol_ref ("SYM"))))
18895 curr (ldr) == (set (reg r1)
18896 (mem (lo_sum (reg r0)
18897 (symbol_ref ("SYM")))))
18898 or
18899 curr (ldr) == (set (reg r1)
18900 (zero_extend (mem
18901 (lo_sum (reg r0)
18902 (symbol_ref ("SYM")))))) */
18903 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18904 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18905 {
18906 rtx curr_src = SET_SRC (curr_set);
18907
18908 if (GET_CODE (curr_src) == ZERO_EXTEND)
18909 curr_src = XEXP (curr_src, 0);
18910
18911 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18912 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18913 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18914 == REGNO (SET_DEST (prev_set))
18915 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18916 XEXP (SET_SRC (prev_set), 0)))
18917 return true;
18918 }
18919 }
18920
18921 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18922 && any_condjump_p (curr))
18923 {
18924 unsigned int condreg1, condreg2;
18925 rtx cc_reg_1;
18926 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18927 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18928
18929 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18930 && prev
18931 && modified_in_p (cc_reg_1, prev))
18932 {
18933 enum attr_type prev_type = get_attr_type (prev);
18934
18935 /* FIXME: this misses some instructions that are considered simple
18936 arithmetic for ThunderX. Simple shifts are missed here. */
18937 if (prev_type == TYPE_ALUS_SREG
18938 || prev_type == TYPE_ALUS_IMM
18939 || prev_type == TYPE_LOGICS_REG
18940 || prev_type == TYPE_LOGICS_IMM)
18941 return true;
18942 }
18943 }
18944
18945 if (prev_set
18946 && curr_set
18947 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18948 && any_condjump_p (curr))
18949 {
18950 /* We're trying to match:
18951 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18952 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18953 (const_int 0))
18954 (label_ref ("SYM"))
18955 (pc)) */
18956 if (SET_DEST (curr_set) == (pc_rtx)
18957 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18958 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18959 && REG_P (SET_DEST (prev_set))
18960 && REGNO (SET_DEST (prev_set))
18961 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18962 {
18963 /* Fuse ALU operations followed by conditional branch instruction. */
18964 switch (get_attr_type (prev))
18965 {
18966 case TYPE_ALU_IMM:
18967 case TYPE_ALU_SREG:
18968 case TYPE_ADC_REG:
18969 case TYPE_ADC_IMM:
18970 case TYPE_ADCS_REG:
18971 case TYPE_ADCS_IMM:
18972 case TYPE_LOGIC_REG:
18973 case TYPE_LOGIC_IMM:
18974 case TYPE_CSEL:
18975 case TYPE_ADR:
18976 case TYPE_MOV_IMM:
18977 case TYPE_SHIFT_REG:
18978 case TYPE_SHIFT_IMM:
18979 case TYPE_BFM:
18980 case TYPE_RBIT:
18981 case TYPE_REV:
18982 case TYPE_EXTEND:
18983 return true;
18984
18985 default:;
18986 }
18987 }
18988 }
18989
18990 return false;
18991 }
18992
18993 /* Return true iff the instruction fusion described by OP is enabled. */
18994
18995 bool
18996 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18997 {
18998 return (aarch64_tune_params.fusible_ops & op) != 0;
18999 }
19000
19001 /* If MEM is in the form of [base+offset], extract the two parts
19002 of the address and store them in BASE and OFFSET; otherwise return false
19003 after clearing BASE and OFFSET. */
19004
19005 bool
19006 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
19007 {
19008 rtx addr;
19009
19010 gcc_assert (MEM_P (mem));
19011
19012 addr = XEXP (mem, 0);
19013
19014 if (REG_P (addr))
19015 {
19016 *base = addr;
19017 *offset = const0_rtx;
19018 return true;
19019 }
19020
19021 if (GET_CODE (addr) == PLUS
19022 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
19023 {
19024 *base = XEXP (addr, 0);
19025 *offset = XEXP (addr, 1);
19026 return true;
19027 }
19028
19029 *base = NULL_RTX;
19030 *offset = NULL_RTX;
19031
19032 return false;
19033 }
19034
19035 /* Types for scheduling fusion. */
19036 enum sched_fusion_type
19037 {
19038 SCHED_FUSION_NONE = 0,
19039 SCHED_FUSION_LD_SIGN_EXTEND,
19040 SCHED_FUSION_LD_ZERO_EXTEND,
19041 SCHED_FUSION_LD,
19042 SCHED_FUSION_ST,
19043 SCHED_FUSION_NUM
19044 };
19045
19046 /* If INSN is a load or store whose address is in the form of [base+offset],
19047 extract the two parts into BASE and OFFSET. Return the scheduling
19048 fusion type of this INSN. */
19049
19050 static enum sched_fusion_type
19051 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
19052 {
19053 rtx x, dest, src;
19054 enum sched_fusion_type fusion = SCHED_FUSION_LD;
19055
19056 gcc_assert (INSN_P (insn));
19057 x = PATTERN (insn);
19058 if (GET_CODE (x) != SET)
19059 return SCHED_FUSION_NONE;
19060
19061 src = SET_SRC (x);
19062 dest = SET_DEST (x);
19063
19064 machine_mode dest_mode = GET_MODE (dest);
19065
19066 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
19067 return SCHED_FUSION_NONE;
19068
19069 if (GET_CODE (src) == SIGN_EXTEND)
19070 {
19071 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
19072 src = XEXP (src, 0);
19073 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19074 return SCHED_FUSION_NONE;
19075 }
19076 else if (GET_CODE (src) == ZERO_EXTEND)
19077 {
19078 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
19079 src = XEXP (src, 0);
19080 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
19081 return SCHED_FUSION_NONE;
19082 }
19083
19084 if (GET_CODE (src) == MEM && REG_P (dest))
19085 extract_base_offset_in_addr (src, base, offset);
19086 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
19087 {
19088 fusion = SCHED_FUSION_ST;
19089 extract_base_offset_in_addr (dest, base, offset);
19090 }
19091 else
19092 return SCHED_FUSION_NONE;
19093
19094 if (*base == NULL_RTX || *offset == NULL_RTX)
19095 fusion = SCHED_FUSION_NONE;
19096
19097 return fusion;
19098 }
19099
19100 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
19101
19102 Currently we only support fusing ldr and str instructions, so FUSION_PRI
19103 and PRI are only calculated for these instructions. For other instructions,
19104 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
19105 types of instruction fusion can be added by returning different priorities.
19106
19107 It's important that irrelevant instructions get the largest FUSION_PRI. */
19108
19109 static void
19110 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
19111 int *fusion_pri, int *pri)
19112 {
19113 int tmp, off_val;
19114 rtx base, offset;
19115 enum sched_fusion_type fusion;
19116
19117 gcc_assert (INSN_P (insn));
19118
19119 tmp = max_pri - 1;
19120 fusion = fusion_load_store (insn, &base, &offset);
19121 if (fusion == SCHED_FUSION_NONE)
19122 {
19123 *pri = tmp;
19124 *fusion_pri = tmp;
19125 return;
19126 }
19127
19128 /* Set FUSION_PRI according to fusion type and base register. */
19129 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
19130
19131 /* Calculate PRI. */
19132 tmp /= 2;
19133
19134 /* INSN with smaller offset goes first. */
19135 off_val = (int)(INTVAL (offset));
19136 if (off_val >= 0)
19137 tmp -= (off_val & 0xfffff);
19138 else
19139 tmp += ((- off_val) & 0xfffff);
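/* For example, for two loads at [base, 8] and [base, 16], the one at
   offset 8 ends up with the larger PRI, so it is scheduled first.  */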
19140
19141 *pri = tmp;
19142 return;
19143 }
19144
19145 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
19146 Adjust priority of sha1h instructions so they are scheduled before
19147 other SHA1 instructions. */
19148
19149 static int
19150 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
19151 {
19152 rtx x = PATTERN (insn);
19153
19154 if (GET_CODE (x) == SET)
19155 {
19156 x = SET_SRC (x);
19157
19158 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
19159 return priority + 10;
19160 }
19161
19162 return priority;
19163 }
19164
19165 /* Given OPERANDS of consecutive load/store, check if we can merge
19166 them into ldp/stp. LOAD is true if they are load instructions.
19167 MODE is the mode of memory operands. */
19168
19169 bool
19170 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
19171 machine_mode mode)
19172 {
19173 HOST_WIDE_INT offval_1, offval_2, msize;
19174 enum reg_class rclass_1, rclass_2;
19175 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
19176
19177 if (load)
19178 {
19179 mem_1 = operands[1];
19180 mem_2 = operands[3];
19181 reg_1 = operands[0];
19182 reg_2 = operands[2];
19183 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
19184 if (REGNO (reg_1) == REGNO (reg_2))
19185 return false;
19186 }
19187 else
19188 {
19189 mem_1 = operands[0];
19190 mem_2 = operands[2];
19191 reg_1 = operands[1];
19192 reg_2 = operands[3];
19193 }
19194
19195 /* The mems cannot be volatile. */
19196 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
19197 return false;
19198
19199 /* If we have SImode and slow unaligned ldp,
19200 check that the alignment is at least 8 bytes. */
19201 if (mode == SImode
19202 && (aarch64_tune_params.extra_tuning_flags
19203 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19204 && !optimize_size
19205 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
19206 return false;
19207
19208 /* Check if the addresses are in the form of [base+offset]. */
19209 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19210 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
19211 return false;
19212 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19213 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
19214 return false;
19215
19216 /* Check if the bases are the same. */
19217 if (!rtx_equal_p (base_1, base_2))
19218 return false;
19219
19220 /* The operands must be of the same size. */
19221 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
19222 GET_MODE_SIZE (GET_MODE (mem_2))));
19223
19224 offval_1 = INTVAL (offset_1);
19225 offval_2 = INTVAL (offset_2);
19226 /* We should only be trying this for fixed-sized modes. There is no
19227 SVE LDP/STP instruction. */
19228 msize = GET_MODE_SIZE (mode).to_constant ();
19229 /* Check if the offsets are consecutive. */
19230 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
19231 return false;
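/* For example, DImode accesses at [base, 16] and [base, 24] pass this
   check (msize == 8), whereas offsets 16 and 32 do not.  */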
19232
19233 /* Check if the addresses are clobbered by load. */
19234 if (load)
19235 {
19236 if (reg_mentioned_p (reg_1, mem_1))
19237 return false;
19238
19239 /* In increasing order, the last load can clobber the address. */
19240 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
19241 return false;
19242 }
19243
19244 /* One of the memory accesses must be a mempair operand.
19245 If it is not the first one, they need to be swapped by the
19246 peephole. */
19247 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
19248 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
19249 return false;
19250
19251 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
19252 rclass_1 = FP_REGS;
19253 else
19254 rclass_1 = GENERAL_REGS;
19255
19256 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
19257 rclass_2 = FP_REGS;
19258 else
19259 rclass_2 = GENERAL_REGS;
19260
19261 /* Check if the registers are of the same class. */
19262 if (rclass_1 != rclass_2)
19263 return false;
19264
19265 return true;
19266 }
19267
19268 /* Given OPERANDS of consecutive load/store that can be merged,
19269 swap them if they are not in ascending order. */
19270 void
19271 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
19272 {
19273 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
19274 HOST_WIDE_INT offval_1, offval_2;
19275
19276 if (load)
19277 {
19278 mem_1 = operands[1];
19279 mem_2 = operands[3];
19280 }
19281 else
19282 {
19283 mem_1 = operands[0];
19284 mem_2 = operands[2];
19285 }
19286
19287 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
19288 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
19289
19290 offval_1 = INTVAL (offset_1);
19291 offval_2 = INTVAL (offset_2);
19292
19293 if (offval_1 > offval_2)
19294 {
19295 /* Irrespective of whether this is a load or a store,
19296 we do the same swap. */
19297 std::swap (operands[0], operands[2]);
19298 std::swap (operands[1], operands[3]);
19299 }
19300 }
19301
19302 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
19303 comparison between the two. */
19304 int
19305 aarch64_host_wide_int_compare (const void *x, const void *y)
19306 {
19307 return wi::cmps (* ((const HOST_WIDE_INT *) x),
19308 * ((const HOST_WIDE_INT *) y));
19309 }
19310
19311 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
19312 other pointing to a REG rtx containing an offset, compare the offsets
19313 of the two pairs.
19314
19315 Return:
19316
19317 1 iff offset (X) > offset (Y)
19318 0 iff offset (X) == offset (Y)
19319 -1 iff offset (X) < offset (Y) */
19320 int
19321 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19322 {
19323 const rtx * operands_1 = (const rtx *) x;
19324 const rtx * operands_2 = (const rtx *) y;
19325 rtx mem_1, mem_2, base, offset_1, offset_2;
19326
19327 if (MEM_P (operands_1[0]))
19328 mem_1 = operands_1[0];
19329 else
19330 mem_1 = operands_1[1];
19331
19332 if (MEM_P (operands_2[0]))
19333 mem_2 = operands_2[0];
19334 else
19335 mem_2 = operands_2[1];
19336
19337 /* Extract the offsets. */
19338 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19339 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19340
19341 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19342
19343 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19344 }
19345
19346 /* Given OPERANDS of consecutive load/store, check if we can merge
19347 them into ldp/stp by adjusting the offset. LOAD is true if they
19348 are load instructions. MODE is the mode of memory operands.
19349
19350 Given the consecutive stores below:
19351
19352 str w1, [xb, 0x100]
19353 str w1, [xb, 0x104]
19354 str w1, [xb, 0x108]
19355 str w1, [xb, 0x10c]
19356
19357 Though the offsets are out of the range supported by stp, we can
19358 still pair them after adjusting the offset, like:
19359
19360 add scratch, xb, 0x100
19361 stp w1, w1, [scratch]
19362 stp w1, w1, [scratch, 0x8]
19363
19364 The peephole patterns detecting this opportunity should guarantee
19365 the scratch register is available. */
19366
19367 bool
19368 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19369 scalar_mode mode)
19370 {
19371 const int num_insns = 4;
19372 enum reg_class rclass;
19373 HOST_WIDE_INT offvals[num_insns], msize;
19374 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19375
19376 if (load)
19377 {
19378 for (int i = 0; i < num_insns; i++)
19379 {
19380 reg[i] = operands[2 * i];
19381 mem[i] = operands[2 * i + 1];
19382
19383 gcc_assert (REG_P (reg[i]));
19384 }
19385
19386 /* Do not attempt to merge the loads if the loads clobber each other. */
19387 for (int i = 0; i < 8; i += 2)
19388 for (int j = i + 2; j < 8; j += 2)
19389 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19390 return false;
19391 }
19392 else
19393 for (int i = 0; i < num_insns; i++)
19394 {
19395 mem[i] = operands[2 * i];
19396 reg[i] = operands[2 * i + 1];
19397 }
19398
19399 /* Skip if the memory operand is by itself already valid for ldp/stp. */
19400 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19401 return false;
19402
19403 for (int i = 0; i < num_insns; i++)
19404 {
19405 /* The mems cannot be volatile. */
19406 if (MEM_VOLATILE_P (mem[i]))
19407 return false;
19408
19409 /* Check if the addresses are in the form of [base+offset]. */
19410 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19411 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19412 return false;
19413 }
19414
19415 /* Check if the registers are of the same class. */
19416 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19417 ? FP_REGS : GENERAL_REGS;
19418
19419 for (int i = 1; i < num_insns; i++)
19420 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19421 {
19422 if (rclass != FP_REGS)
19423 return false;
19424 }
19425 else
19426 {
19427 if (rclass != GENERAL_REGS)
19428 return false;
19429 }
19430
19431 /* Only the last register in the order in which they occur
19432 may be clobbered by the load. */
19433 if (rclass == GENERAL_REGS && load)
19434 for (int i = 0; i < num_insns - 1; i++)
19435 if (reg_mentioned_p (reg[i], mem[i]))
19436 return false;
19437
19438 /* Check if the bases are the same. */
19439 for (int i = 0; i < num_insns - 1; i++)
19440 if (!rtx_equal_p (base[i], base[i + 1]))
19441 return false;
19442
19443 for (int i = 0; i < num_insns; i++)
19444 offvals[i] = INTVAL (offset[i]);
19445
19446 msize = GET_MODE_SIZE (mode);
19447
19448 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19449 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19450 aarch64_host_wide_int_compare);
19451
19452 if (!(offvals[1] == offvals[0] + msize
19453 && offvals[3] == offvals[2] + msize))
19454 return false;
19455
19456 /* Check that the offsets are within range of each other. The ldp/stp
19457 instructions have 7-bit immediate offsets, so use 0x80. */
19458 if (offvals[2] - offvals[0] >= msize * 0x80)
19459 return false;
19460
19461 /* The offsets must be aligned with respect to each other. */
19462 if (offvals[0] % msize != offvals[2] % msize)
19463 return false;
19464
19465 /* If we have SImode and slow unaligned ldp,
19466 check that the alignment is at least 8 bytes. */
19467 if (mode == SImode
19468 && (aarch64_tune_params.extra_tuning_flags
19469 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19470 && !optimize_size
19471 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19472 return false;
19473
19474 return true;
19475 }
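
/* Illustrative walk-through of the checks above, using the four SImode
   stores from the comment before this function (offsets 0x100, 0x104,
   0x108 and 0x10c, msize == 4): mem[0] is not already a valid mem-pair
   operand (0x100 is outside the SImode stp range), the sorted offsets
   satisfy offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
   offvals[2] - offvals[0] == 8 is well below 4 * 0x80, and
   offvals[0] % 4 == offvals[2] % 4, so the function returns true.  */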
19476
19477 /* Given OPERANDS of consecutive load/store, this function pairs them
19478 into LDP/STP after adjusting the offset. It depends on the fact
19479 that the operands can be sorted so the offsets are correct for STP.
19480 MODE is the mode of the memory operands. CODE is the rtl operator
19481 which should be applied to all memory operands; it is SIGN_EXTEND,
19482 ZERO_EXTEND or UNKNOWN. */
19483
19484 bool
19485 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19486 scalar_mode mode, RTX_CODE code)
19487 {
19488 rtx base, offset_1, offset_3, t1, t2;
19489 rtx mem_1, mem_2, mem_3, mem_4;
19490 rtx temp_operands[8];
19491 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19492 stp_off_upper_limit, stp_off_lower_limit, msize;
19493
19494 /* We make changes on a copy as we may still bail out. */
19495 for (int i = 0; i < 8; i ++)
19496 temp_operands[i] = operands[i];
19497
19498 /* Sort the operands. */
19499 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19500
19501 /* Copy the memory operands so that if we have to bail for some
19502 reason the original addresses are unchanged. */
19503 if (load)
19504 {
19505 mem_1 = copy_rtx (temp_operands[1]);
19506 mem_2 = copy_rtx (temp_operands[3]);
19507 mem_3 = copy_rtx (temp_operands[5]);
19508 mem_4 = copy_rtx (temp_operands[7]);
19509 }
19510 else
19511 {
19512 mem_1 = copy_rtx (temp_operands[0]);
19513 mem_2 = copy_rtx (temp_operands[2]);
19514 mem_3 = copy_rtx (temp_operands[4]);
19515 mem_4 = copy_rtx (temp_operands[6]);
19516 gcc_assert (code == UNKNOWN);
19517 }
19518
19519 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19520 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19521 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19522 && offset_3 != NULL_RTX);
19523
19524 /* Adjust the offset so it can fit in an LDP/STP instruction. */
19525 msize = GET_MODE_SIZE (mode);
19526 stp_off_upper_limit = msize * (0x40 - 1);
19527 stp_off_lower_limit = - msize * 0x40;
19528
19529 off_val_1 = INTVAL (offset_1);
19530 off_val_3 = INTVAL (offset_3);
19531
19532 /* The base offset is optimally halfway between the two STP/LDP offsets. */
19533 if (msize <= 4)
19534 base_off = (off_val_1 + off_val_3) / 2;
19535 else
19536 /* However, due to issues with negative LDP/STP offset generation for
19537 larger modes (DF, DI and vector modes), we must not use negative
19538 addresses smaller than 9 signed unadjusted bits can store. This
19539 provides the most range in this case. */
19540 base_off = off_val_1;
19541
19542 /* Adjust the base so that it is aligned with the addresses but still
19543 optimal. */
19544 if (base_off % msize != off_val_1 % msize)
19545 /* Fix the offset, bearing in mind we want to make it bigger not
19546 smaller. */
19547 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19548 else if (msize <= 4)
19549 /* The negative range of LDP/STP is one larger than the positive range. */
19550 base_off += msize;
19551
19552 /* Check if base offset is too big or too small. We can attempt to resolve
19553 this issue by setting it to the maximum value and seeing if the offsets
19554 still fit. */
19555 if (base_off >= 0x1000)
19556 {
19557 base_off = 0x1000 - 1;
19558 /* We must still make sure that the base offset is aligned with respect
19559 to the address. But it may not be made any bigger. */
19560 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19561 }
19562
19563 /* Likewise for the case where the base is too small. */
19564 if (base_off <= -0x1000)
19565 {
19566 base_off = -0x1000 + 1;
19567 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19568 }
19569
19570 /* Offset of the first STP/LDP. */
19571 new_off_1 = off_val_1 - base_off;
19572
19573 /* Offset of the second STP/LDP. */
19574 new_off_3 = off_val_3 - base_off;
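
/* Continuing the illustrative SImode example (msize == 4, off_val_1 ==
   0x100, off_val_3 == 0x108): base_off starts at the midpoint 0x104, which
   is already aligned with off_val_1, so it is bumped by msize to 0x108.
   That gives new_off_1 == -8 and new_off_3 == 0, both comfortably within
   the [-0x100, 0xfc] range computed above.  */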
19575
19576 /* The offsets must be within the range of the LDP/STP instructions. */
19577 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19578 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19579 return false;
19580
19581 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19582 new_off_1), true);
19583 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19584 new_off_1 + msize), true);
19585 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19586 new_off_3), true);
19587 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19588 new_off_3 + msize), true);
19589
19590 if (!aarch64_mem_pair_operand (mem_1, mode)
19591 || !aarch64_mem_pair_operand (mem_3, mode))
19592 return false;
19593
19594 if (code == ZERO_EXTEND)
19595 {
19596 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19597 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19598 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19599 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19600 }
19601 else if (code == SIGN_EXTEND)
19602 {
19603 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19604 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19605 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19606 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19607 }
19608
19609 if (load)
19610 {
19611 operands[0] = temp_operands[0];
19612 operands[1] = mem_1;
19613 operands[2] = temp_operands[2];
19614 operands[3] = mem_2;
19615 operands[4] = temp_operands[4];
19616 operands[5] = mem_3;
19617 operands[6] = temp_operands[6];
19618 operands[7] = mem_4;
19619 }
19620 else
19621 {
19622 operands[0] = mem_1;
19623 operands[1] = temp_operands[1];
19624 operands[2] = mem_2;
19625 operands[3] = temp_operands[3];
19626 operands[4] = mem_3;
19627 operands[5] = temp_operands[5];
19628 operands[6] = mem_4;
19629 operands[7] = temp_operands[7];
19630 }
19631
19632 /* Emit adjusting instruction. */
19633 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19634 /* Emit ldp/stp instructions. */
19635 t1 = gen_rtx_SET (operands[0], operands[1]);
19636 t2 = gen_rtx_SET (operands[2], operands[3]);
19637 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19638 t1 = gen_rtx_SET (operands[4], operands[5]);
19639 t2 = gen_rtx_SET (operands[6], operands[7]);
19640 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19641 return true;
19642 }
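
/* For the illustrative example above this emits (the scratch register name
   is arbitrary):

     add  scratch, xb, 0x108
     stp  w1, w1, [scratch, -8]
     stp  w1, w1, [scratch]

   which stores to the same four words as the original str instructions.  */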
19643
19644 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19645 it isn't worth branching around empty masked ops (including masked
19646 stores). */
19647
19648 static bool
19649 aarch64_empty_mask_is_expensive (unsigned)
19650 {
19651 return false;
19652 }
19653
19654 /* Return true if a pseudo register should be created and used to hold
19655 the GOT address for PIC code. */
19656
19657 bool
19658 aarch64_use_pseudo_pic_reg (void)
19659 {
19660 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19661 }
19662
19663 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19664
19665 static int
19666 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19667 {
19668 switch (XINT (x, 1))
19669 {
19670 case UNSPEC_GOTSMALLPIC:
19671 case UNSPEC_GOTSMALLPIC28K:
19672 case UNSPEC_GOTTINYPIC:
19673 return 0;
19674 default:
19675 break;
19676 }
19677
19678 return default_unspec_may_trap_p (x, flags);
19679 }
19680
19681
19682 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19683 return the log2 of that value. Otherwise return -1. */
19684
19685 int
19686 aarch64_fpconst_pow_of_2 (rtx x)
19687 {
19688 const REAL_VALUE_TYPE *r;
19689
19690 if (!CONST_DOUBLE_P (x))
19691 return -1;
19692
19693 r = CONST_DOUBLE_REAL_VALUE (x);
19694
19695 if (REAL_VALUE_NEGATIVE (*r)
19696 || REAL_VALUE_ISNAN (*r)
19697 || REAL_VALUE_ISINF (*r)
19698 || !real_isinteger (r, DFmode))
19699 return -1;
19700
19701 return exact_log2 (real_to_integer (r));
19702 }
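
/* For example, a CONST_DOUBLE of 8.0 yields 3, while 3.0 (not a power of
   two), 0.5 (not an integer) and -4.0 (negative) all yield -1.  */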
19703
19704 /* If X is a vector of equal CONST_DOUBLE values and that value is
19705 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19706
19707 int
19708 aarch64_vec_fpconst_pow_of_2 (rtx x)
19709 {
19710 int nelts;
19711 if (GET_CODE (x) != CONST_VECTOR
19712 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19713 return -1;
19714
19715 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19716 return -1;
19717
19718 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19719 if (firstval <= 0)
19720 return -1;
19721
19722 for (int i = 1; i < nelts; i++)
19723 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19724 return -1;
19725
19726 return firstval;
19727 }
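
/* For example, a V2DF constant { 8.0, 8.0 } yields 3, while { 8.0, 4.0 }
   yields -1 because the elements differ.  Note that { 1.0, 1.0 } also
   yields -1, since a first-element log2 of 0 is rejected above.  */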
19728
19729 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19730 to float.
19731
19732 __fp16 always promotes through this hook.
19733 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19734 through the generic excess precision logic rather than here. */
19735
19736 static tree
19737 aarch64_promoted_type (const_tree t)
19738 {
19739 if (SCALAR_FLOAT_TYPE_P (t)
19740 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19741 return float_type_node;
19742
19743 return NULL_TREE;
19744 }
19745
19746 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19747
19748 static bool
19749 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19750 optimization_type opt_type)
19751 {
19752 switch (op)
19753 {
19754 case rsqrt_optab:
19755 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19756
19757 default:
19758 return true;
19759 }
19760 }
19761
19762 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19763
19764 static unsigned int
19765 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19766 int *offset)
19767 {
19768 /* Polynomial invariant 1 == (VG / 2) - 1. */
19769 gcc_assert (i == 1);
19770 *factor = 2;
19771 *offset = 1;
19772 return AARCH64_DWARF_VG;
19773 }
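
/* For instance, with a 256-bit SVE vector length the VG register holds 4
   (four 64-bit granules), so the indeterminate evaluates to 4 / 2 - 1 == 1,
   matching the invariant quoted above.  */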
19774
19775 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
19776 if MODE is HFmode, and punt to the generic implementation otherwise. */
19777
19778 static bool
19779 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19780 {
19781 return (mode == HFmode
19782 ? true
19783 : default_libgcc_floating_mode_supported_p (mode));
19784 }
19785
19786 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19787 if MODE is HFmode, and punt to the generic implementation otherwise. */
19788
19789 static bool
19790 aarch64_scalar_mode_supported_p (scalar_mode mode)
19791 {
19792 return (mode == HFmode
19793 ? true
19794 : default_scalar_mode_supported_p (mode));
19795 }
19796
19797 /* Set the value of FLT_EVAL_METHOD.
19798 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19799
19800 0: evaluate all operations and constants, whose semantic type has at
19801 most the range and precision of type float, to the range and
19802 precision of float; evaluate all other operations and constants to
19803 the range and precision of the semantic type;
19804
19805 N, where _FloatN is a supported interchange floating type:
19806 evaluate all operations and constants, whose semantic type has at
19807 most the range and precision of the _FloatN type, to the range and
19808 precision of the _FloatN type; evaluate all other operations and
19809 constants to the range and precision of the semantic type;
19810
19811 If we have the ARMv8.2-A extensions then we support _Float16 in native
19812 precision, so we should set this to 16. Otherwise, we support the type,
19813 but want to evaluate expressions in float precision, so set this to
19814 0. */
19815
19816 static enum flt_eval_method
19817 aarch64_excess_precision (enum excess_precision_type type)
19818 {
19819 switch (type)
19820 {
19821 case EXCESS_PRECISION_TYPE_FAST:
19822 case EXCESS_PRECISION_TYPE_STANDARD:
19823 /* We can calculate either in 16-bit range and precision or
19824 32-bit range and precision. Make that decision based on whether
19825 we have native support for the ARMv8.2-A 16-bit floating-point
19826 instructions or not. */
19827 return (TARGET_FP_F16INST
19828 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19829 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19830 case EXCESS_PRECISION_TYPE_IMPLICIT:
19831 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19832 default:
19833 gcc_unreachable ();
19834 }
19835 return FLT_EVAL_METHOD_UNPREDICTABLE;
19836 }
19837
19838 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19839 scheduled for speculative execution. Reject the long-running division
19840 and square-root instructions. */
19841
19842 static bool
19843 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19844 {
19845 switch (get_attr_type (insn))
19846 {
19847 case TYPE_SDIV:
19848 case TYPE_UDIV:
19849 case TYPE_FDIVS:
19850 case TYPE_FDIVD:
19851 case TYPE_FSQRTS:
19852 case TYPE_FSQRTD:
19853 case TYPE_NEON_FP_SQRT_S:
19854 case TYPE_NEON_FP_SQRT_D:
19855 case TYPE_NEON_FP_SQRT_S_Q:
19856 case TYPE_NEON_FP_SQRT_D_Q:
19857 case TYPE_NEON_FP_DIV_S:
19858 case TYPE_NEON_FP_DIV_D:
19859 case TYPE_NEON_FP_DIV_S_Q:
19860 case TYPE_NEON_FP_DIV_D_Q:
19861 return false;
19862 default:
19863 return true;
19864 }
19865 }
19866
19867 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19868
19869 static int
19870 aarch64_compute_pressure_classes (reg_class *classes)
19871 {
19872 int i = 0;
19873 classes[i++] = GENERAL_REGS;
19874 classes[i++] = FP_REGS;
19875 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19876 registers need to go in PR_LO_REGS at some point during their
19877 lifetime. Splitting it into two halves has the effect of making
19878 all predicates count against PR_LO_REGS, so that we try whenever
19879 possible to restrict the number of live predicates to 8. This
19880 greatly reduces the amount of spilling in certain loops. */
19881 classes[i++] = PR_LO_REGS;
19882 classes[i++] = PR_HI_REGS;
19883 return i;
19884 }
19885
19886 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19887
19888 static bool
19889 aarch64_can_change_mode_class (machine_mode from,
19890 machine_mode to, reg_class_t)
19891 {
19892 if (BYTES_BIG_ENDIAN)
19893 {
19894 bool from_sve_p = aarch64_sve_data_mode_p (from);
19895 bool to_sve_p = aarch64_sve_data_mode_p (to);
19896
19897 /* Don't allow changes between SVE data modes and non-SVE modes.
19898 See the comment at the head of aarch64-sve.md for details. */
19899 if (from_sve_p != to_sve_p)
19900 return false;
19901
19902 /* Don't allow changes in element size: lane 0 of the new vector
19903 would not then be lane 0 of the old vector. See the comment
19904 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19905 description.
19906
19907 In the worst case, this forces a register to be spilled in
19908 one mode and reloaded in the other, which handles the
19909 endianness correctly. */
19910 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19911 return false;
19912 }
19913 return true;
19914 }
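
/* For example, on big-endian targets this rejects a change from VNx4SI to
   VNx8HI (both SVE data modes, but 4-byte versus 2-byte elements), whereas
   VNx4SI to VNx4SF (same 4-byte element size) remains allowed.  */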
19915
19916 /* Implement TARGET_EARLY_REMAT_MODES. */
19917
19918 static void
19919 aarch64_select_early_remat_modes (sbitmap modes)
19920 {
19921 /* SVE values are not normally live across a call, so it should be
19922 worth doing early rematerialization even in VL-specific mode. */
19923 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19924 {
19925 machine_mode mode = (machine_mode) i;
19926 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19927 if (vec_flags & VEC_ANY_SVE)
19928 bitmap_set_bit (modes, i);
19929 }
19930 }
19931
19932 /* Override the default target speculation_safe_value. */
19933 static rtx
19934 aarch64_speculation_safe_value (machine_mode mode,
19935 rtx result, rtx val, rtx failval)
19936 {
19937 /* Maybe we should warn if falling back to hard barriers. They are
19938 likely to be noticeably more expensive than the alternative below. */
19939 if (!aarch64_track_speculation)
19940 return default_speculation_safe_value (mode, result, val, failval);
19941
19942 if (!REG_P (val))
19943 val = copy_to_mode_reg (mode, val);
19944
19945 if (!aarch64_reg_or_zero (failval, mode))
19946 failval = copy_to_mode_reg (mode, failval);
19947
19948 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19949 return result;
19950 }
19951
19952 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19953 Look into the tuning structure for an estimate.
19954 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19955 Advanced SIMD 128 bits. */
19956
19957 static HOST_WIDE_INT
19958 aarch64_estimated_poly_value (poly_int64 val)
19959 {
19960 enum aarch64_sve_vector_bits_enum width_source
19961 = aarch64_tune_params.sve_width;
19962
19963 /* If we still don't have an estimate, use the default. */
19964 if (width_source == SVE_SCALABLE)
19965 return default_estimated_poly_value (val);
19966
19967 HOST_WIDE_INT over_128 = width_source - 128;
19968 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19969 }
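
/* As a worked example, if aarch64_tune_params.sve_width indicates 256 bits,
   a poly_int64 of 16 + 16x (e.g. the byte size of an SVE vector) is
   estimated as 16 + 16 * (256 - 128) / 128 == 32.  */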
19970
19971
19972 /* Return true for types that could be supported as SIMD return or
19973 argument types. */
19974
19975 static bool
19976 supported_simd_type (tree t)
19977 {
19978 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19979 {
19980 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19981 return s == 1 || s == 2 || s == 4 || s == 8;
19982 }
19983 return false;
19984 }
19985
19986 /* Return true for types that currently are supported as SIMD return
19987 or argument types. */
19988
19989 static bool
19990 currently_supported_simd_type (tree t, tree b)
19991 {
19992 if (COMPLEX_FLOAT_TYPE_P (t))
19993 return false;
19994
19995 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19996 return false;
19997
19998 return supported_simd_type (t);
19999 }
20000
20001 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
20002
20003 static int
20004 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
20005 struct cgraph_simd_clone *clonei,
20006 tree base_type, int num)
20007 {
20008 tree t, ret_type, arg_type;
20009 unsigned int elt_bits, vec_bits, count;
20010
20011 if (!TARGET_SIMD)
20012 return 0;
20013
20014 if (clonei->simdlen
20015 && (clonei->simdlen < 2
20016 || clonei->simdlen > 1024
20017 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
20018 {
20019 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20020 "unsupported simdlen %d", clonei->simdlen);
20021 return 0;
20022 }
20023
20024 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
20025 if (TREE_CODE (ret_type) != VOID_TYPE
20026 && !currently_supported_simd_type (ret_type, base_type))
20027 {
20028 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
20029 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20030 "GCC does not currently support mixed size types "
20031 "for %<simd%> functions");
20032 else if (supported_simd_type (ret_type))
20033 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20034 "GCC does not currently support return type %qT "
20035 "for %<simd%> functions", ret_type);
20036 else
20037 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20038 "unsupported return type %qT for %<simd%> functions",
20039 ret_type);
20040 return 0;
20041 }
20042
20043 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
20044 {
20045 arg_type = TREE_TYPE (t);
20046
20047 if (!currently_supported_simd_type (arg_type, base_type))
20048 {
20049 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
20050 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20051 "GCC does not currently support mixed size types "
20052 "for %<simd%> functions");
20053 else
20054 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20055 "GCC does not currently support argument type %qT "
20056 "for %<simd%> functions", arg_type);
20057 return 0;
20058 }
20059 }
20060
20061 clonei->vecsize_mangle = 'n';
20062 clonei->mask_mode = VOIDmode;
20063 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
20064 if (clonei->simdlen == 0)
20065 {
20066 count = 2;
20067 vec_bits = (num == 0 ? 64 : 128);
20068 clonei->simdlen = vec_bits / elt_bits;
20069 }
20070 else
20071 {
20072 count = 1;
20073 vec_bits = clonei->simdlen * elt_bits;
20074 if (vec_bits != 64 && vec_bits != 128)
20075 {
20076 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
20077 "GCC does not currently support simdlen %d for type %qT",
20078 clonei->simdlen, base_type);
20079 return 0;
20080 }
20081 }
20082 clonei->vecsize_int = vec_bits;
20083 clonei->vecsize_float = vec_bits;
20084 return count;
20085 }
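
/* As an illustration, for a function marked for SIMD cloning whose base
   type is float and which has no explicit simdlen, elt_bits is 32 and two
   clones are advertised: num == 0 gives a 64-bit clone with simdlen 2 and
   num == 1 gives a 128-bit clone with simdlen 4.  */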
20086
20087 /* Implement TARGET_SIMD_CLONE_ADJUST. */
20088
20089 static void
20090 aarch64_simd_clone_adjust (struct cgraph_node *node)
20091 {
20092 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
20093 use the correct ABI. */
20094
20095 tree t = TREE_TYPE (node->decl);
20096 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
20097 TYPE_ATTRIBUTES (t));
20098 }
20099
20100 /* Implement TARGET_SIMD_CLONE_USABLE. */
20101
20102 static int
20103 aarch64_simd_clone_usable (struct cgraph_node *node)
20104 {
20105 switch (node->simdclone->vecsize_mangle)
20106 {
20107 case 'n':
20108 if (!TARGET_SIMD)
20109 return -1;
20110 return 0;
20111 default:
20112 gcc_unreachable ();
20113 }
20114 }
20115
20116 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
20117
20118 static int
20119 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
20120 {
20121 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
20122 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
20123 return 0;
20124 return 1;
20125 }
20126
20127 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
20128
20129 static const char *
20130 aarch64_get_multilib_abi_name (void)
20131 {
20132 if (TARGET_BIG_END)
20133 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
20134 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
20135 }
20136
20137 /* Implement TARGET_STACK_PROTECT_GUARD. If the guard is based on a
20138 global variable, use the default implementation; otherwise
20139 return a null tree. */
20140 static tree
20141 aarch64_stack_protect_guard (void)
20142 {
20143 if (aarch64_stack_protector_guard == SSP_GLOBAL)
20144 return default_stack_protect_guard ();
20145
20146 return NULL_TREE;
20147 }
20148
20149 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
20150 section at the end if needed. */
20151 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
20152 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
20153 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
20154 void
20155 aarch64_file_end_indicate_exec_stack ()
20156 {
20157 file_end_indicate_exec_stack ();
20158
20159 unsigned feature_1_and = 0;
20160 if (aarch64_bti_enabled ())
20161 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
20162
20163 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
20164 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
20165
20166 if (feature_1_and)
20167 {
20168 /* Generate .note.gnu.property section. */
20169 switch_to_section (get_section (".note.gnu.property",
20170 SECTION_NOTYPE, NULL));
20171
20172 /* PT_NOTE header: namesz, descsz, type.
20173 namesz = 4 ("GNU\0")
20174 descsz = 16 (Size of the program property array)
20175 [(12 + padding) * Number of array elements]
20176 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
20177 assemble_align (POINTER_SIZE);
20178 assemble_integer (GEN_INT (4), 4, 32, 1);
20179 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
20180 assemble_integer (GEN_INT (5), 4, 32, 1);
20181
20182 /* PT_NOTE name. */
20183 assemble_string ("GNU", 4);
20184
20185 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
20186 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
20187 datasz = 4
20188 data = feature_1_and. */
20189 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
20190 assemble_integer (GEN_INT (4), 4, 32, 1);
20191 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
20192
20193 /* Pad the size of the note to the required alignment. */
20194 assemble_align (POINTER_SIZE);
20195 }
20196 }
20197 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
20198 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
20199 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
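
/* For reference, on an LP64 target with both BTI and PAC enabled the note
   emitted above amounts to roughly the following (directives shown only
   approximately):

     .section  .note.gnu.property
     .align    3
     .word     4                  // namesz ("GNU\0")
     .word     16                 // descsz, ROUND_UP (12, 8)
     .word     5                  // NT_GNU_PROPERTY_TYPE_0
     .string   "GNU"
     .word     0xc0000000         // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word     4                  // datasz
     .word     3                  // BTI | PAC
     .align    3  */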
20200
20201 /* Target-specific selftests. */
20202
20203 #if CHECKING_P
20204
20205 namespace selftest {
20206
20207 /* Selftest for the RTL loader.
20208 Verify that the RTL loader copes with a dump from
20209 print_rtx_function. This is essentially just a test that class
20210 function_reader can handle a real dump, but it also verifies
20211 that lookup_reg_by_dump_name correctly handles hard regs.
20212 The presence of hard reg names in the dump means that the test is
20213 target-specific, hence it is in this file. */
20214
20215 static void
20216 aarch64_test_loading_full_dump ()
20217 {
20218 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
20219
20220 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
20221
20222 rtx_insn *insn_1 = get_insn_by_uid (1);
20223 ASSERT_EQ (NOTE, GET_CODE (insn_1));
20224
20225 rtx_insn *insn_15 = get_insn_by_uid (15);
20226 ASSERT_EQ (INSN, GET_CODE (insn_15));
20227 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
20228
20229 /* Verify crtl->return_rtx. */
20230 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
20231 ASSERT_EQ (0, REGNO (crtl->return_rtx));
20232 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
20233 }
20234
20235 /* Run all target-specific selftests. */
20236
20237 static void
20238 aarch64_run_selftests (void)
20239 {
20240 aarch64_test_loading_full_dump ();
20241 }
20242
20243 } // namespace selftest
20244
20245 #endif /* #if CHECKING_P */
20246
20247 #undef TARGET_STACK_PROTECT_GUARD
20248 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
20249
20250 #undef TARGET_ADDRESS_COST
20251 #define TARGET_ADDRESS_COST aarch64_address_cost
20252
20253 /* This hook determines whether unnamed bitfields affect the alignment
20254 of the containing structure. The hook returns true if the structure
20255 should inherit the alignment requirements of an unnamed bitfield's
20256 type. */
20257 #undef TARGET_ALIGN_ANON_BITFIELD
20258 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
20259
20260 #undef TARGET_ASM_ALIGNED_DI_OP
20261 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
20262
20263 #undef TARGET_ASM_ALIGNED_HI_OP
20264 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
20265
20266 #undef TARGET_ASM_ALIGNED_SI_OP
20267 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
20268
20269 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
20270 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
20271 hook_bool_const_tree_hwi_hwi_const_tree_true
20272
20273 #undef TARGET_ASM_FILE_START
20274 #define TARGET_ASM_FILE_START aarch64_start_file
20275
20276 #undef TARGET_ASM_OUTPUT_MI_THUNK
20277 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
20278
20279 #undef TARGET_ASM_SELECT_RTX_SECTION
20280 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
20281
20282 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
20283 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
20284
20285 #undef TARGET_BUILD_BUILTIN_VA_LIST
20286 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
20287
20288 #undef TARGET_CALLEE_COPIES
20289 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
20290
20291 #undef TARGET_CAN_ELIMINATE
20292 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
20293
20294 #undef TARGET_CAN_INLINE_P
20295 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
20296
20297 #undef TARGET_CANNOT_FORCE_CONST_MEM
20298 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
20299
20300 #undef TARGET_CASE_VALUES_THRESHOLD
20301 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
20302
20303 #undef TARGET_CONDITIONAL_REGISTER_USAGE
20304 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
20305
20306 /* Only the least significant bit is used for initialization guard
20307 variables. */
20308 #undef TARGET_CXX_GUARD_MASK_BIT
20309 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
20310
20311 #undef TARGET_C_MODE_FOR_SUFFIX
20312 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
20313
20314 #ifdef TARGET_BIG_ENDIAN_DEFAULT
20315 #undef TARGET_DEFAULT_TARGET_FLAGS
20316 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20317 #endif
20318
20319 #undef TARGET_CLASS_MAX_NREGS
20320 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20321
20322 #undef TARGET_BUILTIN_DECL
20323 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20324
20325 #undef TARGET_BUILTIN_RECIPROCAL
20326 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20327
20328 #undef TARGET_C_EXCESS_PRECISION
20329 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20330
20331 #undef TARGET_EXPAND_BUILTIN
20332 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20333
20334 #undef TARGET_EXPAND_BUILTIN_VA_START
20335 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20336
20337 #undef TARGET_FOLD_BUILTIN
20338 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20339
20340 #undef TARGET_FUNCTION_ARG
20341 #define TARGET_FUNCTION_ARG aarch64_function_arg
20342
20343 #undef TARGET_FUNCTION_ARG_ADVANCE
20344 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20345
20346 #undef TARGET_FUNCTION_ARG_BOUNDARY
20347 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20348
20349 #undef TARGET_FUNCTION_ARG_PADDING
20350 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20351
20352 #undef TARGET_GET_RAW_RESULT_MODE
20353 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20354 #undef TARGET_GET_RAW_ARG_MODE
20355 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20356
20357 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20358 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20359
20360 #undef TARGET_FUNCTION_VALUE
20361 #define TARGET_FUNCTION_VALUE aarch64_function_value
20362
20363 #undef TARGET_FUNCTION_VALUE_REGNO_P
20364 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20365
20366 #undef TARGET_GIMPLE_FOLD_BUILTIN
20367 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20368
20369 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20370 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20371
20372 #undef TARGET_INIT_BUILTINS
20373 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20374
20375 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20376 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20377 aarch64_ira_change_pseudo_allocno_class
20378
20379 #undef TARGET_LEGITIMATE_ADDRESS_P
20380 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20381
20382 #undef TARGET_LEGITIMATE_CONSTANT_P
20383 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20384
20385 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20386 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20387 aarch64_legitimize_address_displacement
20388
20389 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20390 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20391
20392 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20393 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20394 aarch64_libgcc_floating_mode_supported_p
20395
20396 #undef TARGET_MANGLE_TYPE
20397 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20398
20399 #undef TARGET_MEMORY_MOVE_COST
20400 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20401
20402 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20403 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20404
20405 #undef TARGET_MUST_PASS_IN_STACK
20406 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20407
20408 /* This target hook should return true if accesses to volatile bitfields
20409 should use the narrowest mode possible. It should return false if these
20410 accesses should use the bitfield container type. */
20411 #undef TARGET_NARROW_VOLATILE_BITFIELD
20412 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20413
20414 #undef TARGET_OPTION_OVERRIDE
20415 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20416
20417 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20418 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20419 aarch64_override_options_after_change
20420
20421 #undef TARGET_OPTION_SAVE
20422 #define TARGET_OPTION_SAVE aarch64_option_save
20423
20424 #undef TARGET_OPTION_RESTORE
20425 #define TARGET_OPTION_RESTORE aarch64_option_restore
20426
20427 #undef TARGET_OPTION_PRINT
20428 #define TARGET_OPTION_PRINT aarch64_option_print
20429
20430 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20431 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20432
20433 #undef TARGET_SET_CURRENT_FUNCTION
20434 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20435
20436 #undef TARGET_PASS_BY_REFERENCE
20437 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20438
20439 #undef TARGET_PREFERRED_RELOAD_CLASS
20440 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20441
20442 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20443 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20444
20445 #undef TARGET_PROMOTED_TYPE
20446 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20447
20448 #undef TARGET_SECONDARY_RELOAD
20449 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20450
20451 #undef TARGET_SHIFT_TRUNCATION_MASK
20452 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20453
20454 #undef TARGET_SETUP_INCOMING_VARARGS
20455 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20456
20457 #undef TARGET_STRUCT_VALUE_RTX
20458 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20459
20460 #undef TARGET_REGISTER_MOVE_COST
20461 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20462
20463 #undef TARGET_RETURN_IN_MEMORY
20464 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20465
20466 #undef TARGET_RETURN_IN_MSB
20467 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20468
20469 #undef TARGET_RTX_COSTS
20470 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20471
20472 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20473 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20474
20475 #undef TARGET_SCHED_ISSUE_RATE
20476 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20477
20478 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20479 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20480 aarch64_sched_first_cycle_multipass_dfa_lookahead
20481
20482 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20483 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20484 aarch64_first_cycle_multipass_dfa_lookahead_guard
20485
20486 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20487 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20488 aarch64_get_separate_components
20489
20490 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20491 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20492 aarch64_components_for_bb
20493
20494 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20495 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20496 aarch64_disqualify_components
20497
20498 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20499 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20500 aarch64_emit_prologue_components
20501
20502 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20503 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20504 aarch64_emit_epilogue_components
20505
20506 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20507 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20508 aarch64_set_handled_components
20509
20510 #undef TARGET_TRAMPOLINE_INIT
20511 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20512
20513 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20514 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20515
20516 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20517 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20518
20519 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20520 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20521 aarch64_builtin_support_vector_misalignment
20522
20523 #undef TARGET_ARRAY_MODE
20524 #define TARGET_ARRAY_MODE aarch64_array_mode
20525
20526 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20527 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20528
20529 #undef TARGET_VECTORIZE_ADD_STMT_COST
20530 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20531
20532 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20533 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20534 aarch64_builtin_vectorization_cost
20535
20536 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20537 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20538
20539 #undef TARGET_VECTORIZE_BUILTINS
20540 #define TARGET_VECTORIZE_BUILTINS
20541
20542 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20543 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20544 aarch64_builtin_vectorized_function
20545
20546 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20547 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20548 aarch64_autovectorize_vector_sizes
20549
20550 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20551 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20552 aarch64_atomic_assign_expand_fenv
20553
20554 /* Section anchor support. */
20555
20556 #undef TARGET_MIN_ANCHOR_OFFSET
20557 #define TARGET_MIN_ANCHOR_OFFSET -256
20558
20559 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20560 byte offset; we can do much more for larger data types, but have no way
20561 to determine the size of the access. We assume accesses are aligned. */
20562 #undef TARGET_MAX_ANCHOR_OFFSET
20563 #define TARGET_MAX_ANCHOR_OFFSET 4095
20564
20565 #undef TARGET_VECTOR_ALIGNMENT
20566 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20567
20568 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20569 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20570 aarch64_vectorize_preferred_vector_alignment
20571 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20572 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20573 aarch64_simd_vector_alignment_reachable
20574
20575 /* vec_perm support. */
20576
20577 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20578 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20579 aarch64_vectorize_vec_perm_const
20580
20581 #undef TARGET_VECTORIZE_GET_MASK_MODE
20582 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20583 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20584 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20585 aarch64_empty_mask_is_expensive
20586 #undef TARGET_PREFERRED_ELSE_VALUE
20587 #define TARGET_PREFERRED_ELSE_VALUE \
20588 aarch64_preferred_else_value
20589
20590 #undef TARGET_INIT_LIBFUNCS
20591 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20592
20593 #undef TARGET_FIXED_CONDITION_CODE_REGS
20594 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20595
20596 #undef TARGET_FLAGS_REGNUM
20597 #define TARGET_FLAGS_REGNUM CC_REGNUM
20598
20599 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20600 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20601
20602 #undef TARGET_ASAN_SHADOW_OFFSET
20603 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20604
20605 #undef TARGET_LEGITIMIZE_ADDRESS
20606 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20607
20608 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20609 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20610
20611 #undef TARGET_CAN_USE_DOLOOP_P
20612 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20613
20614 #undef TARGET_SCHED_ADJUST_PRIORITY
20615 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20616
20617 #undef TARGET_SCHED_MACRO_FUSION_P
20618 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20619
20620 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20621 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20622
20623 #undef TARGET_SCHED_FUSION_PRIORITY
20624 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20625
20626 #undef TARGET_UNSPEC_MAY_TRAP_P
20627 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20628
20629 #undef TARGET_USE_PSEUDO_PIC_REG
20630 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20631
20632 #undef TARGET_PRINT_OPERAND
20633 #define TARGET_PRINT_OPERAND aarch64_print_operand
20634
20635 #undef TARGET_PRINT_OPERAND_ADDRESS
20636 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20637
20638 #undef TARGET_OPTAB_SUPPORTED_P
20639 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20640
20641 #undef TARGET_OMIT_STRUCT_RETURN_REG
20642 #define TARGET_OMIT_STRUCT_RETURN_REG true
20643
20644 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20645 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20646 aarch64_dwarf_poly_indeterminate_value
20647
20648 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
20649 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20650 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20651
20652 #undef TARGET_HARD_REGNO_NREGS
20653 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20654 #undef TARGET_HARD_REGNO_MODE_OK
20655 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20656
20657 #undef TARGET_MODES_TIEABLE_P
20658 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20659
20660 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20661 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20662 aarch64_hard_regno_call_part_clobbered
20663
20664 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20665 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20666 aarch64_remove_extra_call_preserved_regs
20667
20668 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20669 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20670 aarch64_return_call_with_max_clobbers
20671
20672 #undef TARGET_CONSTANT_ALIGNMENT
20673 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20674
20675 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20676 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20677 aarch64_stack_clash_protection_alloca_probe_range
20678
20679 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20680 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20681
20682 #undef TARGET_CAN_CHANGE_MODE_CLASS
20683 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20684
20685 #undef TARGET_SELECT_EARLY_REMAT_MODES
20686 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20687
20688 #undef TARGET_SPECULATION_SAFE_VALUE
20689 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20690
20691 #undef TARGET_ESTIMATED_POLY_VALUE
20692 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20693
20694 #undef TARGET_ATTRIBUTE_TABLE
20695 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20696
20697 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20698 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20699 aarch64_simd_clone_compute_vecsize_and_simdlen
20700
20701 #undef TARGET_SIMD_CLONE_ADJUST
20702 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20703
20704 #undef TARGET_SIMD_CLONE_USABLE
20705 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20706
20707 #undef TARGET_COMP_TYPE_ATTRIBUTES
20708 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20709
20710 #undef TARGET_GET_MULTILIB_ABI_NAME
20711 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20712
20713 #if CHECKING_P
20714 #undef TARGET_RUN_TARGET_SELFTESTS
20715 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20716 #endif /* #if CHECKING_P */
20717
20718 #undef TARGET_ASM_POST_CFI_STARTPROC
20719 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20720
20721 struct gcc_target targetm = TARGET_INITIALIZER;
20722
20723 #include "gt-aarch64.h"