1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2019 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76
77 /* This file should be included last. */
78 #include "target-def.h"
79
80 /* Defined for convenience. */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82
83 /* Information about a legitimate vector immediate operand. */
84 struct simd_immediate_info
85 {
86 enum insn_type { MOV, MVN, INDEX, PTRUE };
87 enum modifier_type { LSL, MSL };
88
89 simd_immediate_info () {}
90 simd_immediate_info (scalar_float_mode, rtx);
91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 insn_type = MOV, modifier_type = LSL,
93 unsigned int = 0);
94 simd_immediate_info (scalar_mode, rtx, rtx);
95 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
96
97 /* The mode of the elements. */
98 scalar_mode elt_mode;
99
100 /* The instruction to use to move the immediate into a vector. */
101 insn_type insn;
102
103 union
104 {
105 /* For MOV and MVN. */
106 struct
107 {
108 /* The value of each element. */
109 rtx value;
110
111 /* The kind of shift modifier to use, and the number of bits to shift.
112 This is (LSL, 0) if no shift is needed. */
113 modifier_type modifier;
114 unsigned int shift;
115 } mov;
116
117 /* For INDEX. */
118 struct
119 {
120 /* The value of the first element and the step to be added for each
121 subsequent element. */
122 rtx base, step;
123 } index;
124
125 /* For PTRUE. */
126 aarch64_svpattern pattern;
127 } u;
128 };
129
130 /* Construct a floating-point immediate in which each element has mode
131 ELT_MODE_IN and value VALUE_IN. */
132 inline simd_immediate_info
133 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
134 : elt_mode (elt_mode_in), insn (MOV)
135 {
136 u.mov.value = value_in;
137 u.mov.modifier = LSL;
138 u.mov.shift = 0;
139 }
140
141 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
142 and value VALUE_IN. The other parameters are as for the structure
143 fields. */
144 inline simd_immediate_info
145 ::simd_immediate_info (scalar_int_mode elt_mode_in,
146 unsigned HOST_WIDE_INT value_in,
147 insn_type insn_in, modifier_type modifier_in,
148 unsigned int shift_in)
149 : elt_mode (elt_mode_in), insn (insn_in)
150 {
151 u.mov.value = gen_int_mode (value_in, elt_mode_in);
152 u.mov.modifier = modifier_in;
153 u.mov.shift = shift_in;
154 }
155
156 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
157 and where element I is equal to BASE_IN + I * STEP_IN. */
158 inline simd_immediate_info
159 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
160 : elt_mode (elt_mode_in), insn (INDEX)
161 {
162 u.index.base = base_in;
163 u.index.step = step_in;
164 }
165
166 /* Construct a predicate that controls elements of mode ELT_MODE_IN
167 and has PTRUE pattern PATTERN_IN. */
168 inline simd_immediate_info
169 ::simd_immediate_info (scalar_int_mode elt_mode_in,
170 aarch64_svpattern pattern_in)
171 : elt_mode (elt_mode_in), insn (PTRUE)
172 {
173 u.pattern = pattern_in;
174 }
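/* Editor's sketch (not part of the original source): a typical use of the
   integer constructor above, with an arbitrary example value.  Writing

     simd_immediate_info info (QImode, 0x3f);

   relies on the default arguments and is equivalent to the fully explicit

     simd_immediate_info info (QImode, 0x3f, simd_immediate_info::MOV,
                               simd_immediate_info::LSL, 0);

   i.e. "move the value 0x3f into every QImode element, with no shift".  */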
175
176 /* The current code model. */
177 enum aarch64_code_model aarch64_cmodel;
178
179 /* The number of 64-bit elements in an SVE vector. */
180 poly_uint16 aarch64_sve_vg;
181
182 #ifdef HAVE_AS_TLS
183 #undef TARGET_HAVE_TLS
184 #define TARGET_HAVE_TLS 1
185 #endif
186
187 static bool aarch64_composite_type_p (const_tree, machine_mode);
188 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
189 const_tree,
190 machine_mode *, int *,
191 bool *);
192 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
193 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_override_options_after_change (void);
195 static bool aarch64_vector_mode_supported_p (machine_mode);
196 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
197 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
198 const_tree type,
199 int misalignment,
200 bool is_packed);
201 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
202 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
203 aarch64_addr_query_type);
204 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
205
206 /* Major revision number of the ARM Architecture implemented by the target. */
207 unsigned aarch64_architecture_version;
208
209 /* The processor for which instructions should be scheduled. */
210 enum aarch64_processor aarch64_tune = cortexa53;
211
212 /* Mask to specify which instruction scheduling options should be used. */
213 uint64_t aarch64_tune_flags = 0;
214
215 /* Global flag for PC relative loads. */
216 bool aarch64_pcrelative_literal_loads;
217
218 /* Global flag for whether frame pointer is enabled. */
219 bool aarch64_use_frame_pointer;
220
221 #define BRANCH_PROTECT_STR_MAX 255
222 char *accepted_branch_protection_string = NULL;
223
224 static enum aarch64_parse_opt_result
225 aarch64_parse_branch_protection (const char*, char**);
226
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
230 {
231 const char* name;
232 unsigned int flag;
233 };
234
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 {
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
243 };
244
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 {
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
253 };
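/* Editor's note (illustrative, not from the original source): the two tables
   above are built with the X-macro idiom.  Under the #define above, a
   hypothetical entry in aarch64-fusion-pairs.def along the lines of

     AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK)

   expands to the initializer

     { "mov+movk", AARCH64_FUSE_MOV_MOVK },

   so the .def file remains the single source of truth for the names accepted
   by the "fuse" tuning override parsed later in this file.  */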
254
255 /* Tuning parameters. */
256
257 static const struct cpu_addrcost_table generic_addrcost_table =
258 {
259 {
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
264 },
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
271 };
272
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
274 {
275 {
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
280 },
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
287 };
288
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
290 {
291 {
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
296 },
297 1, /* pre_modify */
298 1, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
303 };
304
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
306 {
307 {
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
312 },
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
319 };
320
321 static const struct cpu_addrcost_table tsv110_addrcost_table =
322 {
323 {
324 1, /* hi */
325 0, /* si */
326 0, /* di */
327 1, /* ti */
328 },
329 0, /* pre_modify */
330 0, /* post_modify */
331 0, /* register_offset */
332 1, /* register_sextend */
333 1, /* register_zextend */
334 0, /* imm_offset */
335 };
336
337 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
338 {
339 {
340 1, /* hi */
341 1, /* si */
342 1, /* di */
343 2, /* ti */
344 },
345 1, /* pre_modify */
346 1, /* post_modify */
347 3, /* register_offset */
348 3, /* register_sextend */
349 3, /* register_zextend */
350 2, /* imm_offset */
351 };
352
353 static const struct cpu_regmove_cost generic_regmove_cost =
354 {
355 1, /* GP2GP */
356 /* Avoid the use of slow int<->fp moves for spilling by setting
357 their cost higher than memmov_cost. */
358 5, /* GP2FP */
359 5, /* FP2GP */
360 2 /* FP2FP */
361 };
362
363 static const struct cpu_regmove_cost cortexa57_regmove_cost =
364 {
365 1, /* GP2GP */
366 /* Avoid the use of slow int<->fp moves for spilling by setting
367 their cost higher than memmov_cost. */
368 5, /* GP2FP */
369 5, /* FP2GP */
370 2 /* FP2FP */
371 };
372
373 static const struct cpu_regmove_cost cortexa53_regmove_cost =
374 {
375 1, /* GP2GP */
376 /* Avoid the use of slow int<->fp moves for spilling by setting
377 their cost higher than memmov_cost. */
378 5, /* GP2FP */
379 5, /* FP2GP */
380 2 /* FP2FP */
381 };
382
383 static const struct cpu_regmove_cost exynosm1_regmove_cost =
384 {
385 1, /* GP2GP */
386 /* Avoid the use of slow int<->fp moves for spilling by setting
387 their cost higher than memmov_cost (actual, 4 and 9). */
388 9, /* GP2FP */
389 9, /* FP2GP */
390 1 /* FP2FP */
391 };
392
393 static const struct cpu_regmove_cost thunderx_regmove_cost =
394 {
395 2, /* GP2GP */
396 2, /* GP2FP */
397 6, /* FP2GP */
398 4 /* FP2FP */
399 };
400
401 static const struct cpu_regmove_cost xgene1_regmove_cost =
402 {
403 1, /* GP2GP */
404 /* Avoid the use of slow int<->fp moves for spilling by setting
405 their cost higher than memmov_cost. */
406 8, /* GP2FP */
407 8, /* FP2GP */
408 2 /* FP2FP */
409 };
410
411 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
412 {
413 2, /* GP2GP */
414 /* Avoid the use of int<->fp moves for spilling. */
415 6, /* GP2FP */
416 6, /* FP2GP */
417 4 /* FP2FP */
418 };
419
420 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
421 {
422 1, /* GP2GP */
423 /* Avoid the use of int<->fp moves for spilling. */
424 8, /* GP2FP */
425 8, /* FP2GP */
426 4 /* FP2FP */
427 };
428
429 static const struct cpu_regmove_cost tsv110_regmove_cost =
430 {
431 1, /* GP2GP */
432 /* Avoid the use of slow int<->fp moves for spilling by setting
433 their cost higher than memmov_cost. */
434 2, /* GP2FP */
435 3, /* FP2GP */
436 2 /* FP2FP */
437 };
438
439 /* Generic costs for vector insn classes. */
440 static const struct cpu_vector_cost generic_vector_cost =
441 {
442 1, /* scalar_int_stmt_cost */
443 1, /* scalar_fp_stmt_cost */
444 1, /* scalar_load_cost */
445 1, /* scalar_store_cost */
446 1, /* vec_int_stmt_cost */
447 1, /* vec_fp_stmt_cost */
448 2, /* vec_permute_cost */
449 1, /* vec_to_scalar_cost */
450 1, /* scalar_to_vec_cost */
451 1, /* vec_align_load_cost */
452 1, /* vec_unalign_load_cost */
453 1, /* vec_unalign_store_cost */
454 1, /* vec_store_cost */
455 3, /* cond_taken_branch_cost */
456 1 /* cond_not_taken_branch_cost */
457 };
458
459 /* QDF24XX costs for vector insn classes. */
460 static const struct cpu_vector_cost qdf24xx_vector_cost =
461 {
462 1, /* scalar_int_stmt_cost */
463 1, /* scalar_fp_stmt_cost */
464 1, /* scalar_load_cost */
465 1, /* scalar_store_cost */
466 1, /* vec_int_stmt_cost */
467 3, /* vec_fp_stmt_cost */
468 2, /* vec_permute_cost */
469 1, /* vec_to_scalar_cost */
470 1, /* scalar_to_vec_cost */
471 1, /* vec_align_load_cost */
472 1, /* vec_unalign_load_cost */
473 1, /* vec_unalign_store_cost */
474 1, /* vec_store_cost */
475 3, /* cond_taken_branch_cost */
476 1 /* cond_not_taken_branch_cost */
477 };
478
479 /* ThunderX costs for vector insn classes. */
480 static const struct cpu_vector_cost thunderx_vector_cost =
481 {
482 1, /* scalar_int_stmt_cost */
483 1, /* scalar_fp_stmt_cost */
484 3, /* scalar_load_cost */
485 1, /* scalar_store_cost */
486 4, /* vec_int_stmt_cost */
487 1, /* vec_fp_stmt_cost */
488 4, /* vec_permute_cost */
489 2, /* vec_to_scalar_cost */
490 2, /* scalar_to_vec_cost */
491 3, /* vec_align_load_cost */
492 5, /* vec_unalign_load_cost */
493 5, /* vec_unalign_store_cost */
494 1, /* vec_store_cost */
495 3, /* cond_taken_branch_cost */
496 3 /* cond_not_taken_branch_cost */
497 };
498
499 static const struct cpu_vector_cost tsv110_vector_cost =
500 {
501 1, /* scalar_int_stmt_cost */
502 1, /* scalar_fp_stmt_cost */
503 5, /* scalar_load_cost */
504 1, /* scalar_store_cost */
505 2, /* vec_int_stmt_cost */
506 2, /* vec_fp_stmt_cost */
507 2, /* vec_permute_cost */
508 3, /* vec_to_scalar_cost */
509 2, /* scalar_to_vec_cost */
510 5, /* vec_align_load_cost */
511 5, /* vec_unalign_load_cost */
512 1, /* vec_unalign_store_cost */
513 1, /* vec_store_cost */
514 1, /* cond_taken_branch_cost */
515 1 /* cond_not_taken_branch_cost */
516 };
517
518 /* Costs for vector insn classes for Cortex-A57. */
519 static const struct cpu_vector_cost cortexa57_vector_cost =
520 {
521 1, /* scalar_int_stmt_cost */
522 1, /* scalar_fp_stmt_cost */
523 4, /* scalar_load_cost */
524 1, /* scalar_store_cost */
525 2, /* vec_int_stmt_cost */
526 2, /* vec_fp_stmt_cost */
527 3, /* vec_permute_cost */
528 8, /* vec_to_scalar_cost */
529 8, /* scalar_to_vec_cost */
530 4, /* vec_align_load_cost */
531 4, /* vec_unalign_load_cost */
532 1, /* vec_unalign_store_cost */
533 1, /* vec_store_cost */
534 1, /* cond_taken_branch_cost */
535 1 /* cond_not_taken_branch_cost */
536 };
537
538 static const struct cpu_vector_cost exynosm1_vector_cost =
539 {
540 1, /* scalar_int_stmt_cost */
541 1, /* scalar_fp_stmt_cost */
542 5, /* scalar_load_cost */
543 1, /* scalar_store_cost */
544 3, /* vec_int_stmt_cost */
545 3, /* vec_fp_stmt_cost */
546 3, /* vec_permute_cost */
547 3, /* vec_to_scalar_cost */
548 3, /* scalar_to_vec_cost */
549 5, /* vec_align_load_cost */
550 5, /* vec_unalign_load_cost */
551 1, /* vec_unalign_store_cost */
552 1, /* vec_store_cost */
553 1, /* cond_taken_branch_cost */
554 1 /* cond_not_taken_branch_cost */
555 };
556
557 /* Costs for vector insn classes for X-Gene 1. */
558 static const struct cpu_vector_cost xgene1_vector_cost =
559 {
560 1, /* scalar_int_stmt_cost */
561 1, /* scalar_fp_stmt_cost */
562 5, /* scalar_load_cost */
563 1, /* scalar_store_cost */
564 2, /* vec_int_stmt_cost */
565 2, /* vec_fp_stmt_cost */
566 2, /* vec_permute_cost */
567 4, /* vec_to_scalar_cost */
568 4, /* scalar_to_vec_cost */
569 10, /* vec_align_load_cost */
570 10, /* vec_unalign_load_cost */
571 2, /* vec_unalign_store_cost */
572 2, /* vec_store_cost */
573 2, /* cond_taken_branch_cost */
574 1 /* cond_not_taken_branch_cost */
575 };
576
577 /* Costs for vector insn classes for Vulcan. */
578 static const struct cpu_vector_cost thunderx2t99_vector_cost =
579 {
580 1, /* scalar_int_stmt_cost */
581 6, /* scalar_fp_stmt_cost */
582 4, /* scalar_load_cost */
583 1, /* scalar_store_cost */
584 5, /* vec_int_stmt_cost */
585 6, /* vec_fp_stmt_cost */
586 3, /* vec_permute_cost */
587 6, /* vec_to_scalar_cost */
588 5, /* scalar_to_vec_cost */
589 8, /* vec_align_load_cost */
590 8, /* vec_unalign_load_cost */
591 4, /* vec_unalign_store_cost */
592 4, /* vec_store_cost */
593 2, /* cond_taken_branch_cost */
594 1 /* cond_not_taken_branch_cost */
595 };
596
597 /* Generic costs for branch instructions. */
598 static const struct cpu_branch_cost generic_branch_cost =
599 {
600 1, /* Predictable. */
601 3 /* Unpredictable. */
602 };
603
604 /* Generic approximation modes. */
605 static const cpu_approx_modes generic_approx_modes =
606 {
607 AARCH64_APPROX_NONE, /* division */
608 AARCH64_APPROX_NONE, /* sqrt */
609 AARCH64_APPROX_NONE /* recip_sqrt */
610 };
611
612 /* Approximation modes for Exynos M1. */
613 static const cpu_approx_modes exynosm1_approx_modes =
614 {
615 AARCH64_APPROX_NONE, /* division */
616 AARCH64_APPROX_ALL, /* sqrt */
617 AARCH64_APPROX_ALL /* recip_sqrt */
618 };
619
620 /* Approximation modes for X-Gene 1. */
621 static const cpu_approx_modes xgene1_approx_modes =
622 {
623 AARCH64_APPROX_NONE, /* division */
624 AARCH64_APPROX_NONE, /* sqrt */
625 AARCH64_APPROX_ALL /* recip_sqrt */
626 };
627
628 /* Generic prefetch settings (which disable prefetch). */
629 static const cpu_prefetch_tune generic_prefetch_tune =
630 {
631 0, /* num_slots */
632 -1, /* l1_cache_size */
633 -1, /* l1_cache_line_size */
634 -1, /* l2_cache_size */
635 true, /* prefetch_dynamic_strides */
636 -1, /* minimum_stride */
637 -1 /* default_opt_level */
638 };
639
640 static const cpu_prefetch_tune exynosm1_prefetch_tune =
641 {
642 0, /* num_slots */
643 -1, /* l1_cache_size */
644 64, /* l1_cache_line_size */
645 -1, /* l2_cache_size */
646 true, /* prefetch_dynamic_strides */
647 -1, /* minimum_stride */
648 -1 /* default_opt_level */
649 };
650
651 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
652 {
653 4, /* num_slots */
654 32, /* l1_cache_size */
655 64, /* l1_cache_line_size */
656 512, /* l2_cache_size */
657 false, /* prefetch_dynamic_strides */
658 2048, /* minimum_stride */
659 3 /* default_opt_level */
660 };
661
662 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
663 {
664 8, /* num_slots */
665 32, /* l1_cache_size */
666 128, /* l1_cache_line_size */
667 16*1024, /* l2_cache_size */
668 true, /* prefetch_dynamic_strides */
669 -1, /* minimum_stride */
670 3 /* default_opt_level */
671 };
672
673 static const cpu_prefetch_tune thunderx_prefetch_tune =
674 {
675 8, /* num_slots */
676 32, /* l1_cache_size */
677 128, /* l1_cache_line_size */
678 -1, /* l2_cache_size */
679 true, /* prefetch_dynamic_strides */
680 -1, /* minimum_stride */
681 -1 /* default_opt_level */
682 };
683
684 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
685 {
686 8, /* num_slots */
687 32, /* l1_cache_size */
688 64, /* l1_cache_line_size */
689 256, /* l2_cache_size */
690 true, /* prefetch_dynamic_strides */
691 -1, /* minimum_stride */
692 -1 /* default_opt_level */
693 };
694
695 static const cpu_prefetch_tune tsv110_prefetch_tune =
696 {
697 0, /* num_slots */
698 64, /* l1_cache_size */
699 64, /* l1_cache_line_size */
700 512, /* l2_cache_size */
701 true, /* prefetch_dynamic_strides */
702 -1, /* minimum_stride */
703 -1 /* default_opt_level */
704 };
705
706 static const cpu_prefetch_tune xgene1_prefetch_tune =
707 {
708 8, /* num_slots */
709 32, /* l1_cache_size */
710 64, /* l1_cache_line_size */
711 256, /* l2_cache_size */
712 true, /* prefetch_dynamic_strides */
713 -1, /* minimum_stride */
714 -1 /* default_opt_level */
715 };
716
717 static const struct tune_params generic_tunings =
718 {
719 &cortexa57_extra_costs,
720 &generic_addrcost_table,
721 &generic_regmove_cost,
722 &generic_vector_cost,
723 &generic_branch_cost,
724 &generic_approx_modes,
725 SVE_NOT_IMPLEMENTED, /* sve_width */
726 4, /* memmov_cost */
727 2, /* issue_rate */
728 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
729 "16:12", /* function_align. */
730 "4", /* jump_align. */
731 "8", /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
739 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
740 &generic_prefetch_tune
741 };
742
743 static const struct tune_params cortexa35_tunings =
744 {
745 &cortexa53_extra_costs,
746 &generic_addrcost_table,
747 &cortexa53_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 SVE_NOT_IMPLEMENTED, /* sve_width */
752 4, /* memmov_cost */
753 1, /* issue_rate */
754 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
755 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
756 "16", /* function_align. */
757 "4", /* jump_align. */
758 "8", /* loop_align. */
759 2, /* int_reassoc_width. */
760 4, /* fp_reassoc_width. */
761 1, /* vec_reassoc_width. */
762 2, /* min_div_recip_mul_sf. */
763 2, /* min_div_recip_mul_df. */
764 0, /* max_case_values. */
765 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
767 &generic_prefetch_tune
768 };
769
770 static const struct tune_params cortexa53_tunings =
771 {
772 &cortexa53_extra_costs,
773 &generic_addrcost_table,
774 &cortexa53_regmove_cost,
775 &generic_vector_cost,
776 &generic_branch_cost,
777 &generic_approx_modes,
778 SVE_NOT_IMPLEMENTED, /* sve_width */
779 4, /* memmov_cost */
780 2, /* issue_rate */
781 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
782 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
783 "16", /* function_align. */
784 "4", /* jump_align. */
785 "8", /* loop_align. */
786 2, /* int_reassoc_width. */
787 4, /* fp_reassoc_width. */
788 1, /* vec_reassoc_width. */
789 2, /* min_div_recip_mul_sf. */
790 2, /* min_div_recip_mul_df. */
791 0, /* max_case_values. */
792 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
793 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
794 &generic_prefetch_tune
795 };
796
797 static const struct tune_params cortexa57_tunings =
798 {
799 &cortexa57_extra_costs,
800 &generic_addrcost_table,
801 &cortexa57_regmove_cost,
802 &cortexa57_vector_cost,
803 &generic_branch_cost,
804 &generic_approx_modes,
805 SVE_NOT_IMPLEMENTED, /* sve_width */
806 4, /* memmov_cost */
807 3, /* issue_rate */
808 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
809 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
810 "16", /* function_align. */
811 "4", /* jump_align. */
812 "8", /* loop_align. */
813 2, /* int_reassoc_width. */
814 4, /* fp_reassoc_width. */
815 1, /* vec_reassoc_width. */
816 2, /* min_div_recip_mul_sf. */
817 2, /* min_div_recip_mul_df. */
818 0, /* max_case_values. */
819 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
820 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
821 &generic_prefetch_tune
822 };
823
824 static const struct tune_params cortexa72_tunings =
825 {
826 &cortexa57_extra_costs,
827 &generic_addrcost_table,
828 &cortexa57_regmove_cost,
829 &cortexa57_vector_cost,
830 &generic_branch_cost,
831 &generic_approx_modes,
832 SVE_NOT_IMPLEMENTED, /* sve_width */
833 4, /* memmov_cost */
834 3, /* issue_rate */
835 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
836 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
837 "16", /* function_align. */
838 "4", /* jump_align. */
839 "8", /* loop_align. */
840 2, /* int_reassoc_width. */
841 4, /* fp_reassoc_width. */
842 1, /* vec_reassoc_width. */
843 2, /* min_div_recip_mul_sf. */
844 2, /* min_div_recip_mul_df. */
845 0, /* max_case_values. */
846 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
847 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
848 &generic_prefetch_tune
849 };
850
851 static const struct tune_params cortexa73_tunings =
852 {
853 &cortexa57_extra_costs,
854 &generic_addrcost_table,
855 &cortexa57_regmove_cost,
856 &cortexa57_vector_cost,
857 &generic_branch_cost,
858 &generic_approx_modes,
859 SVE_NOT_IMPLEMENTED, /* sve_width */
860 4, /* memmov_cost. */
861 2, /* issue_rate. */
862 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
863 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
864 "16", /* function_align. */
865 "4", /* jump_align. */
866 "8", /* loop_align. */
867 2, /* int_reassoc_width. */
868 4, /* fp_reassoc_width. */
869 1, /* vec_reassoc_width. */
870 2, /* min_div_recip_mul_sf. */
871 2, /* min_div_recip_mul_df. */
872 0, /* max_case_values. */
873 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
874 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
875 &generic_prefetch_tune
876 };
877
878
879
880 static const struct tune_params exynosm1_tunings =
881 {
882 &exynosm1_extra_costs,
883 &exynosm1_addrcost_table,
884 &exynosm1_regmove_cost,
885 &exynosm1_vector_cost,
886 &generic_branch_cost,
887 &exynosm1_approx_modes,
888 SVE_NOT_IMPLEMENTED, /* sve_width */
889 4, /* memmov_cost */
890 3, /* issue_rate */
891 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
892 "4", /* function_align. */
893 "4", /* jump_align. */
894 "4", /* loop_align. */
895 2, /* int_reassoc_width. */
896 4, /* fp_reassoc_width. */
897 1, /* vec_reassoc_width. */
898 2, /* min_div_recip_mul_sf. */
899 2, /* min_div_recip_mul_df. */
900 48, /* max_case_values. */
901 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
902 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
903 &exynosm1_prefetch_tune
904 };
905
906 static const struct tune_params thunderxt88_tunings =
907 {
908 &thunderx_extra_costs,
909 &generic_addrcost_table,
910 &thunderx_regmove_cost,
911 &thunderx_vector_cost,
912 &generic_branch_cost,
913 &generic_approx_modes,
914 SVE_NOT_IMPLEMENTED, /* sve_width */
915 6, /* memmov_cost */
916 2, /* issue_rate */
917 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
918 "8", /* function_align. */
919 "8", /* jump_align. */
920 "8", /* loop_align. */
921 2, /* int_reassoc_width. */
922 4, /* fp_reassoc_width. */
923 1, /* vec_reassoc_width. */
924 2, /* min_div_recip_mul_sf. */
925 2, /* min_div_recip_mul_df. */
926 0, /* max_case_values. */
927 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
928 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
929 &thunderxt88_prefetch_tune
930 };
931
932 static const struct tune_params thunderx_tunings =
933 {
934 &thunderx_extra_costs,
935 &generic_addrcost_table,
936 &thunderx_regmove_cost,
937 &thunderx_vector_cost,
938 &generic_branch_cost,
939 &generic_approx_modes,
940 SVE_NOT_IMPLEMENTED, /* sve_width */
941 6, /* memmov_cost */
942 2, /* issue_rate */
943 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
944 "8", /* function_align. */
945 "8", /* jump_align. */
946 "8", /* loop_align. */
947 2, /* int_reassoc_width. */
948 4, /* fp_reassoc_width. */
949 1, /* vec_reassoc_width. */
950 2, /* min_div_recip_mul_sf. */
951 2, /* min_div_recip_mul_df. */
952 0, /* max_case_values. */
953 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
954 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
955 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
956 &thunderx_prefetch_tune
957 };
958
959 static const struct tune_params tsv110_tunings =
960 {
961 &tsv110_extra_costs,
962 &tsv110_addrcost_table,
963 &tsv110_regmove_cost,
964 &tsv110_vector_cost,
965 &generic_branch_cost,
966 &generic_approx_modes,
967 SVE_NOT_IMPLEMENTED, /* sve_width */
968 4, /* memmov_cost */
969 4, /* issue_rate */
970 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
971 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
972 "16", /* function_align. */
973 "4", /* jump_align. */
974 "8", /* loop_align. */
975 2, /* int_reassoc_width. */
976 4, /* fp_reassoc_width. */
977 1, /* vec_reassoc_width. */
978 2, /* min_div_recip_mul_sf. */
979 2, /* min_div_recip_mul_df. */
980 0, /* max_case_values. */
981 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
982 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
983 &tsv110_prefetch_tune
984 };
985
986 static const struct tune_params xgene1_tunings =
987 {
988 &xgene1_extra_costs,
989 &xgene1_addrcost_table,
990 &xgene1_regmove_cost,
991 &xgene1_vector_cost,
992 &generic_branch_cost,
993 &xgene1_approx_modes,
994 SVE_NOT_IMPLEMENTED, /* sve_width */
995 6, /* memmov_cost */
996 4, /* issue_rate */
997 AARCH64_FUSE_NOTHING, /* fusible_ops */
998 "16", /* function_align. */
999 "16", /* jump_align. */
1000 "16", /* loop_align. */
1001 2, /* int_reassoc_width. */
1002 4, /* fp_reassoc_width. */
1003 1, /* vec_reassoc_width. */
1004 2, /* min_div_recip_mul_sf. */
1005 2, /* min_div_recip_mul_df. */
1006 17, /* max_case_values. */
1007 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1008 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1009 &xgene1_prefetch_tune
1010 };
1011
1012 static const struct tune_params emag_tunings =
1013 {
1014 &xgene1_extra_costs,
1015 &xgene1_addrcost_table,
1016 &xgene1_regmove_cost,
1017 &xgene1_vector_cost,
1018 &generic_branch_cost,
1019 &xgene1_approx_modes,
1020 SVE_NOT_IMPLEMENTED, /* sve_width */
1021 6, /* memmov_cost */
1022 4, /* issue_rate */
1023 AARCH64_FUSE_NOTHING, /* fusible_ops */
1024 "16", /* function_align. */
1025 "16", /* jump_align. */
1026 "16", /* loop_align. */
1027 2, /* int_reassoc_width. */
1028 4, /* fp_reassoc_width. */
1029 1, /* vec_reassoc_width. */
1030 2, /* min_div_recip_mul_sf. */
1031 2, /* min_div_recip_mul_df. */
1032 17, /* max_case_values. */
1033 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1034 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1035 &xgene1_prefetch_tune
1036 };
1037
1038 static const struct tune_params qdf24xx_tunings =
1039 {
1040 &qdf24xx_extra_costs,
1041 &qdf24xx_addrcost_table,
1042 &qdf24xx_regmove_cost,
1043 &qdf24xx_vector_cost,
1044 &generic_branch_cost,
1045 &generic_approx_modes,
1046 SVE_NOT_IMPLEMENTED, /* sve_width */
1047 4, /* memmov_cost */
1048 4, /* issue_rate */
1049 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1050 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1051 "16", /* function_align. */
1052 "8", /* jump_align. */
1053 "16", /* loop_align. */
1054 2, /* int_reassoc_width. */
1055 4, /* fp_reassoc_width. */
1056 1, /* vec_reassoc_width. */
1057 2, /* min_div_recip_mul_sf. */
1058 2, /* min_div_recip_mul_df. */
1059 0, /* max_case_values. */
1060 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1061 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1062 &qdf24xx_prefetch_tune
1063 };
1064
1065 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1066 for now. */
1067 static const struct tune_params saphira_tunings =
1068 {
1069 &generic_extra_costs,
1070 &generic_addrcost_table,
1071 &generic_regmove_cost,
1072 &generic_vector_cost,
1073 &generic_branch_cost,
1074 &generic_approx_modes,
1075 SVE_NOT_IMPLEMENTED, /* sve_width */
1076 4, /* memmov_cost */
1077 4, /* issue_rate */
1078 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1079 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1080 "16", /* function_align. */
1081 "8", /* jump_align. */
1082 "16", /* loop_align. */
1083 2, /* int_reassoc_width. */
1084 4, /* fp_reassoc_width. */
1085 1, /* vec_reassoc_width. */
1086 2, /* min_div_recip_mul_sf. */
1087 2, /* min_div_recip_mul_df. */
1088 0, /* max_case_values. */
1089 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1090 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1091 &generic_prefetch_tune
1092 };
1093
1094 static const struct tune_params thunderx2t99_tunings =
1095 {
1096 &thunderx2t99_extra_costs,
1097 &thunderx2t99_addrcost_table,
1098 &thunderx2t99_regmove_cost,
1099 &thunderx2t99_vector_cost,
1100 &generic_branch_cost,
1101 &generic_approx_modes,
1102 SVE_NOT_IMPLEMENTED, /* sve_width */
1103 4, /* memmov_cost. */
1104 4, /* issue_rate. */
1105 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1106 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1107 "16", /* function_align. */
1108 "8", /* jump_align. */
1109 "16", /* loop_align. */
1110 3, /* int_reassoc_width. */
1111 2, /* fp_reassoc_width. */
1112 2, /* vec_reassoc_width. */
1113 2, /* min_div_recip_mul_sf. */
1114 2, /* min_div_recip_mul_df. */
1115 0, /* max_case_values. */
1116 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1117 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1118 &thunderx2t99_prefetch_tune
1119 };
1120
1121 static const struct tune_params neoversen1_tunings =
1122 {
1123 &cortexa57_extra_costs,
1124 &generic_addrcost_table,
1125 &generic_regmove_cost,
1126 &cortexa57_vector_cost,
1127 &generic_branch_cost,
1128 &generic_approx_modes,
1129 SVE_NOT_IMPLEMENTED, /* sve_width */
1130 4, /* memmov_cost */
1131 3, /* issue_rate */
1132 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1133 "32:16", /* function_align. */
1134 "32:16", /* jump_align. */
1135 "32:16", /* loop_align. */
1136 2, /* int_reassoc_width. */
1137 4, /* fp_reassoc_width. */
1138 2, /* vec_reassoc_width. */
1139 2, /* min_div_recip_mul_sf. */
1140 2, /* min_div_recip_mul_df. */
1141 0, /* max_case_values. */
1142 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1143 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1144 &generic_prefetch_tune
1145 };
1146
1147 /* Support for fine-grained override of the tuning structures. */
1148 struct aarch64_tuning_override_function
1149 {
1150 const char* name;
1151 void (*parse_override)(const char*, struct tune_params*);
1152 };
1153
1154 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1155 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1156 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1157
1158 static const struct aarch64_tuning_override_function
1159 aarch64_tuning_override_functions[] =
1160 {
1161 { "fuse", aarch64_parse_fuse_string },
1162 { "tune", aarch64_parse_tune_string },
1163 { "sve_width", aarch64_parse_sve_width_string },
1164 { NULL, NULL }
1165 };
1166
1167 /* A processor implementing AArch64. */
1168 struct processor
1169 {
1170 const char *const name;
1171 enum aarch64_processor ident;
1172 enum aarch64_processor sched_core;
1173 enum aarch64_arch arch;
1174 unsigned architecture_version;
1175 const uint64_t flags;
1176 const struct tune_params *const tune;
1177 };
1178
1179 /* Architectures implementing AArch64. */
1180 static const struct processor all_architectures[] =
1181 {
1182 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1183 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1184 #include "aarch64-arches.def"
1185 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1186 };
1187
1188 /* Processor cores implementing AArch64. */
1189 static const struct processor all_cores[] =
1190 {
1191 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1192 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1193 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1194 FLAGS, &COSTS##_tunings},
1195 #include "aarch64-cores.def"
1196 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1197 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1198 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1199 };
1200
1201
1202 /* Target specification. These are populated by the -march, -mtune, -mcpu
1203 handling code or by target attributes. */
1204 static const struct processor *selected_arch;
1205 static const struct processor *selected_cpu;
1206 static const struct processor *selected_tune;
1207
1208 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1209
1210 /* The current tuning set. */
1211 struct tune_params aarch64_tune_params = generic_tunings;
1212
1213 /* Table of machine attributes. */
1214 static const struct attribute_spec aarch64_attribute_table[] =
1215 {
1216 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1217 affects_type_identity, handler, exclude } */
1218 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL },
1219 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1220 };
1221
1222 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1223
1224 /* An ISA extension in the co-processor and main instruction set space. */
1225 struct aarch64_option_extension
1226 {
1227 const char *const name;
1228 const unsigned long flags_on;
1229 const unsigned long flags_off;
1230 };
1231
1232 typedef enum aarch64_cond_code
1233 {
1234 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1235 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1236 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1237 }
1238 aarch64_cc;
1239
1240 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
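/* Editor's note (illustrative): the enum above deliberately lists each
   condition code next to its inverse (EQ/NE, CS/CC, MI/PL, ...), so flipping
   the low bit of the encoding yields the inverse condition:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_HI) == AARCH64_LS  */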
1241
1242 struct aarch64_branch_protect_type
1243 {
1244 /* The type's name that the user passes to the branch-protection option
1245 string. */
1246 const char* name;
1247 /* Function to handle the protection type and set global variables.
1248 First argument is the string token corresponding with this type and the
1249 second argument is the next token in the option string.
1250 Return values:
1251 * AARCH64_PARSE_OK: Handling was successful.
1252 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
1253 caller should print an error.
1254 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints
1255 its own error. */
1256 enum aarch64_parse_opt_result (*handler)(char*, char*);
1257 /* A list of types that can follow this type in the option string. */
1258 const aarch64_branch_protect_type* subtypes;
1259 unsigned int num_subtypes;
1260 };
1261
1262 static enum aarch64_parse_opt_result
1263 aarch64_handle_no_branch_protection (char* str, char* rest)
1264 {
1265 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1266 aarch64_enable_bti = 0;
1267 if (rest)
1268 {
1269 error ("unexpected %<%s%> after %<%s%>", rest, str);
1270 return AARCH64_PARSE_INVALID_FEATURE;
1271 }
1272 return AARCH64_PARSE_OK;
1273 }
1274
1275 static enum aarch64_parse_opt_result
1276 aarch64_handle_standard_branch_protection (char* str, char* rest)
1277 {
1278 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1279 aarch64_ra_sign_key = AARCH64_KEY_A;
1280 aarch64_enable_bti = 1;
1281 if (rest)
1282 {
1283 error ("unexpected %<%s%> after %<%s%>", rest, str);
1284 return AARCH64_PARSE_INVALID_FEATURE;
1285 }
1286 return AARCH64_PARSE_OK;
1287 }
1288
1289 static enum aarch64_parse_opt_result
1290 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1291 char* rest ATTRIBUTE_UNUSED)
1292 {
1293 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1294 aarch64_ra_sign_key = AARCH64_KEY_A;
1295 return AARCH64_PARSE_OK;
1296 }
1297
1298 static enum aarch64_parse_opt_result
1299 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1300 char* rest ATTRIBUTE_UNUSED)
1301 {
1302 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1303 return AARCH64_PARSE_OK;
1304 }
1305
1306 static enum aarch64_parse_opt_result
1307 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1308 char* rest ATTRIBUTE_UNUSED)
1309 {
1310 aarch64_ra_sign_key = AARCH64_KEY_B;
1311 return AARCH64_PARSE_OK;
1312 }
1313
1314 static enum aarch64_parse_opt_result
1315 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1316 char* rest ATTRIBUTE_UNUSED)
1317 {
1318 aarch64_enable_bti = 1;
1319 return AARCH64_PARSE_OK;
1320 }
1321
1322 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1323 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1324 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1325 { NULL, NULL, NULL, 0 }
1326 };
1327
1328 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1329 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1330 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1331 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1332 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1333 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1334 { NULL, NULL, NULL, 0 }
1335 };
1336
1337 /* The condition codes of the processor, and the inverse function. */
1338 static const char * const aarch64_condition_codes[] =
1339 {
1340 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1341 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1342 };
1343
1344 /* The preferred condition codes for SVE conditions. */
1345 static const char *const aarch64_sve_condition_codes[] =
1346 {
1347 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1348 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1349 };
1350
1351 /* Return the assembly token for svpattern value VALUE. */
1352
1353 static const char *
1354 svpattern_token (enum aarch64_svpattern pattern)
1355 {
1356 switch (pattern)
1357 {
1358 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1359 AARCH64_FOR_SVPATTERN (CASE)
1360 #undef CASE
1361 case AARCH64_NUM_SVPATTERNS:
1362 break;
1363 }
1364 gcc_unreachable ();
1365 }
1366
1367 /* Generate code to enable conditional branches in functions over 1 MiB. */
1368 const char *
1369 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1370 const char * branch_format)
1371 {
1372 rtx_code_label * tmp_label = gen_label_rtx ();
1373 char label_buf[256];
1374 char buffer[128];
1375 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1376 CODE_LABEL_NUMBER (tmp_label));
1377 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1378 rtx dest_label = operands[pos_label];
1379 operands[pos_label] = tmp_label;
1380
1381 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1382 output_asm_insn (buffer, operands);
1383
1384 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1385 operands[pos_label] = dest_label;
1386 output_asm_insn (buffer, operands);
1387 return "";
1388 }
1389
1390 void
1391 aarch64_err_no_fpadvsimd (machine_mode mode)
1392 {
1393 if (TARGET_GENERAL_REGS_ONLY)
1394 if (FLOAT_MODE_P (mode))
1395 error ("%qs is incompatible with the use of floating-point types",
1396 "-mgeneral-regs-only");
1397 else
1398 error ("%qs is incompatible with the use of vector types",
1399 "-mgeneral-regs-only");
1400 else
1401 if (FLOAT_MODE_P (mode))
1402 error ("%qs feature modifier is incompatible with the use of"
1403 " floating-point types", "+nofp");
1404 else
1405 error ("%qs feature modifier is incompatible with the use of"
1406 " vector types", "+nofp");
1407 }
1408
1409 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1410 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1411 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1412 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1413 and GENERAL_REGS is lower than the memory cost (in this case the best class
1414 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1415 cost results in bad allocations with many redundant int<->FP moves which
1416 are expensive on various cores.
1417 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1418 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1419 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1420 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1421 The result of this is that it is no longer inefficient to have a higher
1422 memory move cost than the register move cost.
1423 */
1424
1425 static reg_class_t
1426 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1427 reg_class_t best_class)
1428 {
1429 machine_mode mode;
1430
1431 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1432 || !reg_class_subset_p (FP_REGS, allocno_class))
1433 return allocno_class;
1434
1435 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1436 || !reg_class_subset_p (FP_REGS, best_class))
1437 return best_class;
1438
1439 mode = PSEUDO_REGNO_MODE (regno);
1440 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1441 }
1442
1443 static unsigned int
1444 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1445 {
1446 if (GET_MODE_UNIT_SIZE (mode) == 4)
1447 return aarch64_tune_params.min_div_recip_mul_sf;
1448 return aarch64_tune_params.min_div_recip_mul_df;
1449 }
1450
1451 /* Return the reassociation width of treeop OPC with mode MODE. */
1452 static int
1453 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1454 {
1455 if (VECTOR_MODE_P (mode))
1456 return aarch64_tune_params.vec_reassoc_width;
1457 if (INTEGRAL_MODE_P (mode))
1458 return aarch64_tune_params.int_reassoc_width;
1459 /* Avoid reassociating floating point addition so we emit more FMAs. */
1460 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1461 return aarch64_tune_params.fp_reassoc_width;
1462 return 1;
1463 }
1464
1465 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1466 unsigned
1467 aarch64_dbx_register_number (unsigned regno)
1468 {
1469 if (GP_REGNUM_P (regno))
1470 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1471 else if (regno == SP_REGNUM)
1472 return AARCH64_DWARF_SP;
1473 else if (FP_REGNUM_P (regno))
1474 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1475 else if (PR_REGNUM_P (regno))
1476 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1477 else if (regno == VG_REGNUM)
1478 return AARCH64_DWARF_VG;
1479
1480 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1481 equivalent DWARF register. */
1482 return DWARF_FRAME_REGISTERS;
1483 }
1484
1485 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1486 static bool
1487 aarch64_advsimd_struct_mode_p (machine_mode mode)
1488 {
1489 return (TARGET_SIMD
1490 && (mode == OImode || mode == CImode || mode == XImode));
1491 }
1492
1493 /* Return true if MODE is an SVE predicate mode. */
1494 static bool
1495 aarch64_sve_pred_mode_p (machine_mode mode)
1496 {
1497 return (TARGET_SVE
1498 && (mode == VNx16BImode
1499 || mode == VNx8BImode
1500 || mode == VNx4BImode
1501 || mode == VNx2BImode));
1502 }
1503
1504 /* Three mutually-exclusive flags describing a vector or predicate type. */
1505 const unsigned int VEC_ADVSIMD = 1;
1506 const unsigned int VEC_SVE_DATA = 2;
1507 const unsigned int VEC_SVE_PRED = 4;
1508 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1509 a structure of 2, 3 or 4 vectors. */
1510 const unsigned int VEC_STRUCT = 8;
1511 /* Useful combinations of the above. */
1512 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1513 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1514
1515 /* Return a set of flags describing the vector properties of mode MODE.
1516 Ignore modes that are not supported by the current target. */
1517 static unsigned int
1518 aarch64_classify_vector_mode (machine_mode mode)
1519 {
1520 if (aarch64_advsimd_struct_mode_p (mode))
1521 return VEC_ADVSIMD | VEC_STRUCT;
1522
1523 if (aarch64_sve_pred_mode_p (mode))
1524 return VEC_SVE_PRED;
1525
1526 /* Make the decision based on the mode's enum value rather than its
1527 properties, so that we keep the correct classification regardless
1528 of -msve-vector-bits. */
1529 switch (mode)
1530 {
1531 /* Single SVE vectors. */
1532 case E_VNx16QImode:
1533 case E_VNx8HImode:
1534 case E_VNx4SImode:
1535 case E_VNx2DImode:
1536 case E_VNx8HFmode:
1537 case E_VNx4SFmode:
1538 case E_VNx2DFmode:
1539 return TARGET_SVE ? VEC_SVE_DATA : 0;
1540
1541 /* x2 SVE vectors. */
1542 case E_VNx32QImode:
1543 case E_VNx16HImode:
1544 case E_VNx8SImode:
1545 case E_VNx4DImode:
1546 case E_VNx16HFmode:
1547 case E_VNx8SFmode:
1548 case E_VNx4DFmode:
1549 /* x3 SVE vectors. */
1550 case E_VNx48QImode:
1551 case E_VNx24HImode:
1552 case E_VNx12SImode:
1553 case E_VNx6DImode:
1554 case E_VNx24HFmode:
1555 case E_VNx12SFmode:
1556 case E_VNx6DFmode:
1557 /* x4 SVE vectors. */
1558 case E_VNx64QImode:
1559 case E_VNx32HImode:
1560 case E_VNx16SImode:
1561 case E_VNx8DImode:
1562 case E_VNx32HFmode:
1563 case E_VNx16SFmode:
1564 case E_VNx8DFmode:
1565 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1566
1567 /* 64-bit Advanced SIMD vectors. */
1568 case E_V8QImode:
1569 case E_V4HImode:
1570 case E_V2SImode:
1571 /* ...E_V1DImode doesn't exist. */
1572 case E_V4HFmode:
1573 case E_V2SFmode:
1574 case E_V1DFmode:
1575 /* 128-bit Advanced SIMD vectors. */
1576 case E_V16QImode:
1577 case E_V8HImode:
1578 case E_V4SImode:
1579 case E_V2DImode:
1580 case E_V8HFmode:
1581 case E_V4SFmode:
1582 case E_V2DFmode:
1583 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1584
1585 default:
1586 return 0;
1587 }
1588 }
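/* Editor's sketch (illustrative; assumes the relevant target features are
   enabled):

     aarch64_classify_vector_mode (E_V4SImode)   -> VEC_ADVSIMD
     aarch64_classify_vector_mode (E_VNx4SImode) -> VEC_SVE_DATA
     aarch64_classify_vector_mode (E_VNx8SImode) -> VEC_SVE_DATA | VEC_STRUCT
     aarch64_classify_vector_mode (E_VNx4BImode) -> VEC_SVE_PRED
     aarch64_classify_vector_mode (E_DImode)     -> 0 (not a vector mode)  */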
1589
1590 /* Return true if MODE is any of the data vector modes, including
1591 structure modes. */
1592 static bool
1593 aarch64_vector_data_mode_p (machine_mode mode)
1594 {
1595 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1596 }
1597
1598 /* Return true if MODE is an SVE data vector mode; either a single vector
1599 or a structure of vectors. */
1600 static bool
1601 aarch64_sve_data_mode_p (machine_mode mode)
1602 {
1603 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1604 }
1605
1606 /* Implement target hook TARGET_ARRAY_MODE. */
1607 static opt_machine_mode
1608 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1609 {
1610 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1611 && IN_RANGE (nelems, 2, 4))
1612 return mode_for_vector (GET_MODE_INNER (mode),
1613 GET_MODE_NUNITS (mode) * nelems);
1614
1615 return opt_machine_mode ();
1616 }
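/* Editor's sketch (illustrative): with SVE enabled, asking for an array of
   three VNx4SImode vectors yields the x3 structure mode:

     aarch64_array_mode (VNx4SImode, 3)  ->  VNx12SImode

   Requests outside the single-vector SVE modes, or with NELEMS outside
   [2, 4], return an empty opt_machine_mode and fall back to the generic
   handling.  */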
1617
1618 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1619 static bool
1620 aarch64_array_mode_supported_p (machine_mode mode,
1621 unsigned HOST_WIDE_INT nelems)
1622 {
1623 if (TARGET_SIMD
1624 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1625 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1626 && (nelems >= 2 && nelems <= 4))
1627 return true;
1628
1629 return false;
1630 }
1631
1632 /* Return the SVE predicate mode to use for elements that have
1633 ELEM_NBYTES bytes, if such a mode exists. */
1634
1635 opt_machine_mode
1636 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1637 {
1638 if (TARGET_SVE)
1639 {
1640 if (elem_nbytes == 1)
1641 return VNx16BImode;
1642 if (elem_nbytes == 2)
1643 return VNx8BImode;
1644 if (elem_nbytes == 4)
1645 return VNx4BImode;
1646 if (elem_nbytes == 8)
1647 return VNx2BImode;
1648 }
1649 return opt_machine_mode ();
1650 }
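/* Editor's sketch (illustrative): callers are expected to test the returned
   opt_machine_mode before using it, as aarch64_get_mask_mode does below:

     machine_mode pred_mode;
     if (aarch64_sve_pred_mode (4).exists (&pred_mode))
       ... pred_mode is VNx4BImode when SVE is enabled ...  */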
1651
1652 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1653
1654 static opt_machine_mode
1655 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1656 {
1657 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1658 {
1659 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1660 machine_mode pred_mode;
1661 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1662 return pred_mode;
1663 }
1664
1665 return default_get_mask_mode (nunits, nbytes);
1666 }
1667
1668 /* Return the integer element mode associated with SVE mode MODE. */
1669
1670 static scalar_int_mode
1671 aarch64_sve_element_int_mode (machine_mode mode)
1672 {
1673 unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
1674 GET_MODE_NUNITS (mode));
1675 return int_mode_for_size (elt_bits, 0).require ();
1676 }
1677
1678 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1679 prefer to use the first arithmetic operand as the else value if
1680 the else value doesn't matter, since that exactly matches the SVE
1681 destructive merging form. For ternary operations we could either
1682 pick the first operand and use FMAD-like instructions or the last
1683 operand and use FMLA-like instructions; the latter seems more
1684 natural. */
1685
1686 static tree
1687 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1688 {
1689 return nops == 3 ? ops[2] : ops[0];
1690 }
1691
1692 /* Implement TARGET_HARD_REGNO_NREGS. */
1693
1694 static unsigned int
1695 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1696 {
1697 /* ??? Logically we should only need to provide a value when
1698 HARD_REGNO_MODE_OK says that the combination is valid,
1699 but at the moment we need to handle all modes. Just ignore
1700 any runtime parts for registers that can't store them. */
1701 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1702 switch (aarch64_regno_regclass (regno))
1703 {
1704 case FP_REGS:
1705 case FP_LO_REGS:
1706 case FP_LO8_REGS:
1707 if (aarch64_sve_data_mode_p (mode))
1708 return exact_div (GET_MODE_SIZE (mode),
1709 BYTES_PER_SVE_VECTOR).to_constant ();
1710 return CEIL (lowest_size, UNITS_PER_VREG);
1711 case PR_REGS:
1712 case PR_LO_REGS:
1713 case PR_HI_REGS:
1714 return 1;
1715 default:
1716 return CEIL (lowest_size, UNITS_PER_WORD);
1717 }
1718 gcc_unreachable ();
1719 }
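/* Editor's worked examples for the function above (illustrative; they assume
   64-bit GP registers and the 128-bit minimum Advanced SIMD/SVE vector
   length):

     - TImode (16 bytes) in GP registers:  CEIL (16, UNITS_PER_WORD) = 2.
     - V4SImode (16 bytes) in an FP register:  CEIL (16, UNITS_PER_VREG) = 1.
     - VNx8SImode (an SVE x2 structure) in FP registers:
         GET_MODE_SIZE / BYTES_PER_SVE_VECTOR = 2.
     - Any predicate mode in a predicate register:  1.  */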
1720
1721 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1722
1723 static bool
1724 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1725 {
1726 if (GET_MODE_CLASS (mode) == MODE_CC)
1727 return regno == CC_REGNUM;
1728
1729 if (regno == VG_REGNUM)
1730 /* This must have the same size as _Unwind_Word. */
1731 return mode == DImode;
1732
1733 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1734 if (vec_flags & VEC_SVE_PRED)
1735 return PR_REGNUM_P (regno);
1736
1737 if (PR_REGNUM_P (regno))
1738 return 0;
1739
1740 if (regno == SP_REGNUM)
1741 /* The purpose of comparing with ptr_mode is to support the
1742 global register variable associated with the stack pointer
1743 register via the syntax of asm ("wsp") in ILP32. */
1744 return mode == Pmode || mode == ptr_mode;
1745
1746 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1747 return mode == Pmode;
1748
1749 if (GP_REGNUM_P (regno))
1750 {
1751 if (known_le (GET_MODE_SIZE (mode), 8))
1752 return true;
1753 else if (known_le (GET_MODE_SIZE (mode), 16))
1754 return (regno & 1) == 0;
1755 }
1756 else if (FP_REGNUM_P (regno))
1757 {
1758 if (vec_flags & VEC_STRUCT)
1759 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1760 else
1761 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1762 }
1763
1764 return false;
1765 }
1766
1767 /* Return true if this is a definition of a vectorized simd function. */
1768
1769 static bool
1770 aarch64_simd_decl_p (tree fndecl)
1771 {
1772 tree fntype;
1773
1774 if (fndecl == NULL)
1775 return false;
1776 fntype = TREE_TYPE (fndecl);
1777 if (fntype == NULL)
1778 return false;
1779
1780 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */
1781 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1782 return true;
1783
1784 return false;
1785 }
1786
1787 /* Return the mode a register save/restore should use. DImode for integer
1788 registers, DFmode for FP registers in non-SIMD functions (they only save
1789 the bottom half of a 128 bit register), or TFmode for FP registers in
1790 SIMD functions. */
1791
1792 static machine_mode
1793 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1794 {
1795 return GP_REGNUM_P (regno)
1796 ? E_DImode
1797 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1798 }
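/* Editor's sketch (illustrative): a GP register save always uses DImode,
   while an FP register save uses DFmode under the base PCS (only the low
   64 bits are call-preserved) and TFmode when FNDECL carries the
   "aarch64_vector_pcs" attribute, per aarch64_simd_decl_p above.  */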
1799
1800 /* Return true if the instruction is a call to a SIMD function, false
1801 if it is not a SIMD function or if we do not know anything about
1802 the function. */
1803
1804 static bool
1805 aarch64_simd_call_p (rtx_insn *insn)
1806 {
1807 rtx symbol;
1808 rtx call;
1809 tree fndecl;
1810
1811 gcc_assert (CALL_P (insn));
1812 call = get_call_rtx_from (insn);
1813 symbol = XEXP (XEXP (call, 0), 0);
1814 if (GET_CODE (symbol) != SYMBOL_REF)
1815 return false;
1816 fndecl = SYMBOL_REF_DECL (symbol);
1817 if (!fndecl)
1818 return false;
1819
1820 return aarch64_simd_decl_p (fndecl);
1821 }
1822
1823 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls
1824 a function that uses the SIMD ABI, take advantage of the extra
1825 call-preserved registers that the ABI provides. */
1826
1827 void
1828 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1829 HARD_REG_SET *return_set)
1830 {
1831 if (aarch64_simd_call_p (insn))
1832 {
1833 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1834 if (FP_SIMD_SAVED_REGNUM_P (regno))
1835 CLEAR_HARD_REG_BIT (*return_set, regno);
1836 }
1837 }
1838
1839 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1840 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1841 clobbers the top 64 bits when restoring the bottom 64 bits. */
1842
1843 static bool
1844 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1845 machine_mode mode)
1846 {
1847 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1848 return FP_REGNUM_P (regno)
1849 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1850 }
1851
1852 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */
1853
1854 rtx_insn *
1855 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1856 {
1857 gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1858
1859 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1860 return call_1;
1861 else
1862 return call_2;
1863 }
1864
1865 /* Implement REGMODE_NATURAL_SIZE. */
1866 poly_uint64
1867 aarch64_regmode_natural_size (machine_mode mode)
1868 {
1869 /* The natural size for SVE data modes is one SVE data vector,
1870 and similarly for predicates. We can't independently modify
1871 anything smaller than that. */
1872 /* ??? For now, only do this for variable-width SVE registers.
1873 Doing it for constant-sized registers breaks lower-subreg.c. */
1874 /* ??? And once that's fixed, we should probably have similar
1875 code for Advanced SIMD. */
1876 if (!aarch64_sve_vg.is_constant ())
1877 {
1878 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1879 if (vec_flags & VEC_SVE_PRED)
1880 return BYTES_PER_SVE_PRED;
1881 if (vec_flags & VEC_SVE_DATA)
1882 return BYTES_PER_SVE_VECTOR;
1883 }
1884 return UNITS_PER_WORD;
1885 }
1886
1887 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1888 machine_mode
1889 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1890 machine_mode mode)
1891 {
1892 /* The predicate mode determines which bits are significant and
1893 which are "don't care". Decreasing the number of lanes would
1894 lose data while increasing the number of lanes would make bits
1895 unnecessarily significant. */
1896 if (PR_REGNUM_P (regno))
1897 return mode;
1898 if (known_ge (GET_MODE_SIZE (mode), 4))
1899 return mode;
1900 else
1901 return SImode;
1902 }
1903
1904 /* Return true if I's bits are consecutive ones from the MSB. */
1905 bool
1906 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1907 {
1908 return exact_log2 (-i) != HOST_WIDE_INT_M1;
1909 }
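
/* A minimal illustrative sketch (not part of this file) of the check above,
   using plain 64-bit arithmetic: I has consecutive ones from the MSB iff -I
   is a nonzero power of two.  For example, I = 0xffffffffffffff00 gives
   -I = 0x100, so the answer is true, while I = 0xff00ff00ff00ff00 gives
   -I = 0x00ff00ff00ff0100, which is not a power of two.  The block is
   guarded out of the build and is only meant as documentation.  */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
high_bits_all_ones_example (int64_t i)
{
  uint64_t neg = -(uint64_t) i;
  return neg != 0 && (neg & (neg - 1)) == 0;
}
#endif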
1910
1911 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1912 that strcpy from constants will be faster. */
1913
1914 static HOST_WIDE_INT
1915 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1916 {
1917 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1918 return MAX (align, BITS_PER_WORD);
1919 return align;
1920 }
1921
1922 /* Return true if calls to DECL should be treated as
1923 long-calls (ie called via a register). */
1924 static bool
1925 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1926 {
1927 return false;
1928 }
1929
1930 /* Return true if calls to symbol-ref SYM should be treated as
1931 long-calls (ie called via a register). */
1932 bool
1933 aarch64_is_long_call_p (rtx sym)
1934 {
1935 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1936 }
1937
1938 /* Return true if calls to symbol-ref SYM should not go through
1939 plt stubs. */
1940
1941 bool
1942 aarch64_is_noplt_call_p (rtx sym)
1943 {
1944 const_tree decl = SYMBOL_REF_DECL (sym);
1945
1946 if (flag_pic
1947 && decl
1948 && (!flag_plt
1949 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1950 && !targetm.binds_local_p (decl))
1951 return true;
1952
1953 return false;
1954 }
1955
1956 /* Return true if the offsets to a zero/sign-extract operation
1957 represent an expression that matches an extend operation. The
1958 operands represent the paramters from
1959
1960 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1961 bool
1962 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1963 rtx extract_imm)
1964 {
1965 HOST_WIDE_INT mult_val, extract_val;
1966
1967 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1968 return false;
1969
1970 mult_val = INTVAL (mult_imm);
1971 extract_val = INTVAL (extract_imm);
1972
1973 if (extract_val > 8
1974 && extract_val < GET_MODE_BITSIZE (mode)
1975 && exact_log2 (extract_val & ~7) > 0
1976 && (extract_val & 7) <= 4
1977 && mult_val == (1 << (extract_val & 7)))
1978 return true;
1979
1980 return false;
1981 }
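
/* Illustrative sketch (not part of this file) that mirrors the numeric
   checks above on plain integers.  For example, EXTRACT_IMM = 34 and
   MULT_IMM = 4 pass for a 64-bit mode: 34 & ~7 = 32 is a power of two,
   34 & 7 = 2 is at most 4, and 4 == 1 << 2.  Guarded out of the build.  */
#if 0
#include <stdbool.h>

static bool
extend_from_extract_example (int mode_bitsize, long long mult_val,
                             long long extract_val)
{
  if (extract_val <= 8 || extract_val >= mode_bitsize)
    return false;
  long long base = extract_val & ~7LL;   /* Must be a power of two.  */
  long long shift = extract_val & 7;     /* Implied left-shift amount.  */
  return (base & (base - 1)) == 0
         && shift <= 4
         && mult_val == (1LL << shift);
}
#endif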
1982
1983 /* Emit an insn that's a simple single-set. Both the operands must be
1984 known to be valid. */
1985 inline static rtx_insn *
1986 emit_set_insn (rtx x, rtx y)
1987 {
1988 return emit_insn (gen_rtx_SET (x, y));
1989 }
1990
1991 /* X and Y are two things to compare using CODE. Emit the compare insn and
1992 return the rtx for register 0 in the proper mode. */
1993 rtx
1994 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1995 {
1996 machine_mode mode = SELECT_CC_MODE (code, x, y);
1997 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1998
1999 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
2000 return cc_reg;
2001 }
2002
2003 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2004
2005 static rtx
2006 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2007 machine_mode y_mode)
2008 {
2009 if (y_mode == E_QImode || y_mode == E_HImode)
2010 {
2011 if (CONST_INT_P (y))
2012 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2013 else
2014 {
2015 rtx t, cc_reg;
2016 machine_mode cc_mode;
2017
2018 t = gen_rtx_ZERO_EXTEND (SImode, y);
2019 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2020 cc_mode = CC_SWPmode;
2021 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2022 emit_set_insn (cc_reg, t);
2023 return cc_reg;
2024 }
2025 }
2026
2027 return aarch64_gen_compare_reg (code, x, y);
2028 }
2029
2030 /* Build the SYMBOL_REF for __tls_get_addr. */
2031
2032 static GTY(()) rtx tls_get_addr_libfunc;
2033
2034 rtx
2035 aarch64_tls_get_addr (void)
2036 {
2037 if (!tls_get_addr_libfunc)
2038 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2039 return tls_get_addr_libfunc;
2040 }
2041
2042 /* Return the TLS model to use for ADDR. */
2043
2044 static enum tls_model
2045 tls_symbolic_operand_type (rtx addr)
2046 {
2047 enum tls_model tls_kind = TLS_MODEL_NONE;
2048 if (GET_CODE (addr) == CONST)
2049 {
2050 poly_int64 addend;
2051 rtx sym = strip_offset (addr, &addend);
2052 if (GET_CODE (sym) == SYMBOL_REF)
2053 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2054 }
2055 else if (GET_CODE (addr) == SYMBOL_REF)
2056 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2057
2058 return tls_kind;
2059 }
2060
2061 /* We allow lo_sum's in addresses as legitimate addresses, so that
2062 combine can take care of combining addresses where necessary, but
2063 for generation purposes we generate the address directly, as
2064 follows:
2065 RTL Absolute
2066 tmp = hi (symbol_ref); adrp x1, foo
2067 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2068 nop
2069
2070 PIC TLS
2071 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2072 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2073 bl __tls_get_addr
2074 nop
2075
2076 Load TLS symbol, depending on TLS mechanism and TLS access model.
2077
2078 Global Dynamic - Traditional TLS:
2079 adrp tmp, :tlsgd:imm
2080 add dest, tmp, #:tlsgd_lo12:imm
2081 bl __tls_get_addr
2082
2083 Global Dynamic - TLS Descriptors:
2084 adrp dest, :tlsdesc:imm
2085 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2086 add dest, dest, #:tlsdesc_lo12:imm
2087 blr tmp
2088 mrs tp, tpidr_el0
2089 add dest, dest, tp
2090
2091 Initial Exec:
2092 mrs tp, tpidr_el0
2093 adrp tmp, :gottprel:imm
2094 ldr dest, [tmp, #:gottprel_lo12:imm]
2095 add dest, dest, tp
2096
2097 Local Exec:
2098 mrs tp, tpidr_el0
2099 add t0, tp, #:tprel_hi12:imm, lsl #12
2100 add t0, t0, #:tprel_lo12_nc:imm
2101 */
2102
2103 static void
2104 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2105 enum aarch64_symbol_type type)
2106 {
2107 switch (type)
2108 {
2109 case SYMBOL_SMALL_ABSOLUTE:
2110 {
2111 /* In ILP32, the mode of dest can be either SImode or DImode. */
2112 rtx tmp_reg = dest;
2113 machine_mode mode = GET_MODE (dest);
2114
2115 gcc_assert (mode == Pmode || mode == ptr_mode);
2116
2117 if (can_create_pseudo_p ())
2118 tmp_reg = gen_reg_rtx (mode);
2119
2120 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2121 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2122 return;
2123 }
2124
2125 case SYMBOL_TINY_ABSOLUTE:
2126 emit_insn (gen_rtx_SET (dest, imm));
2127 return;
2128
2129 case SYMBOL_SMALL_GOT_28K:
2130 {
2131 machine_mode mode = GET_MODE (dest);
2132 rtx gp_rtx = pic_offset_table_rtx;
2133 rtx insn;
2134 rtx mem;
2135
2136 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2137 here before RTL expansion. Tree IVOPTS generates RTL patterns in
2138 order to estimate rtx costs, in which case pic_offset_table_rtx is
2139 not initialized. In that case there is no need to generate the
2140 first adrp instruction, since the final cost of a global variable
2141 access is one instruction. */
2142 if (gp_rtx != NULL)
2143 {
2144 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2145 use the page base as the GOT base, the first page may be wasted;
2146 in the worst case only 28K of space is left for the GOT).
2147
2148 The generated instruction sequence for accessing a global variable
2149 is:
2150
2151 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2152
2153 Only one instruction is needed, but we must initialize
2154 pic_offset_table_rtx properly. We generate an initialization insn
2155 for every global access and rely on CSE to remove the redundant ones.
2156
2157 The final instruction sequence for accesses to multiple global
2158 variables will look like the following.
2159
2160 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2161
2162 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2163 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2164 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2165 ... */
2166
2167 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2168 crtl->uses_pic_offset_table = 1;
2169 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2170
2171 if (mode != GET_MODE (gp_rtx))
2172 gp_rtx = gen_lowpart (mode, gp_rtx);
2173
2174 }
2175
2176 if (mode == ptr_mode)
2177 {
2178 if (mode == DImode)
2179 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2180 else
2181 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2182
2183 mem = XVECEXP (SET_SRC (insn), 0, 0);
2184 }
2185 else
2186 {
2187 gcc_assert (mode == Pmode);
2188
2189 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2190 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2191 }
2192
2193 /* The operand is expected to be a MEM. Whenever the related insn
2194 pattern changes, the code above that calculates MEM should be
2195 updated accordingly. */
2196 gcc_assert (GET_CODE (mem) == MEM);
2197 MEM_READONLY_P (mem) = 1;
2198 MEM_NOTRAP_P (mem) = 1;
2199 emit_insn (insn);
2200 return;
2201 }
2202
2203 case SYMBOL_SMALL_GOT_4G:
2204 {
2205 /* In ILP32, the mode of dest can be either SImode or DImode,
2206 while the got entry is always of SImode size. The mode of
2207 dest depends on how dest is used: if dest is assigned to a
2208 pointer (e.g. in the memory), it has SImode; it may have
2209 DImode if dest is dereferenced to access the memory.
2210 This is why we have to handle three different ldr_got_small
2211 patterns here (two patterns for ILP32). */
2212
2213 rtx insn;
2214 rtx mem;
2215 rtx tmp_reg = dest;
2216 machine_mode mode = GET_MODE (dest);
2217
2218 if (can_create_pseudo_p ())
2219 tmp_reg = gen_reg_rtx (mode);
2220
2221 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2222 if (mode == ptr_mode)
2223 {
2224 if (mode == DImode)
2225 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2226 else
2227 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2228
2229 mem = XVECEXP (SET_SRC (insn), 0, 0);
2230 }
2231 else
2232 {
2233 gcc_assert (mode == Pmode);
2234
2235 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2236 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2237 }
2238
2239 gcc_assert (GET_CODE (mem) == MEM);
2240 MEM_READONLY_P (mem) = 1;
2241 MEM_NOTRAP_P (mem) = 1;
2242 emit_insn (insn);
2243 return;
2244 }
2245
2246 case SYMBOL_SMALL_TLSGD:
2247 {
2248 rtx_insn *insns;
2249 machine_mode mode = GET_MODE (dest);
2250 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2251
2252 start_sequence ();
2253 if (TARGET_ILP32)
2254 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2255 else
2256 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2257 insns = get_insns ();
2258 end_sequence ();
2259
2260 RTL_CONST_CALL_P (insns) = 1;
2261 emit_libcall_block (insns, dest, result, imm);
2262 return;
2263 }
2264
2265 case SYMBOL_SMALL_TLSDESC:
2266 {
2267 machine_mode mode = GET_MODE (dest);
2268 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2269 rtx tp;
2270
2271 gcc_assert (mode == Pmode || mode == ptr_mode);
2272
2273 /* In ILP32, the got entry is always of SImode size. Unlike
2274 small GOT, the dest is fixed at reg 0. */
2275 if (TARGET_ILP32)
2276 emit_insn (gen_tlsdesc_small_si (imm));
2277 else
2278 emit_insn (gen_tlsdesc_small_di (imm));
2279 tp = aarch64_load_tp (NULL);
2280
2281 if (mode != Pmode)
2282 tp = gen_lowpart (mode, tp);
2283
2284 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2285 if (REG_P (dest))
2286 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2287 return;
2288 }
2289
2290 case SYMBOL_SMALL_TLSIE:
2291 {
2292 /* In ILP32, the mode of dest can be either SImode or DImode,
2293 while the got entry is always of SImode size. The mode of
2294 dest depends on how dest is used: if dest is assigned to a
2295 pointer (e.g. in the memory), it has SImode; it may have
2296 DImode if dest is dereferenced to access the memory.
2297 This is why we have to handle three different tlsie_small
2298 patterns here (two patterns for ILP32). */
2299 machine_mode mode = GET_MODE (dest);
2300 rtx tmp_reg = gen_reg_rtx (mode);
2301 rtx tp = aarch64_load_tp (NULL);
2302
2303 if (mode == ptr_mode)
2304 {
2305 if (mode == DImode)
2306 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2307 else
2308 {
2309 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2310 tp = gen_lowpart (mode, tp);
2311 }
2312 }
2313 else
2314 {
2315 gcc_assert (mode == Pmode);
2316 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2317 }
2318
2319 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2320 if (REG_P (dest))
2321 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2322 return;
2323 }
2324
2325 case SYMBOL_TLSLE12:
2326 case SYMBOL_TLSLE24:
2327 case SYMBOL_TLSLE32:
2328 case SYMBOL_TLSLE48:
2329 {
2330 machine_mode mode = GET_MODE (dest);
2331 rtx tp = aarch64_load_tp (NULL);
2332
2333 if (mode != Pmode)
2334 tp = gen_lowpart (mode, tp);
2335
2336 switch (type)
2337 {
2338 case SYMBOL_TLSLE12:
2339 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2340 (dest, tp, imm));
2341 break;
2342 case SYMBOL_TLSLE24:
2343 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2344 (dest, tp, imm));
2345 break;
2346 case SYMBOL_TLSLE32:
2347 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2348 (dest, imm));
2349 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2350 (dest, dest, tp));
2351 break;
2352 case SYMBOL_TLSLE48:
2353 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2354 (dest, imm));
2355 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2356 (dest, dest, tp));
2357 break;
2358 default:
2359 gcc_unreachable ();
2360 }
2361
2362 if (REG_P (dest))
2363 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2364 return;
2365 }
2366
2367 case SYMBOL_TINY_GOT:
2368 emit_insn (gen_ldr_got_tiny (dest, imm));
2369 return;
2370
2371 case SYMBOL_TINY_TLSIE:
2372 {
2373 machine_mode mode = GET_MODE (dest);
2374 rtx tp = aarch64_load_tp (NULL);
2375
2376 if (mode == ptr_mode)
2377 {
2378 if (mode == DImode)
2379 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2380 else
2381 {
2382 tp = gen_lowpart (mode, tp);
2383 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2384 }
2385 }
2386 else
2387 {
2388 gcc_assert (mode == Pmode);
2389 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2390 }
2391
2392 if (REG_P (dest))
2393 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2394 return;
2395 }
2396
2397 default:
2398 gcc_unreachable ();
2399 }
2400 }
2401
2402 /* Emit a move from SRC to DEST. Assume that the move expanders can
2403 handle all moves if !can_create_pseudo_p (). The distinction is
2404 important because, unlike emit_move_insn, the move expanders know
2405 how to force Pmode objects into the constant pool even when the
2406 constant pool address is not itself legitimate. */
2407 static rtx
2408 aarch64_emit_move (rtx dest, rtx src)
2409 {
2410 return (can_create_pseudo_p ()
2411 ? emit_move_insn (dest, src)
2412 : emit_move_insn_1 (dest, src));
2413 }
2414
2415 /* Apply UNOPTAB to OP and store the result in DEST. */
2416
2417 static void
2418 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2419 {
2420 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2421 if (dest != tmp)
2422 emit_move_insn (dest, tmp);
2423 }
2424
2425 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2426
2427 static void
2428 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2429 {
2430 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2431 OPTAB_DIRECT);
2432 if (dest != tmp)
2433 emit_move_insn (dest, tmp);
2434 }
2435
2436 /* Split a 128-bit move operation into two 64-bit move operations,
2437 taking care to handle partial overlap of register to register
2438 copies. Special cases are needed when moving between GP regs and
2439 FP regs. SRC can be a register, constant or memory; DST a register
2440 or memory. If either operand is memory it must not have any side
2441 effects. */
2442 void
2443 aarch64_split_128bit_move (rtx dst, rtx src)
2444 {
2445 rtx dst_lo, dst_hi;
2446 rtx src_lo, src_hi;
2447
2448 machine_mode mode = GET_MODE (dst);
2449
2450 gcc_assert (mode == TImode || mode == TFmode);
2451 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2452 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2453
2454 if (REG_P (dst) && REG_P (src))
2455 {
2456 int src_regno = REGNO (src);
2457 int dst_regno = REGNO (dst);
2458
2459 /* Handle FP <-> GP regs. */
2460 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2461 {
2462 src_lo = gen_lowpart (word_mode, src);
2463 src_hi = gen_highpart (word_mode, src);
2464
2465 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2466 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2467 return;
2468 }
2469 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2470 {
2471 dst_lo = gen_lowpart (word_mode, dst);
2472 dst_hi = gen_highpart (word_mode, dst);
2473
2474 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2475 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2476 return;
2477 }
2478 }
2479
2480 dst_lo = gen_lowpart (word_mode, dst);
2481 dst_hi = gen_highpart (word_mode, dst);
2482 src_lo = gen_lowpart (word_mode, src);
2483 src_hi = gen_highpart_mode (word_mode, mode, src);
2484
2485 /* At most one pairing may overlap. */
2486 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2487 {
2488 aarch64_emit_move (dst_hi, src_hi);
2489 aarch64_emit_move (dst_lo, src_lo);
2490 }
2491 else
2492 {
2493 aarch64_emit_move (dst_lo, src_lo);
2494 aarch64_emit_move (dst_hi, src_hi);
2495 }
2496 }
2497
2498 bool
2499 aarch64_split_128bit_move_p (rtx dst, rtx src)
2500 {
2501 return (! REG_P (src)
2502 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2503 }
2504
2505 /* Split a complex SIMD combine. */
2506
2507 void
2508 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2509 {
2510 machine_mode src_mode = GET_MODE (src1);
2511 machine_mode dst_mode = GET_MODE (dst);
2512
2513 gcc_assert (VECTOR_MODE_P (dst_mode));
2514 gcc_assert (register_operand (dst, dst_mode)
2515 && register_operand (src1, src_mode)
2516 && register_operand (src2, src_mode));
2517
2518 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2519 return;
2520 }
2521
2522 /* Split a complex SIMD move. */
2523
2524 void
2525 aarch64_split_simd_move (rtx dst, rtx src)
2526 {
2527 machine_mode src_mode = GET_MODE (src);
2528 machine_mode dst_mode = GET_MODE (dst);
2529
2530 gcc_assert (VECTOR_MODE_P (dst_mode));
2531
2532 if (REG_P (dst) && REG_P (src))
2533 {
2534 gcc_assert (VECTOR_MODE_P (src_mode));
2535 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2536 }
2537 }
2538
2539 bool
2540 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2541 machine_mode ymode, rtx y)
2542 {
2543 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2544 gcc_assert (r != NULL);
2545 return rtx_equal_p (x, r);
2546 }
2547
2548
2549 /* Return TARGET if it is nonnull and a register of mode MODE.
2550 Otherwise, return a fresh register of mode MODE if we can,
2551 or TARGET reinterpreted as MODE if we can't. */
2552
2553 static rtx
2554 aarch64_target_reg (rtx target, machine_mode mode)
2555 {
2556 if (target && REG_P (target) && GET_MODE (target) == mode)
2557 return target;
2558 if (!can_create_pseudo_p ())
2559 {
2560 gcc_assert (target);
2561 return gen_lowpart (mode, target);
2562 }
2563 return gen_reg_rtx (mode);
2564 }
2565
2566 /* Return a register that contains the constant in BUILDER, given that
2567 the constant is a legitimate move operand. Use TARGET as the register
2568 if it is nonnull and convenient. */
2569
2570 static rtx
2571 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2572 {
2573 rtx src = builder.build ();
2574 target = aarch64_target_reg (target, GET_MODE (src));
2575 emit_insn (gen_rtx_SET (target, src));
2576 return target;
2577 }
2578
2579 static rtx
2580 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2581 {
2582 if (can_create_pseudo_p ())
2583 return force_reg (mode, value);
2584 else
2585 {
2586 gcc_assert (x);
2587 aarch64_emit_move (x, value);
2588 return x;
2589 }
2590 }
2591
2592 /* Return true if predicate value X is a constant in which every element
2593 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2594 value, i.e. as a predicate in which all bits are significant. */
2595
2596 static bool
2597 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2598 {
2599 if (GET_CODE (x) != CONST_VECTOR)
2600 return false;
2601
2602 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2603 GET_MODE_NUNITS (GET_MODE (x)));
2604 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2605 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2606 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2607
2608 unsigned int nelts = const_vector_encoded_nelts (x);
2609 for (unsigned int i = 0; i < nelts; ++i)
2610 {
2611 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2612 if (!CONST_INT_P (elt))
2613 return false;
2614
2615 builder.quick_push (elt);
2616 for (unsigned int j = 1; j < factor; ++j)
2617 builder.quick_push (const0_rtx);
2618 }
2619 builder.finalize ();
2620 return true;
2621 }
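
/* Illustrative sketch (not part of this file) of the widening step above,
   on plain arrays rather than the rtx_vector_builder encoding.  For
   example, a VNx4BI constant {1, 0, 1, 1} has FACTOR = 16/4 = 4 and is
   described as the VNx16BI value {1,0,0,0, 0,0,0,0, 1,0,0,0, 1,0,0,0}.
   Guarded out of the build.  */
#if 0
static void
widen_pred_bits_example (const unsigned char *src, unsigned int src_nelts,
                         unsigned int factor, unsigned char *dst)
{
  for (unsigned int i = 0; i < src_nelts; ++i)
    {
      dst[i * factor] = src[i];              /* The significant bit.  */
      for (unsigned int j = 1; j < factor; ++j)
        dst[i * factor + j] = 0;             /* Upper bits are zero.  */
    }
}
#endif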
2622
2623 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2624 widest predicate element size it can have (that is, the largest size
2625 for which each element would still be 0 or 1). */
2626
2627 unsigned int
2628 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
2629 {
2630 /* Start with the most optimistic assumption: that we only need
2631 one bit per pattern. This is what we will use if only the first
2632 bit in each pattern is ever set. */
2633 unsigned int mask = GET_MODE_SIZE (DImode);
2634 mask |= builder.npatterns ();
2635
2636 /* Look for set bits. */
2637 unsigned int nelts = builder.encoded_nelts ();
2638 for (unsigned int i = 1; i < nelts; ++i)
2639 if (INTVAL (builder.elt (i)) != 0)
2640 {
2641 if (i & 1)
2642 return 1;
2643 mask |= i;
2644 }
2645 return mask & -mask;
2646 }
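
/* Illustrative sketch (not part of this file) of the same computation on a
   fully-expanded array of predicate bits: the answer is the largest power
   of two, capped at 8, that divides the index of every set bit.  For
   example, bits set only at indices 0, 4, 8 and 12 give 4, i.e. the
   constant can act as a predicate for 4-byte elements.  Guarded out of
   the build.  */
#if 0
static unsigned int
widest_pred_elt_size_example (const unsigned char *bits, unsigned int nbits)
{
  unsigned int mask = 8;          /* Never wider than 8-byte elements.  */
  for (unsigned int i = 1; i < nbits; ++i)
    if (bits[i])
      mask |= i;
  return mask & -mask;
}
#endif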
2647
2648 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
2649 that the constant would have with predicate element size ELT_SIZE
2650 (ignoring the upper bits in each element) and return:
2651
2652 * -1 if all bits are set
2653 * N if the predicate has N leading set bits followed by all clear bits
2654 * 0 if the predicate does not have any of these forms. */
2655
2656 int
2657 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
2658 unsigned int elt_size)
2659 {
2660 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
2661 followed by set bits. */
2662 if (builder.nelts_per_pattern () == 3)
2663 return 0;
2664
2665 /* Skip over leading set bits. */
2666 unsigned int nelts = builder.encoded_nelts ();
2667 unsigned int i = 0;
2668 for (; i < nelts; i += elt_size)
2669 if (INTVAL (builder.elt (i)) == 0)
2670 break;
2671 unsigned int vl = i / elt_size;
2672
2673 /* Check for the all-true case. */
2674 if (i == nelts)
2675 return -1;
2676
2677 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
2678 repeating pattern of set bits followed by clear bits. */
2679 if (builder.nelts_per_pattern () != 2)
2680 return 0;
2681
2682 /* We have a "foreground" value and a duplicated "background" value.
2683 If the background might repeat and the last set bit belongs to it,
2684 we might have set bits followed by clear bits followed by set bits. */
2685 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
2686 return 0;
2687
2688 /* Make sure that the rest are all clear. */
2689 for (; i < nelts; i += elt_size)
2690 if (INTVAL (builder.elt (i)) != 0)
2691 return 0;
2692
2693 return vl;
2694 }
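
/* Illustrative sketch (not part of this file) of the same classification on
   a fully-expanded array of predicate bits, sampling one bit per element of
   ELT_SIZE bytes.  For example, with ELT_SIZE = 2 the bits
   {1,0, 1,0, 1,0, 0,0, ...} (all remaining bits clear) give 3, while an
   all-set predicate gives -1.  Guarded out of the build.  */
#if 0
static int
partial_ptrue_length_example (const unsigned char *bits, unsigned int nbits,
                              unsigned int elt_size)
{
  unsigned int i = 0;
  for (; i < nbits; i += elt_size)       /* Skip over leading set bits.  */
    if (!bits[i])
      break;
  int vl = (int) (i / elt_size);
  if (i >= nbits)
    return -1;                           /* All bits set.  */
  for (; i < nbits; i += elt_size)       /* The rest must be clear.  */
    if (bits[i])
      return 0;
  return vl;
}
#endif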
2695
2696 /* See if there is an svpattern that encodes an SVE predicate of mode
2697 PRED_MODE in which the first VL bits are set and the rest are clear.
2698 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
2699 A VL of -1 indicates an all-true vector. */
2700
2701 aarch64_svpattern
2702 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
2703 {
2704 if (vl < 0)
2705 return AARCH64_SV_ALL;
2706
2707 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
2708 return AARCH64_NUM_SVPATTERNS;
2709
2710 if (vl >= 1 && vl <= 8)
2711 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
2712
2713 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
2714 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
2715
2716 int max_vl;
2717 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
2718 {
2719 if (vl == (max_vl / 3) * 3)
2720 return AARCH64_SV_MUL3;
2721 /* These would only trigger for non-power-of-2 lengths. */
2722 if (vl == (max_vl & -4))
2723 return AARCH64_SV_MUL4;
2724 if (vl == (1 << floor_log2 (max_vl)))
2725 return AARCH64_SV_POW2;
2726 if (vl == max_vl)
2727 return AARCH64_SV_ALL;
2728 }
2729 return AARCH64_NUM_SVPATTERNS;
2730 }
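
/* Rough sketch (not part of this file) of the fixed-VL cases above, mapping
   a requested length to the name of the SVE pattern operand; the MUL3,
   MUL4, POW2 and ALL cases depend on the actual number of elements and are
   left out.  Guarded out of the build.  */
#if 0
#include <stdio.h>

static const char *
svpattern_name_example (int vl)
{
  static char buf[8];
  if (vl < 0)
    return "all";                        /* An all-true predicate.  */
  if ((vl >= 1 && vl <= 8)
      || vl == 16 || vl == 32 || vl == 64 || vl == 128 || vl == 256)
    {
      snprintf (buf, sizeof buf, "vl%d", vl);
      return buf;                        /* VL1...VL8, VL16...VL256.  */
    }
  return "<no single pattern>";
}
#endif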
2731
2732 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
2733 bits has the lowest bit set and the upper bits clear. This is the
2734 VNx16BImode equivalent of a PTRUE for controlling elements of
2735 ELT_SIZE bytes. However, because the constant is VNx16BImode,
2736 all bits are significant, even the upper zeros. */
2737
2738 rtx
2739 aarch64_ptrue_all (unsigned int elt_size)
2740 {
2741 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
2742 builder.quick_push (const1_rtx);
2743 for (unsigned int i = 1; i < elt_size; ++i)
2744 builder.quick_push (const0_rtx);
2745 return builder.build ();
2746 }
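
/* For example, aarch64_ptrue_all (4) describes the repeating VNx16BI bit
   pattern {1,0,0,0, 1,0,0,0, ...}, i.e. one significant bit per 4-byte
   element.  A plain-array sketch of the same pattern (not part of this
   file, guarded out of the build):  */
#if 0
static void
ptrue_all_example (unsigned char *bits, unsigned int nbits,
                   unsigned int elt_size)
{
  for (unsigned int i = 0; i < nbits; ++i)
    bits[i] = (i % elt_size == 0);
}
#endif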
2747
2748 /* Return an all-true predicate register of mode MODE. */
2749
2750 rtx
2751 aarch64_ptrue_reg (machine_mode mode)
2752 {
2753 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2754 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2755 return gen_lowpart (mode, reg);
2756 }
2757
2758 /* Return an all-false predicate register of mode MODE. */
2759
2760 rtx
2761 aarch64_pfalse_reg (machine_mode mode)
2762 {
2763 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
2764 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
2765 return gen_lowpart (mode, reg);
2766 }
2767
2768 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
2769 true, or alternatively if we know that the operation predicated by
2770 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is a
2771 aarch64_sve_gp_strictness operand that describes the operation
2772 predicated by PRED1[0]. */
2773
2774 bool
2775 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
2776 {
2777 machine_mode mode = GET_MODE (pred2);
2778 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2779 && mode == GET_MODE (pred1[0])
2780 && aarch64_sve_gp_strictness (pred1[1], SImode));
2781 return (pred1[0] == CONSTM1_RTX (mode)
2782 || INTVAL (pred1[1]) == SVE_RELAXED_GP
2783 || rtx_equal_p (pred1[0], pred2));
2784 }
2785
2786 /* Use a comparison to convert integer vector SRC into MODE, which is
2787 the corresponding SVE predicate mode. Use TARGET for the result
2788 if it's nonnull and convenient. */
2789
2790 static rtx
2791 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
2792 {
2793 machine_mode src_mode = GET_MODE (src);
2794 insn_code icode = code_for_aarch64_pred_cmp (NE, src_mode);
2795 expand_operand ops[4];
2796 create_output_operand (&ops[0], target, mode);
2797 create_input_operand (&ops[1], CONSTM1_RTX (mode), mode);
2798 create_input_operand (&ops[2], src, src_mode);
2799 create_input_operand (&ops[3], CONST0_RTX (src_mode), src_mode);
2800 expand_insn (icode, 4, ops);
2801 return ops[0].value;
2802 }
2803
2804 /* Return true if we can move VALUE into a register using a single
2805 CNT[BHWD] instruction. */
2806
2807 static bool
2808 aarch64_sve_cnt_immediate_p (poly_int64 value)
2809 {
2810 HOST_WIDE_INT factor = value.coeffs[0];
2811 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2812 return (value.coeffs[1] == factor
2813 && IN_RANGE (factor, 2, 16 * 16)
2814 && (factor & 1) == 0
2815 && factor <= 16 * (factor & -factor));
2816 }
2817
2818 /* Likewise for rtx X. */
2819
2820 bool
2821 aarch64_sve_cnt_immediate_p (rtx x)
2822 {
2823 poly_int64 value;
2824 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2825 }
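
/* Illustrative sketch (not part of this file) of the range check above on a
   plain (COEFF0, COEFF1) pair.  For example, (48, 48) is accepted: 48 is
   even, its lowest set bit is 16 and 48 <= 16 * 16, so it can be produced
   by e.g. CNTW with MUL #12.  (6, 4) is rejected because the constant and
   runtime coefficients differ.  Guarded out of the build.  */
#if 0
#include <stdbool.h>

static bool
sve_cnt_immediate_example (long long coeff0, long long coeff1)
{
  long long factor = coeff0;
  return (coeff1 == factor
          && factor >= 2 && factor <= 16 * 16
          && (factor & 1) == 0
          && factor <= 16 * (factor & -factor));
}
#endif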
2826
2827 /* Return the asm string for an instruction with a CNT-like vector size
2828 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2829 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2830 first part of the operands template (the part that comes before the
2831 vector size itself). FACTOR is the number of quadwords.
2832 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2833 If it is zero, we can use any element size. */
2834
2835 static char *
2836 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2837 unsigned int factor,
2838 unsigned int nelts_per_vq)
2839 {
2840 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2841
2842 if (nelts_per_vq == 0)
2843 /* There is some overlap in the ranges of the four CNT instructions.
2844 Here we always use the smallest possible element size, so that the
2845 multiplier is 1 wherever possible. */
2846 nelts_per_vq = factor & -factor;
2847 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2848 gcc_assert (IN_RANGE (shift, 1, 4));
2849 char suffix = "dwhb"[shift - 1];
2850
2851 factor >>= shift;
2852 unsigned int written;
2853 if (factor == 1)
2854 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2855 prefix, suffix, operands);
2856 else
2857 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2858 prefix, suffix, operands, factor);
2859 gcc_assert (written < sizeof (buffer));
2860 return buffer;
2861 }
2862
2863 /* Return the asm string for an instruction with a CNT-like vector size
2864 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2865 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2866 first part of the operands template (the part that comes before the
2867 vector size itself). X is the value of the vector size operand,
2868 as a polynomial integer rtx. */
2869
2870 char *
2871 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2872 rtx x)
2873 {
2874 poly_int64 value = rtx_to_poly_int64 (x);
2875 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2876 return aarch64_output_sve_cnt_immediate (prefix, operands,
2877 value.coeffs[1], 0);
2878 }
2879
2880 /* Return true if we can add VALUE to a register using a single ADDVL
2881 or ADDPL instruction. */
2882
2883 static bool
2884 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2885 {
2886 HOST_WIDE_INT factor = value.coeffs[0];
2887 if (factor == 0 || value.coeffs[1] != factor)
2888 return false;
2889 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2890 and a value of 16 is one vector width. */
2891 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2892 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2893 }
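
/* Illustrative sketch (not part of this file) of the same range check on a
   plain (COEFF0, COEFF1) pair.  For example, (32, 32) is two vector widths
   and can be added with ADDVL #2, while (6, 6) is three predicate widths
   and can be added with ADDPL #3; (17, 17) is neither a multiple of 16 nor
   even, so it needs a CNT-based sequence instead.  Guarded out of the
   build.  */
#if 0
#include <stdbool.h>

static bool
sve_addvl_addpl_immediate_example (long long coeff0, long long coeff1)
{
  long long factor = coeff0;
  if (factor == 0 || coeff1 != factor)
    return false;
  /* ADDVL: whole vectors, immediate in [-32, 31].  */
  if ((factor & 15) == 0 && factor >= -32 * 16 && factor <= 31 * 16)
    return true;
  /* ADDPL: whole predicates, immediate in [-32, 31].  */
  return (factor & 1) == 0 && factor >= -32 * 2 && factor <= 31 * 2;
}
#endif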
2894
2895 /* Likewise for rtx X. */
2896
2897 bool
2898 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2899 {
2900 poly_int64 value;
2901 return (poly_int_rtx_p (x, &value)
2902 && aarch64_sve_addvl_addpl_immediate_p (value));
2903 }
2904
2905 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2906 and storing the result in operand 0. */
2907
2908 char *
2909 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2910 {
2911 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2912 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2913 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2914
2915 /* Use INC or DEC if possible. */
2916 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2917 {
2918 if (aarch64_sve_cnt_immediate_p (offset_value))
2919 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2920 offset_value.coeffs[1], 0);
2921 if (aarch64_sve_cnt_immediate_p (-offset_value))
2922 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2923 -offset_value.coeffs[1], 0);
2924 }
2925
2926 int factor = offset_value.coeffs[1];
2927 if ((factor & 15) == 0)
2928 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2929 else
2930 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2931 return buffer;
2932 }
2933
2934 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2935 instruction. If it is, store the number of elements in each vector
2936 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2937 factor in *FACTOR_OUT (if nonnull). */
2938
2939 bool
2940 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2941 unsigned int *nelts_per_vq_out)
2942 {
2943 rtx elt;
2944 poly_int64 value;
2945
2946 if (!const_vec_duplicate_p (x, &elt)
2947 || !poly_int_rtx_p (elt, &value))
2948 return false;
2949
2950 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2951 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2952 /* There's no vector INCB. */
2953 return false;
2954
2955 HOST_WIDE_INT factor = value.coeffs[0];
2956 if (value.coeffs[1] != factor)
2957 return false;
2958
2959 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2960 if ((factor % nelts_per_vq) != 0
2961 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2962 return false;
2963
2964 if (factor_out)
2965 *factor_out = factor;
2966 if (nelts_per_vq_out)
2967 *nelts_per_vq_out = nelts_per_vq;
2968 return true;
2969 }
2970
2971 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2972 instruction. */
2973
2974 bool
2975 aarch64_sve_inc_dec_immediate_p (rtx x)
2976 {
2977 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2978 }
2979
2980 /* Return the asm template for an SVE vector INC or DEC instruction.
2981 OPERANDS gives the operands before the vector count and X is the
2982 value of the vector count operand itself. */
2983
2984 char *
2985 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2986 {
2987 int factor;
2988 unsigned int nelts_per_vq;
2989 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2990 gcc_unreachable ();
2991 if (factor < 0)
2992 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2993 nelts_per_vq);
2994 else
2995 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2996 nelts_per_vq);
2997 }
2998
2999 static int
3000 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3001 scalar_int_mode mode)
3002 {
3003 int i;
3004 unsigned HOST_WIDE_INT val, val2, mask;
3005 int one_match, zero_match;
3006 int num_insns;
3007
3008 val = INTVAL (imm);
3009
3010 if (aarch64_move_imm (val, mode))
3011 {
3012 if (generate)
3013 emit_insn (gen_rtx_SET (dest, imm));
3014 return 1;
3015 }
3016
3017 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3018 (with XXXX non-zero). In that case check to see if the move can be done in
3019 a smaller mode. */
3020 val2 = val & 0xffffffff;
3021 if (mode == DImode
3022 && aarch64_move_imm (val2, SImode)
3023 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3024 {
3025 if (generate)
3026 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3027
3028 /* Check if we have to emit a second instruction by checking to see
3029 if any of the upper 32 bits of the original DI mode value is set. */
3030 if (val == val2)
3031 return 1;
3032
3033 i = (val >> 48) ? 48 : 32;
3034
3035 if (generate)
3036 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3037 GEN_INT ((val >> i) & 0xffff)));
3038
3039 return 2;
3040 }
3041
3042 if ((val >> 32) == 0 || mode == SImode)
3043 {
3044 if (generate)
3045 {
3046 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3047 if (mode == SImode)
3048 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3049 GEN_INT ((val >> 16) & 0xffff)));
3050 else
3051 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3052 GEN_INT ((val >> 16) & 0xffff)));
3053 }
3054 return 2;
3055 }
3056
3057 /* Remaining cases are all for DImode. */
3058
3059 mask = 0xffff;
3060 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3061 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3062 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3063 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3064
3065 if (zero_match != 2 && one_match != 2)
3066 {
3067 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3068 For a 64-bit bitmask try whether changing 16 bits to all ones or
3069 zeroes creates a valid bitmask. To check any repeated bitmask,
3070 try using 16 bits from the other 32-bit half of val. */
3071
3072 for (i = 0; i < 64; i += 16, mask <<= 16)
3073 {
3074 val2 = val & ~mask;
3075 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3076 break;
3077 val2 = val | mask;
3078 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3079 break;
3080 val2 = val2 & ~mask;
3081 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3082 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3083 break;
3084 }
3085 if (i != 64)
3086 {
3087 if (generate)
3088 {
3089 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3090 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3091 GEN_INT ((val >> i) & 0xffff)));
3092 }
3093 return 2;
3094 }
3095 }
3096
3097 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3098 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3099 otherwise skip zero bits. */
3100
3101 num_insns = 1;
3102 mask = 0xffff;
3103 val2 = one_match > zero_match ? ~val : val;
3104 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3105
3106 if (generate)
3107 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3108 ? (val | ~(mask << i))
3109 : (val & (mask << i)))));
3110 for (i += 16; i < 64; i += 16)
3111 {
3112 if ((val2 & (mask << i)) == 0)
3113 continue;
3114 if (generate)
3115 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3116 GEN_INT ((val >> i) & 0xffff)));
3117 num_insns ++;
3118 }
3119
3120 return num_insns;
3121 }
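
/* Illustrative sketch (not part of this file) of the generic MOVZ/MOVN +
   MOVK fallback at the end of the function above: pick all-zeros or
   all-ones as the "background", let the initial MOV provide the background
   plus one 16-bit chunk, and emit one MOVK for every other chunk that
   differs from the background.  The sketch ignores the bitmask-immediate
   and 32-bit shortcuts handled earlier, so it can over-count for values
   those shortcuts catch.  Guarded out of the build.  */
#if 0
#include <stdint.h>
#include <stdio.h>

static int
count_mov_movk_insns (uint64_t val)
{
  int zero_match = 0, one_match = 0;
  for (int i = 0; i < 64; i += 16)
    {
      uint64_t chunk = (val >> i) & 0xffff;
      zero_match += (chunk == 0);
      one_match += (chunk == 0xffff);
    }
  uint64_t background = one_match > zero_match ? 0xffff : 0;

  int insns = 1;                      /* The initial MOVZ or MOVN.  */
  int covered_by_mov = 0;
  for (int i = 0; i < 64; i += 16)
    if (((val >> i) & 0xffff) != background)
      {
        if (!covered_by_mov)
          covered_by_mov = 1;         /* First chunk rides on the MOV.  */
        else
          insns++;                    /* Everything else needs a MOVK.  */
      }
  return insns;
}

int
main (void)
{
  /* 0x0000123400005678: MOV #0x5678, then MOVK #0x1234, LSL #32 -> 2.  */
  printf ("%d\n", count_mov_movk_insns (0x0000123400005678ull));
  return 0;
}
#endif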
3122
3123 /* Return whether imm is a 128-bit immediate which is simple enough to
3124 expand inline. */
3125 bool
3126 aarch64_mov128_immediate (rtx imm)
3127 {
3128 if (GET_CODE (imm) == CONST_INT)
3129 return true;
3130
3131 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3132
3133 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3134 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3135
3136 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3137 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3138 }
3139
3140
3141 /* Return the number of temporary registers that aarch64_add_offset_1
3142 would need to add OFFSET to a register. */
3143
3144 static unsigned int
3145 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3146 {
3147 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3148 }
3149
3150 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3151 a non-polynomial OFFSET. MODE is the mode of the addition.
3152 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3153 be set and CFA adjustments added to the generated instructions.
3154
3155 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3156 temporary if register allocation is already complete. This temporary
3157 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3158 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3159 the immediate again.
3160
3161 Since this function may be used to adjust the stack pointer, we must
3162 ensure that it cannot cause transient stack deallocation (for example
3163 by first incrementing SP and then decrementing when adjusting by a
3164 large immediate). */
3165
3166 static void
3167 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3168 rtx src, HOST_WIDE_INT offset, rtx temp1,
3169 bool frame_related_p, bool emit_move_imm)
3170 {
3171 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3172 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3173
3174 HOST_WIDE_INT moffset = abs_hwi (offset);
3175 rtx_insn *insn;
3176
3177 if (!moffset)
3178 {
3179 if (!rtx_equal_p (dest, src))
3180 {
3181 insn = emit_insn (gen_rtx_SET (dest, src));
3182 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3183 }
3184 return;
3185 }
3186
3187 /* Single instruction adjustment. */
3188 if (aarch64_uimm12_shift (moffset))
3189 {
3190 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3191 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3192 return;
3193 }
3194
3195 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3196 and either:
3197
3198 a) the offset cannot be loaded by a 16-bit move or
3199 b) there is no spare register into which we can move it. */
3200 if (moffset < 0x1000000
3201 && ((!temp1 && !can_create_pseudo_p ())
3202 || !aarch64_move_imm (moffset, mode)))
3203 {
3204 HOST_WIDE_INT low_off = moffset & 0xfff;
3205
3206 low_off = offset < 0 ? -low_off : low_off;
3207 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3208 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3209 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3210 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3211 return;
3212 }
3213
3214 /* Emit a move immediate if required and an addition/subtraction. */
3215 if (emit_move_imm)
3216 {
3217 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3218 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3219 }
3220 insn = emit_insn (offset < 0
3221 ? gen_sub3_insn (dest, src, temp1)
3222 : gen_add3_insn (dest, src, temp1));
3223 if (frame_related_p)
3224 {
3225 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3226 rtx adj = plus_constant (mode, src, offset);
3227 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3228 }
3229 }
3230
3231 /* Return the number of temporary registers that aarch64_add_offset
3232 would need to move OFFSET into a register or add OFFSET to a register;
3233 ADD_P is true if we want the latter rather than the former. */
3234
3235 static unsigned int
3236 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3237 {
3238 /* This follows the same structure as aarch64_add_offset. */
3239 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3240 return 0;
3241
3242 unsigned int count = 0;
3243 HOST_WIDE_INT factor = offset.coeffs[1];
3244 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3245 poly_int64 poly_offset (factor, factor);
3246 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3247 /* Need one register for the ADDVL/ADDPL result. */
3248 count += 1;
3249 else if (factor != 0)
3250 {
3251 factor = abs (factor);
3252 if (factor > 16 * (factor & -factor))
3253 /* Need one register for the CNT result and one for the multiplication
3254 factor. If necessary, the second temporary can be reused for the
3255 constant part of the offset. */
3256 return 2;
3257 /* Need one register for the CNT result (which might then
3258 be shifted). */
3259 count += 1;
3260 }
3261 return count + aarch64_add_offset_1_temporaries (constant);
3262 }
3263
3264 /* If X can be represented as a poly_int64, return the number
3265 of temporaries that are required to add it to a register.
3266 Return -1 otherwise. */
3267
3268 int
3269 aarch64_add_offset_temporaries (rtx x)
3270 {
3271 poly_int64 offset;
3272 if (!poly_int_rtx_p (x, &offset))
3273 return -1;
3274 return aarch64_offset_temporaries (true, offset);
3275 }
3276
3277 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3278 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3279 be set and CFA adjustments added to the generated instructions.
3280
3281 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3282 temporary if register allocation is already complete. This temporary
3283 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3284 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3285 false to avoid emitting the immediate again.
3286
3287 TEMP2, if nonnull, is a second temporary register that doesn't
3288 overlap either DEST or SRC.
3289
3290 Since this function may be used to adjust the stack pointer, we must
3291 ensure that it cannot cause transient stack deallocation (for example
3292 by first incrementing SP and then decrementing when adjusting by a
3293 large immediate). */
3294
3295 static void
3296 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3297 poly_int64 offset, rtx temp1, rtx temp2,
3298 bool frame_related_p, bool emit_move_imm = true)
3299 {
3300 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3301 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3302 gcc_assert (temp1 == NULL_RTX
3303 || !frame_related_p
3304 || !reg_overlap_mentioned_p (temp1, dest));
3305 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3306
3307 /* Try using ADDVL or ADDPL to add the whole value. */
3308 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3309 {
3310 rtx offset_rtx = gen_int_mode (offset, mode);
3311 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3312 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3313 return;
3314 }
3315
3316 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3317 SVE vector register, over and above the minimum size of 128 bits.
3318 This is equivalent to half the value returned by CNTD with a
3319 vector shape of ALL. */
3320 HOST_WIDE_INT factor = offset.coeffs[1];
3321 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3322
3323 /* Try using ADDVL or ADDPL to add the VG-based part. */
3324 poly_int64 poly_offset (factor, factor);
3325 if (src != const0_rtx
3326 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3327 {
3328 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3329 if (frame_related_p)
3330 {
3331 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3332 RTX_FRAME_RELATED_P (insn) = true;
3333 src = dest;
3334 }
3335 else
3336 {
3337 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3338 src = aarch64_force_temporary (mode, temp1, addr);
3339 temp1 = temp2;
3340 temp2 = NULL_RTX;
3341 }
3342 }
3343 /* Otherwise use a CNT-based sequence. */
3344 else if (factor != 0)
3345 {
3346 /* Use a subtraction if we have a negative factor. */
3347 rtx_code code = PLUS;
3348 if (factor < 0)
3349 {
3350 factor = -factor;
3351 code = MINUS;
3352 }
3353
3354 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3355 into the multiplication. */
3356 rtx val;
3357 int shift = 0;
3358 if (factor & 1)
3359 /* Use a right shift by 1. */
3360 shift = -1;
3361 else
3362 factor /= 2;
3363 HOST_WIDE_INT low_bit = factor & -factor;
3364 if (factor <= 16 * low_bit)
3365 {
3366 if (factor > 16 * 8)
3367 {
3368 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3369 the value with the minimum multiplier and shift it into
3370 position. */
3371 int extra_shift = exact_log2 (low_bit);
3372 shift += extra_shift;
3373 factor >>= extra_shift;
3374 }
3375 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3376 }
3377 else
3378 {
3379 /* Use CNTD, then multiply it by FACTOR. */
3380 val = gen_int_mode (poly_int64 (2, 2), mode);
3381 val = aarch64_force_temporary (mode, temp1, val);
3382
3383 /* Go back to using a negative multiplication factor if we have
3384 no register from which to subtract. */
3385 if (code == MINUS && src == const0_rtx)
3386 {
3387 factor = -factor;
3388 code = PLUS;
3389 }
3390 rtx coeff1 = gen_int_mode (factor, mode);
3391 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3392 val = gen_rtx_MULT (mode, val, coeff1);
3393 }
3394
3395 if (shift > 0)
3396 {
3397 /* Multiply by 1 << SHIFT. */
3398 val = aarch64_force_temporary (mode, temp1, val);
3399 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3400 }
3401 else if (shift == -1)
3402 {
3403 /* Divide by 2. */
3404 val = aarch64_force_temporary (mode, temp1, val);
3405 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3406 }
3407
3408 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3409 if (src != const0_rtx)
3410 {
3411 val = aarch64_force_temporary (mode, temp1, val);
3412 val = gen_rtx_fmt_ee (code, mode, src, val);
3413 }
3414 else if (code == MINUS)
3415 {
3416 val = aarch64_force_temporary (mode, temp1, val);
3417 val = gen_rtx_NEG (mode, val);
3418 }
3419
3420 if (constant == 0 || frame_related_p)
3421 {
3422 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3423 if (frame_related_p)
3424 {
3425 RTX_FRAME_RELATED_P (insn) = true;
3426 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3427 gen_rtx_SET (dest, plus_constant (Pmode, src,
3428 poly_offset)));
3429 }
3430 src = dest;
3431 if (constant == 0)
3432 return;
3433 }
3434 else
3435 {
3436 src = aarch64_force_temporary (mode, temp1, val);
3437 temp1 = temp2;
3438 temp2 = NULL_RTX;
3439 }
3440
3441 emit_move_imm = true;
3442 }
3443
3444 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3445 frame_related_p, emit_move_imm);
3446 }
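
/* Worked example for the decomposition above (illustrative only, not part
   of this file): an offset of 2 * VL + 12 bytes, where VL is the SVE vector
   length in bytes, is the poly_int64 (44, 32).  FACTOR = coeffs[1] = 32 and
   CONSTANT = 44 - 32 = 12, so the VL-dependent part can be added with
   ADDVL xD, xS, #2 and the remainder with ADD xD, xD, #12.  A plain-integer
   sketch of that split, assuming the factor is a whole number of vectors in
   ADDVL range (guarded out of the build):  */
#if 0
static void
split_sve_offset_example (long long coeff0, long long coeff1,
                          long long *addvl, long long *constant)
{
  long long factor = coeff1;      /* Bytes added per extra 128-bit block.  */
  *addvl = factor / 16;           /* ADDVL counts whole vector lengths.  */
  *constant = coeff0 - factor;    /* Left for a normal ADD/SUB.  */
}
#endif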
3447
3448 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3449 than a poly_int64. */
3450
3451 void
3452 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3453 rtx offset_rtx, rtx temp1, rtx temp2)
3454 {
3455 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3456 temp1, temp2, false);
3457 }
3458
3459 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3460 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
3461 if TEMP1 already contains abs (DELTA). */
3462
3463 static inline void
3464 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3465 {
3466 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3467 temp1, temp2, true, emit_move_imm);
3468 }
3469
3470 /* Subtract DELTA from the stack pointer, marking the instructions
3471 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
3472 if nonnull. */
3473
3474 static inline void
3475 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3476 bool emit_move_imm = true)
3477 {
3478 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3479 temp1, temp2, frame_related_p, emit_move_imm);
3480 }
3481
3482 /* Set DEST to (vec_series BASE STEP). */
3483
3484 static void
3485 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3486 {
3487 machine_mode mode = GET_MODE (dest);
3488 scalar_mode inner = GET_MODE_INNER (mode);
3489
3490 /* Each operand can be a register or an immediate in the range [-16, 15]. */
3491 if (!aarch64_sve_index_immediate_p (base))
3492 base = force_reg (inner, base);
3493 if (!aarch64_sve_index_immediate_p (step))
3494 step = force_reg (inner, step);
3495
3496 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3497 }
3498
3499 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
3500 register of mode MODE. Use TARGET for the result if it's nonnull
3501 and convenient.
3502
3503 The two vector modes must have the same element mode. The behavior
3504 is to duplicate architectural lane N of SRC into architectural lanes
3505 N + I * STEP of the result. On big-endian targets, architectural
3506 lane 0 of an Advanced SIMD vector is the last element of the vector
3507 in memory layout, so for big-endian targets this operation has the
3508 effect of reversing SRC before duplicating it. Callers need to
3509 account for this. */
3510
3511 rtx
3512 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
3513 {
3514 machine_mode src_mode = GET_MODE (src);
3515 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
3516 insn_code icode = (BYTES_BIG_ENDIAN
3517 ? code_for_aarch64_vec_duplicate_vq_be (mode)
3518 : code_for_aarch64_vec_duplicate_vq_le (mode));
3519
3520 unsigned int i = 0;
3521 expand_operand ops[3];
3522 create_output_operand (&ops[i++], target, mode);
3523 create_output_operand (&ops[i++], src, src_mode);
3524 if (BYTES_BIG_ENDIAN)
3525 {
3526 /* Create a PARALLEL describing the reversal of SRC. */
3527 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
3528 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
3529 nelts_per_vq - 1, -1);
3530 create_fixed_operand (&ops[i++], sel);
3531 }
3532 expand_insn (icode, i, ops);
3533 return ops[0].value;
3534 }
3535
3536 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
3537 the memory image into DEST. Return true on success. */
3538
3539 static bool
3540 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
3541 {
3542 src = force_const_mem (GET_MODE (src), src);
3543 if (!src)
3544 return false;
3545
3546 /* Make sure that the address is legitimate. */
3547 if (!aarch64_sve_ld1rq_operand_p (src))
3548 {
3549 rtx addr = force_reg (Pmode, XEXP (src, 0));
3550 src = replace_equiv_address (src, addr);
3551 }
3552
3553 machine_mode mode = GET_MODE (dest);
3554 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3555 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3556 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3557 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
3558 return true;
3559 }
3560
3561 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
3562 SVE data mode and isn't a legitimate constant. Use TARGET for the
3563 result if convenient.
3564
3565 The returned register can have whatever mode seems most natural
3566 given the contents of SRC. */
3567
3568 static rtx
3569 aarch64_expand_sve_const_vector (rtx target, rtx src)
3570 {
3571 machine_mode mode = GET_MODE (src);
3572 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3573 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3574 scalar_mode elt_mode = GET_MODE_INNER (mode);
3575 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
3576 unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
3577
3578 if (nelts_per_pattern == 1 && encoded_bits == 128)
3579 {
3580 /* The constant is a duplicated quadword but can't be narrowed
3581 beyond a quadword. Get the memory image of the first quadword
3582 as a 128-bit vector and try using LD1RQ to load it from memory.
3583
3584 The effect for both endiannesses is to load memory lane N into
3585 architectural lanes N + I * STEP of the result. On big-endian
3586 targets, the layout of the 128-bit vector in an Advanced SIMD
3587 register would be different from its layout in an SVE register,
3588 but this 128-bit vector is a memory value only. */
3589 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3590 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
3591 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
3592 return target;
3593 }
3594
3595 if (nelts_per_pattern == 1 && encoded_bits < 128)
3596 {
3597 /* The vector is a repeating sequence of 64 bits or fewer.
3598 See if we can load the sequence using an Advanced SIMD move and
3599 then duplicate it to fill a vector. This is better than using a GPR
3600 move because it keeps everything in the same register file. */
3601 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
3602 rtx_vector_builder builder (vq_mode, npatterns, 1);
3603 for (unsigned int i = 0; i < npatterns; ++i)
3604 {
3605 /* We want memory lane N to go into architectural lane N,
3606 so reverse for big-endian targets. The DUP .Q pattern
3607 has a compensating reverse built-in. */
3608 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
3609 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
3610 }
3611 rtx vq_src = builder.build ();
3612 if (aarch64_simd_valid_immediate (vq_src, NULL))
3613 {
3614 vq_src = force_reg (vq_mode, vq_src);
3615 return aarch64_expand_sve_dupq (target, mode, vq_src);
3616 }
3617
3618 /* Get an integer representation of the repeating part of Advanced
3619 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
3620 which for big-endian targets is lane-swapped wrt a normal
3621 Advanced SIMD vector. This means that for both endiannesses,
3622 memory lane N of SVE vector SRC corresponds to architectural
3623 lane N of a register holding VQ_SRC. This in turn means that
3624 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
3625 as a single 128-bit value) and thus that memory lane 0 of SRC is
3626 in the lsb of the integer. Duplicating the integer therefore
3627 ensures that memory lane N of SRC goes into architectural lane
3628 N + I * STEP of the SVE register. */
3629 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
3630 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
3631 if (elt_value)
3632 {
3633 /* Pretend that we had a vector of INT_MODE to start with. */
3634 elt_mode = int_mode;
3635 mode = aarch64_full_sve_mode (int_mode).require ();
3636
3637 /* If the integer can be moved into a general register by a
3638 single instruction, do that and duplicate the result. */
3639 if (CONST_INT_P (elt_value)
3640 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
3641 {
3642 elt_value = force_reg (elt_mode, elt_value);
3643 return expand_vector_broadcast (mode, elt_value);
3644 }
3645 }
3646 else if (npatterns == 1)
3647 /* We're duplicating a single value, but can't do better than
3648 force it to memory and load from there. This handles things
3649 like symbolic constants. */
3650 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
3651
3652 if (elt_value)
3653 {
3654 /* Load the element from memory if we can, otherwise move it into
3655 a register and use a DUP. */
3656 rtx op = force_const_mem (elt_mode, elt_value);
3657 if (!op)
3658 op = force_reg (elt_mode, elt_value);
3659 return expand_vector_broadcast (mode, op);
3660 }
3661 }
3662
3663 /* Try using INDEX. */
3664 rtx base, step;
3665 if (const_vec_series_p (src, &base, &step))
3666 {
3667 aarch64_expand_vec_series (target, base, step);
3668 return target;
3669 }
3670
3671 /* From here on, it's better to force the whole constant to memory
3672 if we can. */
3673 if (GET_MODE_NUNITS (mode).is_constant ())
3674 return NULL_RTX;
3675
3676 /* Expand each pattern individually. */
3677 gcc_assert (npatterns > 1);
3678 rtx_vector_builder builder;
3679 auto_vec<rtx, 16> vectors (npatterns);
3680 for (unsigned int i = 0; i < npatterns; ++i)
3681 {
3682 builder.new_vector (mode, 1, nelts_per_pattern);
3683 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3684 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3685 vectors.quick_push (force_reg (mode, builder.build ()));
3686 }
3687
3688 /* Use permutes to interleave the separate vectors. */
3689 while (npatterns > 1)
3690 {
3691 npatterns /= 2;
3692 for (unsigned int i = 0; i < npatterns; ++i)
3693 {
3694 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
3695 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3696 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3697 vectors[i] = tmp;
3698 }
3699 }
3700 gcc_assert (vectors[0] == target);
3701 return target;
3702 }
3703
3704 /* Use WHILE to set a predicate register of mode MODE in which the first
3705 VL bits are set and the rest are clear. Use TARGET for the register
3706 if it's nonnull and convenient. */
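/* A sketch of the intended output for MODE == VNx16BImode and VL == 3,
   assuming the limit ends up in x0 (register numbers illustrative):

	mov	x0, #3
	whilelo	p0.b, xzr, x0

   which sets the first three .B lanes of p0 and clears the rest.  */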
3707
3708 static rtx
3709 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
3710 unsigned int vl)
3711 {
3712 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
3713 target = aarch64_target_reg (target, mode);
3714 emit_insn (gen_while_ult (DImode, mode, target, const0_rtx, limit));
3715 return target;
3716 }
3717
3718 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
3719 constant in BUILDER into an SVE predicate register. Return the register
3720 on success, otherwise return null. Use TARGET for the register if
3721 nonnull and convenient. */
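/* For example (illustrative only): a constant in which the first four .S
   elements are true and the rest false can be loaded with
   "ptrue p0.s, vl4", while a leading-true length that has no named PTRUE
   pattern falls back to the WHILE sequence above.  */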
3722
3723 static rtx
3724 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder)
3725 {
3726 if (builder.encoded_nelts () == 1)
3727 /* A PFALSE or a PTRUE .B ALL. */
3728 return aarch64_emit_set_immediate (target, builder);
3729
3730 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
3731 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
3732 {
3733 /* If we can load the constant using PTRUE, use it as-is. */
3734 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
3735 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
3736 return aarch64_emit_set_immediate (target, builder);
3737
3738 /* Otherwise use WHILE to set the first VL bits. */
3739 return aarch64_sve_move_pred_via_while (target, mode, vl);
3740 }
3741
3742 return NULL_RTX;
3743 }
3744
3745 /* Return an SVE predicate register that contains the VNx16BImode
3746 constant in BUILDER, without going through the move expanders.
3747
3748 The returned register can have whatever mode seems most natural
3749 given the contents of BUILDER. Use TARGET for the result if
3750 convenient. */
3751
3752 static rtx
3753 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
3754 {
3755 /* Try loading the constant using pure predicate operations. */
3756 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder))
3757 return res;
3758
3759 /* Try forcing the constant to memory. */
3760 if (builder.full_nelts ().is_constant ())
3761 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
3762 {
3763 target = aarch64_target_reg (target, VNx16BImode);
3764 emit_move_insn (target, mem);
3765 return target;
3766 }
3767
3768 /* The last resort is to load the constant as an integer and then
3769 compare it against zero. Use -1 for set bits in order to increase
3770 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
3771 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
3772 builder.nelts_per_pattern ());
3773 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
3774 int_builder.quick_push (INTVAL (builder.elt (i))
3775 ? constm1_rtx : const0_rtx);
3776 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
3777 int_builder.build ());
3778 }
3779
3780 /* Set DEST to immediate IMM. */
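/* In outline (a summary of the dispatch below): symbolic and label
   immediates in scalar integer modes are classified and loaded via the
   appropriate relocation sequence or the literal pool; SVE predicate
   constants go through aarch64_expand_sve_const_pred; immediates that a
   single instruction can handle are emitted directly; remaining SVE data
   constants go through aarch64_expand_sve_const_vector; other non-integer
   immediates are forced to memory; plain CONST_INTs are handled by
   aarch64_internal_mov_immediate.  */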
3781
3782 void
3783 aarch64_expand_mov_immediate (rtx dest, rtx imm)
3784 {
3785 machine_mode mode = GET_MODE (dest);
3786
3787 /* Check on what type of symbol it is. */
3788 scalar_int_mode int_mode;
3789 if ((GET_CODE (imm) == SYMBOL_REF
3790 || GET_CODE (imm) == LABEL_REF
3791 || GET_CODE (imm) == CONST
3792 || GET_CODE (imm) == CONST_POLY_INT)
3793 && is_a <scalar_int_mode> (mode, &int_mode))
3794 {
3795 rtx mem;
3796 poly_int64 offset;
3797 HOST_WIDE_INT const_offset;
3798 enum aarch64_symbol_type sty;
3799
3800 /* If we have (const (plus symbol offset)), separate out the offset
3801 before we start classifying the symbol. */
3802 rtx base = strip_offset (imm, &offset);
3803
3804 /* We must always add an offset involving VL separately, rather than
3805 folding it into the relocation. */
3806 if (!offset.is_constant (&const_offset))
3807 {
3808 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3809 emit_insn (gen_rtx_SET (dest, imm));
3810 else
3811 {
3812 /* Do arithmetic on 32-bit values if the result is smaller
3813 than that. */
3814 if (partial_subreg_p (int_mode, SImode))
3815 {
3816 /* It is invalid to do symbol calculations in modes
3817 narrower than SImode. */
3818 gcc_assert (base == const0_rtx);
3819 dest = gen_lowpart (SImode, dest);
3820 int_mode = SImode;
3821 }
3822 if (base != const0_rtx)
3823 {
3824 base = aarch64_force_temporary (int_mode, dest, base);
3825 aarch64_add_offset (int_mode, dest, base, offset,
3826 NULL_RTX, NULL_RTX, false);
3827 }
3828 else
3829 aarch64_add_offset (int_mode, dest, base, offset,
3830 dest, NULL_RTX, false);
3831 }
3832 return;
3833 }
3834
3835 sty = aarch64_classify_symbol (base, const_offset);
3836 switch (sty)
3837 {
3838 case SYMBOL_FORCE_TO_MEM:
3839 if (const_offset != 0
3840 && targetm.cannot_force_const_mem (int_mode, imm))
3841 {
3842 gcc_assert (can_create_pseudo_p ());
3843 base = aarch64_force_temporary (int_mode, dest, base);
3844 aarch64_add_offset (int_mode, dest, base, const_offset,
3845 NULL_RTX, NULL_RTX, false);
3846 return;
3847 }
3848
3849 mem = force_const_mem (ptr_mode, imm);
3850 gcc_assert (mem);
3851
3852 /* If we aren't generating PC relative literals, then
3853 we need to expand the literal pool access carefully.
3854 This is something that needs to be done in a number
3855 of places, so could well live as a separate function. */
3856 if (!aarch64_pcrelative_literal_loads)
3857 {
3858 gcc_assert (can_create_pseudo_p ());
3859 base = gen_reg_rtx (ptr_mode);
3860 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3861 if (ptr_mode != Pmode)
3862 base = convert_memory_address (Pmode, base);
3863 mem = gen_rtx_MEM (ptr_mode, base);
3864 }
3865
3866 if (int_mode != ptr_mode)
3867 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3868
3869 emit_insn (gen_rtx_SET (dest, mem));
3870
3871 return;
3872
3873 case SYMBOL_SMALL_TLSGD:
3874 case SYMBOL_SMALL_TLSDESC:
3875 case SYMBOL_SMALL_TLSIE:
3876 case SYMBOL_SMALL_GOT_28K:
3877 case SYMBOL_SMALL_GOT_4G:
3878 case SYMBOL_TINY_GOT:
3879 case SYMBOL_TINY_TLSIE:
3880 if (const_offset != 0)
3881 {
3882 gcc_assert (can_create_pseudo_p ());
3883 base = aarch64_force_temporary (int_mode, dest, base);
3884 aarch64_add_offset (int_mode, dest, base, const_offset,
3885 NULL_RTX, NULL_RTX, false);
3886 return;
3887 }
3888 /* FALLTHRU */
3889
3890 case SYMBOL_SMALL_ABSOLUTE:
3891 case SYMBOL_TINY_ABSOLUTE:
3892 case SYMBOL_TLSLE12:
3893 case SYMBOL_TLSLE24:
3894 case SYMBOL_TLSLE32:
3895 case SYMBOL_TLSLE48:
3896 aarch64_load_symref_appropriately (dest, imm, sty);
3897 return;
3898
3899 default:
3900 gcc_unreachable ();
3901 }
3902 }
3903
3904 if (!CONST_INT_P (imm))
3905 {
3906 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
3907 {
3908 /* Only the low bit of each .H, .S and .D element is defined,
3909 so we can set the upper bits to whatever we like. If the
3910 predicate is all-true in MODE, prefer to set all the undefined
3911 bits as well, so that we can share a single .B predicate for
3912 all modes. */
3913 if (imm == CONSTM1_RTX (mode))
3914 imm = CONSTM1_RTX (VNx16BImode);
3915
3916 /* All methods for constructing predicate modes wider than VNx16BI
3917 will set the upper bits of each element to zero. Expose this
3918 by moving such constants as a VNx16BI, so that all bits are
3919 significant and so that constants for different modes can be
3920 shared. The wider constant will still be available as a
3921 REG_EQUAL note. */
3922 rtx_vector_builder builder;
3923 if (aarch64_get_sve_pred_bits (builder, imm))
3924 {
3925 rtx res = aarch64_expand_sve_const_pred (dest, builder);
3926 if (dest != res)
3927 emit_move_insn (dest, gen_lowpart (mode, res));
3928 return;
3929 }
3930 }
3931
3932 if (GET_CODE (imm) == HIGH
3933 || aarch64_simd_valid_immediate (imm, NULL))
3934 {
3935 emit_insn (gen_rtx_SET (dest, imm));
3936 return;
3937 }
3938
3939 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
3940 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
3941 {
3942 if (dest != res)
3943 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
3944 return;
3945 }
3946
3947 rtx mem = force_const_mem (mode, imm);
3948 gcc_assert (mem);
3949 emit_move_insn (dest, mem);
3950 return;
3951 }
3952
3953 aarch64_internal_mov_immediate (dest, imm, true,
3954 as_a <scalar_int_mode> (mode));
3955 }
3956
3957 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3958 that is known to contain PTRUE. */
3959
3960 void
3961 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3962 {
3963 expand_operand ops[3];
3964 machine_mode mode = GET_MODE (dest);
3965 create_output_operand (&ops[0], dest, mode);
3966 create_input_operand (&ops[1], pred, GET_MODE (pred));
3967 create_input_operand (&ops[2], src, mode);
3968 temporary_volatile_ok v (true);
3969 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3970 }
3971
3972 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3973 operand is in memory. In this case we need to use the predicated LD1
3974 and ST1 instead of LDR and STR, both for correctness on big-endian
3975 targets and because LD1 and ST1 support a wider range of addressing modes.
3976 PRED_MODE is the mode of the predicate.
3977
3978 See the comment at the head of aarch64-sve.md for details about the
3979 big-endian handling. */
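/* For example (illustrative only): a memory-to-memory SVE copy is expanded
   below as a predicated LD1 into a fresh register followed by a predicated
   ST1, both governed by a PTRUE in PRED_MODE.  */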
3980
3981 void
3982 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3983 {
3984 machine_mode mode = GET_MODE (dest);
3985 rtx ptrue = aarch64_ptrue_reg (pred_mode);
3986 if (!register_operand (src, mode)
3987 && !register_operand (dest, mode))
3988 {
3989 rtx tmp = gen_reg_rtx (mode);
3990 if (MEM_P (src))
3991 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3992 else
3993 emit_move_insn (tmp, src);
3994 src = tmp;
3995 }
3996 aarch64_emit_sve_pred_move (dest, ptrue, src);
3997 }
3998
3999 /* Called only on big-endian targets. See whether an SVE vector move
4000 from SRC to DEST is effectively a REV[BHW] instruction, because at
4001 least one operand is a subreg of an SVE vector that has wider or
4002 narrower elements. Return true and emit the instruction if so.
4003
4004 For example:
4005
4006 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4007
4008 represents a VIEW_CONVERT between the following vectors, viewed
4009 in memory order:
4010
4011 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4012 R1: { [0], [1], [2], [3], ... }
4013
4014 The high part of lane X in R2 should therefore correspond to lane X*2
4015 of R1, but the register representations are:
4016
4017 msb lsb
4018 R2: ...... [1].high [1].low [0].high [0].low
4019 R1: ...... [3] [2] [1] [0]
4020
4021 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4022 We therefore need a reverse operation to swap the high and low values
4023 around.
4024
4025 This is purely an optimization. Without it we would spill the
4026 subreg operand to the stack in one mode and reload it in the
4027 other mode, which has the same effect as the REV. */
4028
4029 bool
4030 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4031 {
4032 gcc_assert (BYTES_BIG_ENDIAN);
4033 if (GET_CODE (dest) == SUBREG)
4034 dest = SUBREG_REG (dest);
4035 if (GET_CODE (src) == SUBREG)
4036 src = SUBREG_REG (src);
4037
4038 /* The optimization handles two single SVE REGs with different element
4039 sizes. */
4040 if (!REG_P (dest)
4041 || !REG_P (src)
4042 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4043 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4044 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4045 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4046 return false;
4047
4048 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4049 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4050 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4051 UNSPEC_REV_SUBREG);
4052 emit_insn (gen_rtx_SET (dest, unspec));
4053 return true;
4054 }
4055
4056 /* Return a copy of X with mode MODE, without changing its other
4057 attributes. Unlike gen_lowpart, this doesn't care whether the
4058 mode change is valid. */
4059
4060 static rtx
4061 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4062 {
4063 if (GET_MODE (x) == mode)
4064 return x;
4065
4066 x = shallow_copy_rtx (x);
4067 set_mode_and_regno (x, mode, REGNO (x));
4068 return x;
4069 }
4070
4071 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4072 operands. */
4073
4074 void
4075 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4076 {
4077 /* Decide which REV operation we need. The mode with narrower elements
4078 determines the mode of the operands and the mode with the wider
4079 elements determines the reverse width. */
4080 machine_mode mode_with_wider_elts = GET_MODE (dest);
4081 machine_mode mode_with_narrower_elts = GET_MODE (src);
4082 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4083 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4084 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4085
4086 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
4087 unsigned int unspec;
4088 if (wider_bytes == 8)
4089 unspec = UNSPEC_REV64;
4090 else if (wider_bytes == 4)
4091 unspec = UNSPEC_REV32;
4092 else if (wider_bytes == 2)
4093 unspec = UNSPEC_REV16;
4094 else
4095 gcc_unreachable ();
4096 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
4097
4098 /* Emit:
4099
4100 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
4101 UNSPEC_MERGE_PTRUE))
4102
4103 with the appropriate modes. */
4104 ptrue = gen_lowpart (pred_mode, ptrue);
4105 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
4106 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
4107 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
4108 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
4109 UNSPEC_MERGE_PTRUE);
4110 emit_insn (gen_rtx_SET (dest, src));
4111 }
4112
4113 static bool
4114 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
4115 tree exp ATTRIBUTE_UNUSED)
4116 {
4117 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
4118 return false;
4119
4120 return true;
4121 }
4122
4123 /* Implement TARGET_PASS_BY_REFERENCE. */
4124
4125 static bool
4126 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
4127 machine_mode mode,
4128 const_tree type,
4129 bool named ATTRIBUTE_UNUSED)
4130 {
4131 HOST_WIDE_INT size;
4132 machine_mode dummymode;
4133 int nregs;
4134
4135 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4136 if (mode == BLKmode && type)
4137 size = int_size_in_bytes (type);
4138 else
4139 /* No frontends can create types with variable-sized modes, so we
4140 shouldn't be asked to pass or return them. */
4141 size = GET_MODE_SIZE (mode).to_constant ();
4142
4143 /* Aggregates are passed by reference based on their size. */
4144 if (type && AGGREGATE_TYPE_P (type))
4145 {
4146 size = int_size_in_bytes (type);
4147 }
4148
4149 /* Variable-sized arguments are always passed by reference. */
4150 if (size < 0)
4151 return true;
4152
4153 /* Can this be a candidate to be passed in fp/simd register(s)? */
4154 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4155 &dummymode, &nregs,
4156 NULL))
4157 return false;
4158
4159 /* Arguments which are variable sized or larger than 2 registers are
4160 passed by reference unless they are a homogeneous floating-point
4161 aggregate. */
4162 return size > 2 * UNITS_PER_WORD;
4163 }
4164
4165 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4166 static bool
4167 aarch64_return_in_msb (const_tree valtype)
4168 {
4169 machine_mode dummy_mode;
4170 int dummy_int;
4171
4172 /* Never happens in little-endian mode. */
4173 if (!BYTES_BIG_ENDIAN)
4174 return false;
4175
4176 /* Only composite types smaller than or equal to 16 bytes can
4177 be potentially returned in registers. */
4178 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4179 || int_size_in_bytes (valtype) <= 0
4180 || int_size_in_bytes (valtype) > 16)
4181 return false;
4182
4183 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4184 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4185 is always passed/returned in the least significant bits of fp/simd
4186 register(s). */
4187 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4188 &dummy_mode, &dummy_int, NULL))
4189 return false;
4190
4191 return true;
4192 }
4193
4194 /* Implement TARGET_FUNCTION_VALUE.
4195 Define how to find the value returned by a function. */
4196
4197 static rtx
4198 aarch64_function_value (const_tree type, const_tree func,
4199 bool outgoing ATTRIBUTE_UNUSED)
4200 {
4201 machine_mode mode;
4202 int unsignedp;
4203 int count;
4204 machine_mode ag_mode;
4205
4206 mode = TYPE_MODE (type);
4207 if (INTEGRAL_TYPE_P (type))
4208 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
4209
4210 if (aarch64_return_in_msb (type))
4211 {
4212 HOST_WIDE_INT size = int_size_in_bytes (type);
4213
4214 if (size % UNITS_PER_WORD != 0)
4215 {
4216 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4217 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4218 }
4219 }
4220
4221 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4222 &ag_mode, &count, NULL))
4223 {
4224 if (!aarch64_composite_type_p (type, mode))
4225 {
4226 gcc_assert (count == 1 && mode == ag_mode);
4227 return gen_rtx_REG (mode, V0_REGNUM);
4228 }
4229 else
4230 {
4231 int i;
4232 rtx par;
4233
4234 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
4235 for (i = 0; i < count; i++)
4236 {
4237 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
4238 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
4239 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4240 XVECEXP (par, 0, i) = tmp;
4241 }
4242 return par;
4243 }
4244 }
4245 else
4246 return gen_rtx_REG (mode, R0_REGNUM);
4247 }
4248
4249 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
4250 Return true if REGNO is the number of a hard register in which the values
4251 of called function may come back. */
4252
4253 static bool
4254 aarch64_function_value_regno_p (const unsigned int regno)
4255 {
4256 /* At most 16 bytes can be returned in the general registers. Examples
4257 of 16-byte return values are 128-bit integers and 16-byte small
4258 structures (excluding homogeneous floating-point aggregates). */
4259 if (regno == R0_REGNUM || regno == R1_REGNUM)
4260 return true;
4261
4262 /* Up to four fp/simd registers can return a function value, e.g. a
4263 homogeneous floating-point aggregate having four members. */
4264 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
4265 return TARGET_FLOAT;
4266
4267 return false;
4268 }
4269
4270 /* Implement TARGET_RETURN_IN_MEMORY.
4271
4272 If the type T of the result of a function is such that
4273 void func (T arg)
4274 would require that arg be passed as a value in a register (or set of
4275 registers) according to the parameter passing rules, then the result
4276 is returned in the same registers as would be used for such an
4277 argument. */
4278
4279 static bool
4280 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
4281 {
4282 HOST_WIDE_INT size;
4283 machine_mode ag_mode;
4284 int count;
4285
4286 if (!AGGREGATE_TYPE_P (type)
4287 && TREE_CODE (type) != COMPLEX_TYPE
4288 && TREE_CODE (type) != VECTOR_TYPE)
4289 /* Simple scalar types are always returned in registers. */
4290 return false;
4291
4292 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
4293 type,
4294 &ag_mode,
4295 &count,
4296 NULL))
4297 return false;
4298
4299 /* Types larger than 2 registers are returned in memory. */
4300 size = int_size_in_bytes (type);
4301 return (size < 0 || size > 2 * UNITS_PER_WORD);
4302 }
4303
4304 static bool
4305 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
4306 const_tree type, int *nregs)
4307 {
4308 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4309 return aarch64_vfp_is_call_or_return_candidate (mode,
4310 type,
4311 &pcum->aapcs_vfp_rmode,
4312 nregs,
4313 NULL);
4314 }
4315
4316 /* Given MODE and TYPE of a function argument, return the alignment in
4317 bits. The idea is to suppress any stronger alignment requested by
4318 the user and opt for the natural alignment (specified in AAPCS64 \S
4319 4.1). ABI_BREAK is set to true if the alignment was incorrectly
4320 calculated in versions of GCC prior to GCC-9. This is a helper
4321 function for local use only. */
4322
4323 static unsigned int
4324 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
4325 bool *abi_break)
4326 {
4327 *abi_break = false;
4328 if (!type)
4329 return GET_MODE_ALIGNMENT (mode);
4330
4331 if (integer_zerop (TYPE_SIZE (type)))
4332 return 0;
4333
4334 gcc_assert (TYPE_MODE (type) == mode);
4335
4336 if (!AGGREGATE_TYPE_P (type))
4337 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
4338
4339 if (TREE_CODE (type) == ARRAY_TYPE)
4340 return TYPE_ALIGN (TREE_TYPE (type));
4341
4342 unsigned int alignment = 0;
4343 unsigned int bitfield_alignment = 0;
4344 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
4345 if (TREE_CODE (field) == FIELD_DECL)
4346 {
4347 alignment = std::max (alignment, DECL_ALIGN (field));
4348 if (DECL_BIT_FIELD_TYPE (field))
4349 bitfield_alignment
4350 = std::max (bitfield_alignment,
4351 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
4352 }
4353
4354 if (bitfield_alignment > alignment)
4355 {
4356 *abi_break = true;
4357 return bitfield_alignment;
4358 }
4359
4360 return alignment;
4361 }
4362
4363 /* Layout a function argument according to the AAPCS64 rules. The rule
4364 numbers refer to the rule numbers in the AAPCS64. */
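/* For example (illustrative only): an HFA such as
   struct { float a, b, c, d; } occupies four consecutive SIMD/FP
   registers (rules C.1 - C.5); a 16-byte, 16-byte-aligned integer
   struct starts at an even-numbered general register (rule C.8);
   and an argument that no longer fits in registers is placed on the
   stack (rule C.11 onwards).  */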
4365
4366 static void
4367 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
4368 const_tree type,
4369 bool named ATTRIBUTE_UNUSED)
4370 {
4371 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4372 int ncrn, nvrn, nregs;
4373 bool allocate_ncrn, allocate_nvrn;
4374 HOST_WIDE_INT size;
4375 bool abi_break;
4376
4377 /* We need to do this once per argument. */
4378 if (pcum->aapcs_arg_processed)
4379 return;
4380
4381 pcum->aapcs_arg_processed = true;
4382
4383 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
4384 if (type)
4385 size = int_size_in_bytes (type);
4386 else
4387 /* No frontends can create types with variable-sized modes, so we
4388 shouldn't be asked to pass or return them. */
4389 size = GET_MODE_SIZE (mode).to_constant ();
4390 size = ROUND_UP (size, UNITS_PER_WORD);
4391
4392 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
4393 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
4394 mode,
4395 type,
4396 &nregs);
4397
4398 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
4399 The following code thus handles passing by SIMD/FP registers first. */
4400
4401 nvrn = pcum->aapcs_nvrn;
4402
4403 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
4404 and homogeneous short-vector aggregates (HVA). */
4405 if (allocate_nvrn)
4406 {
4407 if (!TARGET_FLOAT)
4408 aarch64_err_no_fpadvsimd (mode);
4409
4410 if (nvrn + nregs <= NUM_FP_ARG_REGS)
4411 {
4412 pcum->aapcs_nextnvrn = nvrn + nregs;
4413 if (!aarch64_composite_type_p (type, mode))
4414 {
4415 gcc_assert (nregs == 1);
4416 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
4417 }
4418 else
4419 {
4420 rtx par;
4421 int i;
4422 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4423 for (i = 0; i < nregs; i++)
4424 {
4425 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
4426 V0_REGNUM + nvrn + i);
4427 rtx offset = gen_int_mode
4428 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
4429 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
4430 XVECEXP (par, 0, i) = tmp;
4431 }
4432 pcum->aapcs_reg = par;
4433 }
4434 return;
4435 }
4436 else
4437 {
4438 /* C.3 NSRN is set to 8. */
4439 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
4440 goto on_stack;
4441 }
4442 }
4443
4444 ncrn = pcum->aapcs_ncrn;
4445 nregs = size / UNITS_PER_WORD;
4446
4447 /* C6 - C9, though the sign and zero extension semantics are
4448 handled elsewhere. This is the case where the argument fits
4449 entirely in general registers. */
4450 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
4451 {
4452 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
4453
4454 /* C.8 if the argument has an alignment of 16 then the NGRN is
4455 rounded up to the next even number. */
4456 if (nregs == 2
4457 && ncrn % 2
4458 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
4459 comparison is there because for > 16 * BITS_PER_UNIT
4460 alignment nregs should be > 2 and therefore it should be
4461 passed by reference rather than value. */
4462 && (aarch64_function_arg_alignment (mode, type, &abi_break)
4463 == 16 * BITS_PER_UNIT))
4464 {
4465 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4466 inform (input_location, "parameter passing for argument of type "
4467 "%qT changed in GCC 9.1", type);
4468 ++ncrn;
4469 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
4470 }
4471
4472 /* NREGS can be 0 when e.g. an empty structure is to be passed.
4473 A reg is still generated for it, but the caller should be smart
4474 enough not to use it. */
4475 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
4476 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
4477 else
4478 {
4479 rtx par;
4480 int i;
4481
4482 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
4483 for (i = 0; i < nregs; i++)
4484 {
4485 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
4486 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
4487 GEN_INT (i * UNITS_PER_WORD));
4488 XVECEXP (par, 0, i) = tmp;
4489 }
4490 pcum->aapcs_reg = par;
4491 }
4492
4493 pcum->aapcs_nextncrn = ncrn + nregs;
4494 return;
4495 }
4496
4497 /* C.11 */
4498 pcum->aapcs_nextncrn = NUM_ARG_REGS;
4499
4500 /* The argument is passed on the stack; record the needed number of words for
4501 this argument and align the total size if necessary. */
4502 on_stack:
4503 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
4504
4505 if (aarch64_function_arg_alignment (mode, type, &abi_break)
4506 == 16 * BITS_PER_UNIT)
4507 {
4508 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
4509 if (pcum->aapcs_stack_size != new_size)
4510 {
4511 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
4512 inform (input_location, "parameter passing for argument of type "
4513 "%qT changed in GCC 9.1", type);
4514 pcum->aapcs_stack_size = new_size;
4515 }
4516 }
4517 return;
4518 }
4519
4520 /* Implement TARGET_FUNCTION_ARG. */
4521
4522 static rtx
4523 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
4524 const_tree type, bool named)
4525 {
4526 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4527 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
4528
4529 if (mode == VOIDmode)
4530 return NULL_RTX;
4531
4532 aarch64_layout_arg (pcum_v, mode, type, named);
4533 return pcum->aapcs_reg;
4534 }
4535
4536 void
4537 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
4538 const_tree fntype ATTRIBUTE_UNUSED,
4539 rtx libname ATTRIBUTE_UNUSED,
4540 const_tree fndecl ATTRIBUTE_UNUSED,
4541 unsigned n_named ATTRIBUTE_UNUSED)
4542 {
4543 pcum->aapcs_ncrn = 0;
4544 pcum->aapcs_nvrn = 0;
4545 pcum->aapcs_nextncrn = 0;
4546 pcum->aapcs_nextnvrn = 0;
4547 pcum->pcs_variant = ARM_PCS_AAPCS64;
4548 pcum->aapcs_reg = NULL_RTX;
4549 pcum->aapcs_arg_processed = false;
4550 pcum->aapcs_stack_words = 0;
4551 pcum->aapcs_stack_size = 0;
4552
4553 if (!TARGET_FLOAT
4554 && fndecl && TREE_PUBLIC (fndecl)
4555 && fntype && fntype != error_mark_node)
4556 {
4557 const_tree type = TREE_TYPE (fntype);
4558 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
4559 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
4560 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4561 &mode, &nregs, NULL))
4562 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4563 }
4564 return;
4565 }
4566
4567 static void
4568 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4569 machine_mode mode,
4570 const_tree type,
4571 bool named)
4572 {
4573 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4574 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4575 {
4576 aarch64_layout_arg (pcum_v, mode, type, named);
4577 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4578 != (pcum->aapcs_stack_words != 0));
4579 pcum->aapcs_arg_processed = false;
4580 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4581 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4582 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4583 pcum->aapcs_stack_words = 0;
4584 pcum->aapcs_reg = NULL_RTX;
4585 }
4586 }
4587
4588 bool
4589 aarch64_function_arg_regno_p (unsigned regno)
4590 {
4591 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4592 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4593 }
4594
4595 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
4596 PARM_BOUNDARY bits of alignment, but will be given anything up
4597 to STACK_BOUNDARY bits if the type requires it. This makes sure
4598 that both before and after the layout of each argument, the Next
4599 Stacked Argument Address (NSAA) will have a minimum alignment of
4600 8 bytes. */
4601
4602 static unsigned int
4603 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4604 {
4605 bool abi_break;
4606 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4607 &abi_break);
4608 if (abi_break && warn_psabi)
4609 inform (input_location, "parameter passing for argument of type "
4610 "%qT changed in GCC 9.1", type);
4611
4612 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4613 }
4614
4615 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
4616
4617 static fixed_size_mode
4618 aarch64_get_reg_raw_mode (int regno)
4619 {
4620 if (TARGET_SVE && FP_REGNUM_P (regno))
4621 /* Don't use the SVE part of the register for __builtin_apply and
4622 __builtin_return. The SVE registers aren't used by the normal PCS,
4623 so using them there would be a waste of time. The PCS extensions
4624 for SVE types are fundamentally incompatible with the
4625 __builtin_return/__builtin_apply interface. */
4626 return as_a <fixed_size_mode> (V16QImode);
4627 return default_get_reg_raw_mode (regno);
4628 }
4629
4630 /* Implement TARGET_FUNCTION_ARG_PADDING.
4631
4632 Small aggregate types are placed in the lowest memory address.
4633
4634 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
4635
4636 static pad_direction
4637 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4638 {
4639 /* On little-endian targets, the least significant byte of every stack
4640 argument is passed at the lowest byte address of the stack slot. */
4641 if (!BYTES_BIG_ENDIAN)
4642 return PAD_UPWARD;
4643
4644 /* Otherwise, integral, floating-point and pointer types are padded downward:
4645 the least significant byte of a stack argument is passed at the highest
4646 byte address of the stack slot. */
4647 if (type
4648 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4649 || POINTER_TYPE_P (type))
4650 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4651 return PAD_DOWNWARD;
4652
4653 /* Everything else padded upward, i.e. data in first byte of stack slot. */
4654 return PAD_UPWARD;
4655 }
4656
4657 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4658
4659 It specifies padding for the last (possibly the only)
4660 element of a block move between registers and memory.
4661 Assuming the block is in memory, padding upward means that
4662 the last element is padded after its most significant byte,
4663 while with downward padding the last element is padded at
4664 its least significant byte side.
4665
4666 Small aggregates and small complex types are always padded
4667 upwards.
4668
4669 We don't need to worry about homogeneous floating-point or
4670 short-vector aggregates; their move is not affected by the
4671 padding direction determined here. Regardless of endianness,
4672 each element of such an aggregate is put in the least
4673 significant bits of a fp/simd register.
4674
4675 Return !BYTES_BIG_ENDIAN if the least significant byte of the
4676 register has useful data, and return the opposite if the most
4677 significant byte does. */
4678
4679 bool
4680 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4681 bool first ATTRIBUTE_UNUSED)
4682 {
4683
4684 /* Small composite types are always padded upward. */
4685 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4686 {
4687 HOST_WIDE_INT size;
4688 if (type)
4689 size = int_size_in_bytes (type);
4690 else
4691 /* No frontends can create types with variable-sized modes, so we
4692 shouldn't be asked to pass or return them. */
4693 size = GET_MODE_SIZE (mode).to_constant ();
4694 if (size < 2 * UNITS_PER_WORD)
4695 return true;
4696 }
4697
4698 /* Otherwise, use the default padding. */
4699 return !BYTES_BIG_ENDIAN;
4700 }
4701
4702 static scalar_int_mode
4703 aarch64_libgcc_cmp_return_mode (void)
4704 {
4705 return SImode;
4706 }
4707
4708 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4709
4710 /* We use the 12-bit shifted immediate arithmetic instructions so values
4711 must be multiple of (1 << 12), i.e. 4096. */
4712 #define ARITH_FACTOR 4096
4713
4714 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4715 #error Cannot use simple address calculation for stack probing
4716 #endif
4717
4718 /* The pair of scratch registers used for stack probing. */
4719 #define PROBE_STACK_FIRST_REG R9_REGNUM
4720 #define PROBE_STACK_SECOND_REG R10_REGNUM
4721
4722 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4723 inclusive. These are offsets from the current stack pointer. */
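/* A sketch of the simplest case below (SIZE <= PROBE_INTERVAL), with
   register numbers purely illustrative:

	sub	x9, sp, #(first + base)
	str	xzr, [x9, #(base - size)]

   where BASE is SIZE rounded up to a multiple of 4096 so that the
   subtraction can use a shifted 12-bit immediate.  */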
4724
4725 static void
4726 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4727 {
4728 HOST_WIDE_INT size;
4729 if (!poly_size.is_constant (&size))
4730 {
4731 sorry ("stack probes for SVE frames");
4732 return;
4733 }
4734
4735 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4736
4737 /* See the same assertion on PROBE_INTERVAL above. */
4738 gcc_assert ((first % ARITH_FACTOR) == 0);
4739
4740 /* See if we have a constant small number of probes to generate. If so,
4741 that's the easy case. */
4742 if (size <= PROBE_INTERVAL)
4743 {
4744 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4745
4746 emit_set_insn (reg1,
4747 plus_constant (Pmode,
4748 stack_pointer_rtx, -(first + base)));
4749 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4750 }
4751
4752 /* The run-time loop is made up of 8 insns in the generic case while the
4753 compile-time sequence is made up of 4+2*(n-2) insns for n intervals. */
4754 else if (size <= 4 * PROBE_INTERVAL)
4755 {
4756 HOST_WIDE_INT i, rem;
4757
4758 emit_set_insn (reg1,
4759 plus_constant (Pmode,
4760 stack_pointer_rtx,
4761 -(first + PROBE_INTERVAL)));
4762 emit_stack_probe (reg1);
4763
4764 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4765 it exceeds SIZE. If only two probes are needed, this will not
4766 generate any code. Then probe at FIRST + SIZE. */
4767 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4768 {
4769 emit_set_insn (reg1,
4770 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4771 emit_stack_probe (reg1);
4772 }
4773
4774 rem = size - (i - PROBE_INTERVAL);
4775 if (rem > 256)
4776 {
4777 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4778
4779 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4780 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4781 }
4782 else
4783 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4784 }
4785
4786 /* Otherwise, do the same as above, but in a loop. Note that we must be
4787 extra careful with variables wrapping around because we might be at
4788 the very top (or the very bottom) of the address space and we have
4789 to be able to handle this case properly; in particular, we use an
4790 equality test for the loop condition. */
4791 else
4792 {
4793 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4794
4795 /* Step 1: round SIZE to the previous multiple of the interval. */
4796
4797 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4798
4799
4800 /* Step 2: compute initial and final value of the loop counter. */
4801
4802 /* TEST_ADDR = SP + FIRST. */
4803 emit_set_insn (reg1,
4804 plus_constant (Pmode, stack_pointer_rtx, -first));
4805
4806 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
4807 HOST_WIDE_INT adjustment = - (first + rounded_size);
4808 if (! aarch64_uimm12_shift (adjustment))
4809 {
4810 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4811 true, Pmode);
4812 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4813 }
4814 else
4815 emit_set_insn (reg2,
4816 plus_constant (Pmode, stack_pointer_rtx, adjustment));
4817
4818 /* Step 3: the loop
4819
4820 do
4821 {
4822 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4823 probe at TEST_ADDR
4824 }
4825 while (TEST_ADDR != LAST_ADDR)
4826
4827 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4828 until it is equal to ROUNDED_SIZE. */
4829
4830 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4831
4832
4833 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4834 that SIZE is equal to ROUNDED_SIZE. */
4835
4836 if (size != rounded_size)
4837 {
4838 HOST_WIDE_INT rem = size - rounded_size;
4839
4840 if (rem > 256)
4841 {
4842 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4843
4844 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4845 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4846 }
4847 else
4848 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4849 }
4850 }
4851
4852 /* Make sure nothing is scheduled before we are done. */
4853 emit_insn (gen_blockage ());
4854 }
4855
4856 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
4857 absolute addresses. */
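/* A sketch of the loop printed by the code below (label and register
   numbers illustrative):

	.LPSRL0:
	sub	x9, x9, #interval
	str	xzr, [x9, #offset]
	cmp	x9, x10
	b.ne	.LPSRL0

   where OFFSET is STACK_CLASH_CALLER_GUARD when stack clash protection
   is enabled and 0 otherwise.  */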
4858
4859 const char *
4860 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4861 {
4862 static int labelno = 0;
4863 char loop_lab[32];
4864 rtx xops[2];
4865
4866 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4867
4868 /* Loop. */
4869 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4870
4871 HOST_WIDE_INT stack_clash_probe_interval
4872 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4873
4874 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
4875 xops[0] = reg1;
4876 HOST_WIDE_INT interval;
4877 if (flag_stack_clash_protection)
4878 interval = stack_clash_probe_interval;
4879 else
4880 interval = PROBE_INTERVAL;
4881
4882 gcc_assert (aarch64_uimm12_shift (interval));
4883 xops[1] = GEN_INT (interval);
4884
4885 output_asm_insn ("sub\t%0, %0, %1", xops);
4886
4887 /* If doing stack clash protection then we probe up by the ABI specified
4888 amount. We do this because we're dropping full pages at a time in the
4889 loop. But if we're doing non-stack clash probing, probe at SP 0. */
4890 if (flag_stack_clash_protection)
4891 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4892 else
4893 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4894
4895 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
4896 by this amount for each iteration. */
4897 output_asm_insn ("str\txzr, [%0, %1]", xops);
4898
4899 /* Test if TEST_ADDR == LAST_ADDR. */
4900 xops[1] = reg2;
4901 output_asm_insn ("cmp\t%0, %1", xops);
4902
4903 /* Branch. */
4904 fputs ("\tb.ne\t", asm_out_file);
4905 assemble_name_raw (asm_out_file, loop_lab);
4906 fputc ('\n', asm_out_file);
4907
4908 return "";
4909 }
4910
4911 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4912 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4913 of GUARD_SIZE. When a probe is emitted it is done at most
4914 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4915 at most MIN_PROBE_THRESHOLD. By the end of this function
4916 BASE = BASE - ADJUSTMENT. */
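/* A sketch of the code printed by the function below (labels illustrative):

	.SVLPSPL0:
	cmp	adjustment, #residual_probe_guard
	b.lt	.SVLPEND0
	sub	base, base, #residual_probe_guard
	str	xzr, [base, #0]
	sub	adjustment, adjustment, #residual_probe_guard
	b	.SVLPSPL0
	.SVLPEND0:
	sub	base, base, adjustment  */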
4917
4918 const char *
4919 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4920 rtx min_probe_threshold, rtx guard_size)
4921 {
4922 /* This function is not allowed to use any instruction generation function
4923 like gen_ and friends. If you do you'll likely ICE during CFG validation,
4924 so instead emit the code you want using output_asm_insn. */
4925 gcc_assert (flag_stack_clash_protection);
4926 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4927 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4928
4929 /* The minimum required allocation before the residual requires probing. */
4930 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4931
4932 /* Clamp the value down to the nearest value that can be used with a cmp. */
4933 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4934 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4935
4936 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4937 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4938
4939 static int labelno = 0;
4940 char loop_start_lab[32];
4941 char loop_end_lab[32];
4942 rtx xops[2];
4943
4944 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4945 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4946
4947 /* Emit loop start label. */
4948 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4949
4950 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
4951 xops[0] = adjustment;
4952 xops[1] = probe_offset_value_rtx;
4953 output_asm_insn ("cmp\t%0, %1", xops);
4954
4955 /* Branch to end if not enough adjustment to probe. */
4956 fputs ("\tb.lt\t", asm_out_file);
4957 assemble_name_raw (asm_out_file, loop_end_lab);
4958 fputc ('\n', asm_out_file);
4959
4960 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
4961 xops[0] = base;
4962 xops[1] = probe_offset_value_rtx;
4963 output_asm_insn ("sub\t%0, %0, %1", xops);
4964
4965 /* Probe at BASE. */
4966 xops[1] = const0_rtx;
4967 output_asm_insn ("str\txzr, [%0, %1]", xops);
4968
4969 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
4970 xops[0] = adjustment;
4971 xops[1] = probe_offset_value_rtx;
4972 output_asm_insn ("sub\t%0, %0, %1", xops);
4973
4974 /* Branch to start if still more bytes to allocate. */
4975 fputs ("\tb\t", asm_out_file);
4976 assemble_name_raw (asm_out_file, loop_start_lab);
4977 fputc ('\n', asm_out_file);
4978
4979 /* No more probes needed: leave the loop. */
4980 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4981
4982 /* BASE = BASE - ADJUSTMENT. */
4983 xops[0] = base;
4984 xops[1] = adjustment;
4985 output_asm_insn ("sub\t%0, %0, %1", xops);
4986 return "";
4987 }
4988
4989 /* Determine whether a frame chain needs to be generated. */
4990 static bool
4991 aarch64_needs_frame_chain (void)
4992 {
4993 /* Force a frame chain for EH returns so the return address is at FP+8. */
4994 if (frame_pointer_needed || crtl->calls_eh_return)
4995 return true;
4996
4997 /* A leaf function cannot have calls or write LR. */
4998 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4999
5000 /* Don't use a frame chain in leaf functions if leaf frame pointers
5001 are disabled. */
5002 if (flag_omit_leaf_frame_pointer && is_leaf)
5003 return false;
5004
5005 return aarch64_use_frame_pointer;
5006 }
5007
5008 /* Mark the registers that need to be saved by the callee and calculate
5009 the size of the callee-saved registers area and frame record (both FP
5010 and LR may be omitted). */
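/* The code below ends by picking one of four allocation strategies,
   summarized here (see the inline comments for the exact sequences):

   1. Small frame, no outgoing arguments: allocate everything with the
      write-back of the first register push (callee_adjust only).
   2. Small outgoing argument area: a single SP subtraction, with the
      saves addressed from SP (initial_adjust + callee_offset).
   3. Large outgoing arguments but a small local area: push the saves
      with write-back, then drop SP for the outgoing arguments
      (callee_adjust + final_adjust).
   4. Otherwise: two SP subtractions around the saves, using the frame
      pointer (initial_adjust + final_adjust).  */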
5011 static void
5012 aarch64_layout_frame (void)
5013 {
5014 HOST_WIDE_INT offset = 0;
5015 int regno, last_fp_reg = INVALID_REGNUM;
5016 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5017
5018 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
5019
5020 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5021 the mid-end is doing. */
5022 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5023
5024 #define SLOT_NOT_REQUIRED (-2)
5025 #define SLOT_REQUIRED (-1)
5026
5027 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
5028 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
5029
5030 /* If this is a non-leaf simd function with calls we assume that
5031 at least one of those calls is to a non-simd function and thus
5032 we must save V8 to V23 in the prologue. */
5033
5034 if (simd_function && !crtl->is_leaf)
5035 {
5036 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5037 if (FP_SIMD_SAVED_REGNUM_P (regno))
5038 df_set_regs_ever_live (regno, true);
5039 }
5040
5041 /* First mark all the registers that really need to be saved... */
5042 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5043 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5044
5045 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5046 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5047
5048 /* ... that includes the eh data registers (if needed)... */
5049 if (crtl->calls_eh_return)
5050 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5051 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
5052 = SLOT_REQUIRED;
5053
5054 /* ... and any callee saved register that dataflow says is live. */
5055 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5056 if (df_regs_ever_live_p (regno)
5057 && (regno == R30_REGNUM
5058 || !call_used_regs[regno]))
5059 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5060
5061 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5062 if (df_regs_ever_live_p (regno)
5063 && (!call_used_regs[regno]
5064 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
5065 {
5066 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
5067 last_fp_reg = regno;
5068 }
5069
5070 if (cfun->machine->frame.emit_frame_chain)
5071 {
5072 /* FP and LR are placed in the linkage record. */
5073 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
5074 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
5075 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
5076 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
5077 offset = 2 * UNITS_PER_WORD;
5078 }
5079
5080 /* With stack-clash, LR must be saved in non-leaf functions. */
5081 gcc_assert (crtl->is_leaf
5082 || (cfun->machine->frame.reg_offset[R30_REGNUM]
5083 != SLOT_NOT_REQUIRED));
5084
5085 /* Now assign stack slots for them. */
5086 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5087 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5088 {
5089 cfun->machine->frame.reg_offset[regno] = offset;
5090 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5091 cfun->machine->frame.wb_candidate1 = regno;
5092 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
5093 cfun->machine->frame.wb_candidate2 = regno;
5094 offset += UNITS_PER_WORD;
5095 }
5096
5097 HOST_WIDE_INT max_int_offset = offset;
5098 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5099 bool has_align_gap = offset != max_int_offset;
5100
5101 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5102 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
5103 {
5104 /* If there is an alignment gap between integer and fp callee-saves,
5105 allocate the last fp register to it if possible. */
5106 if (regno == last_fp_reg
5107 && has_align_gap
5108 && !simd_function
5109 && (offset & 8) == 0)
5110 {
5111 cfun->machine->frame.reg_offset[regno] = max_int_offset;
5112 break;
5113 }
5114
5115 cfun->machine->frame.reg_offset[regno] = offset;
5116 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
5117 cfun->machine->frame.wb_candidate1 = regno;
5118 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
5119 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
5120 cfun->machine->frame.wb_candidate2 = regno;
5121 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
5122 }
5123
5124 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
5125
5126 cfun->machine->frame.saved_regs_size = offset;
5127
5128 HOST_WIDE_INT varargs_and_saved_regs_size
5129 = offset + cfun->machine->frame.saved_varargs_size;
5130
5131 cfun->machine->frame.hard_fp_offset
5132 = aligned_upper_bound (varargs_and_saved_regs_size
5133 + get_frame_size (),
5134 STACK_BOUNDARY / BITS_PER_UNIT);
5135
5136 /* Both these values are already aligned. */
5137 gcc_assert (multiple_p (crtl->outgoing_args_size,
5138 STACK_BOUNDARY / BITS_PER_UNIT));
5139 cfun->machine->frame.frame_size
5140 = (cfun->machine->frame.hard_fp_offset
5141 + crtl->outgoing_args_size);
5142
5143 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
5144
5145 cfun->machine->frame.initial_adjust = 0;
5146 cfun->machine->frame.final_adjust = 0;
5147 cfun->machine->frame.callee_adjust = 0;
5148 cfun->machine->frame.callee_offset = 0;
5149
5150 HOST_WIDE_INT max_push_offset = 0;
5151 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
5152 max_push_offset = 512;
5153 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
5154 max_push_offset = 256;
5155
5156 HOST_WIDE_INT const_size, const_fp_offset;
5157 if (cfun->machine->frame.frame_size.is_constant (&const_size)
5158 && const_size < max_push_offset
5159 && known_eq (crtl->outgoing_args_size, 0))
5160 {
5161 /* Simple, small frame with no outgoing arguments:
5162 stp reg1, reg2, [sp, -frame_size]!
5163 stp reg3, reg4, [sp, 16] */
5164 cfun->machine->frame.callee_adjust = const_size;
5165 }
5166 else if (known_lt (crtl->outgoing_args_size
5167 + cfun->machine->frame.saved_regs_size, 512)
5168 && !(cfun->calls_alloca
5169 && known_lt (cfun->machine->frame.hard_fp_offset,
5170 max_push_offset)))
5171 {
5172 /* Frame with small outgoing arguments:
5173 sub sp, sp, frame_size
5174 stp reg1, reg2, [sp, outgoing_args_size]
5175 stp reg3, reg4, [sp, outgoing_args_size + 16] */
5176 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
5177 cfun->machine->frame.callee_offset
5178 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
5179 }
5180 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
5181 && const_fp_offset < max_push_offset)
5182 {
5183 /* Frame with large outgoing arguments but a small local area:
5184 stp reg1, reg2, [sp, -hard_fp_offset]!
5185 stp reg3, reg4, [sp, 16]
5186 sub sp, sp, outgoing_args_size */
5187 cfun->machine->frame.callee_adjust = const_fp_offset;
5188 cfun->machine->frame.final_adjust
5189 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
5190 }
5191 else
5192 {
5193 /* Frame with large local area and outgoing arguments using frame pointer:
5194 sub sp, sp, hard_fp_offset
5195 stp x29, x30, [sp, 0]
5196 add x29, sp, 0
5197 stp reg3, reg4, [sp, 16]
5198 sub sp, sp, outgoing_args_size */
5199 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
5200 cfun->machine->frame.final_adjust
5201 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
5202 }
5203
5204 cfun->machine->frame.laid_out = true;
5205 }
5206
5207 /* Return true if the register REGNO is saved on entry to
5208 the current function. */
5209
5210 static bool
5211 aarch64_register_saved_on_entry (int regno)
5212 {
5213 return cfun->machine->frame.reg_offset[regno] >= 0;
5214 }
5215
5216 /* Return the next register, from REGNO up to LIMIT, that the callee
5217 needs to save. */
5218
5219 static unsigned
5220 aarch64_next_callee_save (unsigned regno, unsigned limit)
5221 {
5222 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
5223 regno ++;
5224 return regno;
5225 }
5226
5227 /* Push the register number REGNO of mode MODE to the stack with write-back
5228 adjusting the stack by ADJUSTMENT. */
5229
5230 static void
5231 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
5232 HOST_WIDE_INT adjustment)
5233 {
5234 rtx base_rtx = stack_pointer_rtx;
5235 rtx insn, reg, mem;
5236
5237 reg = gen_rtx_REG (mode, regno);
5238 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
5239 plus_constant (Pmode, base_rtx, -adjustment));
5240 mem = gen_frame_mem (mode, mem);
5241
5242 insn = emit_move_insn (mem, reg);
5243 RTX_FRAME_RELATED_P (insn) = 1;
5244 }
5245
5246 /* Generate and return an instruction to store the pair of registers
5247 REG and REG2 of mode MODE to location BASE with write-back adjusting
5248 the stack location BASE by ADJUSTMENT. */
5249
5250 static rtx
5251 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5252 HOST_WIDE_INT adjustment)
5253 {
5254 switch (mode)
5255 {
5256 case E_DImode:
5257 return gen_storewb_pairdi_di (base, base, reg, reg2,
5258 GEN_INT (-adjustment),
5259 GEN_INT (UNITS_PER_WORD - adjustment));
5260 case E_DFmode:
5261 return gen_storewb_pairdf_di (base, base, reg, reg2,
5262 GEN_INT (-adjustment),
5263 GEN_INT (UNITS_PER_WORD - adjustment));
5264 case E_TFmode:
5265 return gen_storewb_pairtf_di (base, base, reg, reg2,
5266 GEN_INT (-adjustment),
5267 GEN_INT (UNITS_PER_VREG - adjustment));
5268 default:
5269 gcc_unreachable ();
5270 }
5271 }
5272
5273 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
5274 stack pointer by ADJUSTMENT. */
5275
5276 static void
5277 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
5278 {
5279 rtx_insn *insn;
5280 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5281
5282 if (regno2 == INVALID_REGNUM)
5283 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
5284
5285 rtx reg1 = gen_rtx_REG (mode, regno1);
5286 rtx reg2 = gen_rtx_REG (mode, regno2);
5287
5288 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
5289 reg2, adjustment));
5290 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
5291 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5292 RTX_FRAME_RELATED_P (insn) = 1;
5293 }
5294
5295 /* Generate and return an instruction to load the pair of registers REG and
5296 REG2 of mode MODE from BASE, adjusting BASE by ADJUSTMENT afterwards. */
5297
5298 static rtx
5299 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
5300 HOST_WIDE_INT adjustment)
5301 {
5302 switch (mode)
5303 {
5304 case E_DImode:
5305 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
5306 GEN_INT (UNITS_PER_WORD));
5307 case E_DFmode:
5308 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
5309 GEN_INT (UNITS_PER_WORD));
5310 case E_TFmode:
5311 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
5312 GEN_INT (UNITS_PER_VREG));
5313 default:
5314 gcc_unreachable ();
5315 }
5316 }
5317
5318 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
5319 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
5320 into CFI_OPS. */
5321
5322 static void
5323 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
5324 rtx *cfi_ops)
5325 {
5326 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
5327 rtx reg1 = gen_rtx_REG (mode, regno1);
5328
5329 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
5330
5331 if (regno2 == INVALID_REGNUM)
5332 {
5333 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
5334 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
5335 emit_move_insn (reg1, gen_frame_mem (mode, mem));
5336 }
5337 else
5338 {
5339 rtx reg2 = gen_rtx_REG (mode, regno2);
5340 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5341 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
5342 reg2, adjustment));
5343 }
5344 }
5345
5346 /* Generate and return a store pair instruction of mode MODE to store
5347 register REG1 to MEM1 and register REG2 to MEM2. */
5348
5349 static rtx
5350 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
5351 rtx reg2)
5352 {
5353 switch (mode)
5354 {
5355 case E_DImode:
5356 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
5357
5358 case E_DFmode:
5359 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
5360
5361 case E_TFmode:
5362 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
5363
5364 default:
5365 gcc_unreachable ();
5366 }
5367 }
5368
5369 /* Generate and return a load pair instruction of mode MODE to load register
5370 REG1 from MEM1 and register REG2 from MEM2. */
5371
5372 static rtx
5373 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
5374 rtx mem2)
5375 {
5376 switch (mode)
5377 {
5378 case E_DImode:
5379 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
5380
5381 case E_DFmode:
5382 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
5383
5384 case E_TFmode:
5385 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
5386
5387 default:
5388 gcc_unreachable ();
5389 }
5390 }
5391
5392 /* Return TRUE if return address signing should be enabled for the current
5393 function, otherwise return FALSE. */
5394
5395 bool
5396 aarch64_return_address_signing_enabled (void)
5397 {
5398 /* This function should only be called after the frame is laid out. */
5399 gcc_assert (cfun->machine->frame.laid_out);
5400
5401 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
5402 function if its LR is pushed onto the stack. */
5403 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
5404 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
5405 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
5406 }
5407
5408 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
5409 bool
5410 aarch64_bti_enabled (void)
5411 {
5412 return (aarch64_enable_bti == 1);
5413 }
5414
5415 /* Emit code to save the callee-saved registers from register number START
5416 to LIMIT to the stack at the location starting at offset START_OFFSET,
5417 skipping any write-back candidates if SKIP_WB is true. */
5418
5419 static void
5420 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
5421 unsigned start, unsigned limit, bool skip_wb)
5422 {
5423 rtx_insn *insn;
5424 unsigned regno;
5425 unsigned regno2;
5426
5427 for (regno = aarch64_next_callee_save (start, limit);
5428 regno <= limit;
5429 regno = aarch64_next_callee_save (regno + 1, limit))
5430 {
5431 rtx reg, mem;
5432 poly_int64 offset;
5433 int offset_diff;
5434
5435 if (skip_wb
5436 && (regno == cfun->machine->frame.wb_candidate1
5437 || regno == cfun->machine->frame.wb_candidate2))
5438 continue;
5439
5440 if (cfun->machine->reg_is_wrapped_separately[regno])
5441 continue;
5442
5443 reg = gen_rtx_REG (mode, regno);
5444 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5445 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5446 offset));
5447
5448 regno2 = aarch64_next_callee_save (regno + 1, limit);
5449 offset_diff = cfun->machine->frame.reg_offset[regno2]
5450 - cfun->machine->frame.reg_offset[regno];
5451
5452 if (regno2 <= limit
5453 && !cfun->machine->reg_is_wrapped_separately[regno2]
5454 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5455 {
5456 rtx reg2 = gen_rtx_REG (mode, regno2);
5457 rtx mem2;
5458
5459 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5460 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
5461 offset));
5462 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
5463 reg2));
5464
5465 /* The first part of a frame-related parallel insn is
5466 always assumed to be relevant to the frame
5467 calculations; subsequent parts are only
5468 frame-related if explicitly marked. */
5469 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
5470 regno = regno2;
5471 }
5472 else
5473 insn = emit_move_insn (mem, reg);
5474
5475 RTX_FRAME_RELATED_P (insn) = 1;
5476 }
5477 }
5478
5479 /* Emit code to restore the callee registers of mode MODE from register
5480 number START up to and including LIMIT. Restore from the stack offset
5481 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
5482 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
5483
5484 static void
5485 aarch64_restore_callee_saves (machine_mode mode,
5486 poly_int64 start_offset, unsigned start,
5487 unsigned limit, bool skip_wb, rtx *cfi_ops)
5488 {
5489 rtx base_rtx = stack_pointer_rtx;
5490 unsigned regno;
5491 unsigned regno2;
5492 poly_int64 offset;
5493
5494 for (regno = aarch64_next_callee_save (start, limit);
5495 regno <= limit;
5496 regno = aarch64_next_callee_save (regno + 1, limit))
5497 {
5498 if (cfun->machine->reg_is_wrapped_separately[regno])
5499 continue;
5500
5501 rtx reg, mem;
5502 int offset_diff;
5503
5504 if (skip_wb
5505 && (regno == cfun->machine->frame.wb_candidate1
5506 || regno == cfun->machine->frame.wb_candidate2))
5507 continue;
5508
5509 reg = gen_rtx_REG (mode, regno);
5510 offset = start_offset + cfun->machine->frame.reg_offset[regno];
5511 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5512
5513 regno2 = aarch64_next_callee_save (regno + 1, limit);
5514 offset_diff = cfun->machine->frame.reg_offset[regno2]
5515 - cfun->machine->frame.reg_offset[regno];
5516
5517 if (regno2 <= limit
5518 && !cfun->machine->reg_is_wrapped_separately[regno2]
5519 && known_eq (GET_MODE_SIZE (mode), offset_diff))
5520 {
5521 rtx reg2 = gen_rtx_REG (mode, regno2);
5522 rtx mem2;
5523
5524 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
5525 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
5526 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5527
5528 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
5529 regno = regno2;
5530 }
5531 else
5532 emit_move_insn (reg, mem);
5533 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
5534 }
5535 }
5536
5537 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
5538 of MODE. */
5539
5540 static inline bool
5541 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5542 {
5543 HOST_WIDE_INT multiple;
5544 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5545 && IN_RANGE (multiple, -8, 7));
5546 }
5547
5548 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
5549 of MODE. */
5550
5551 static inline bool
5552 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5553 {
5554 HOST_WIDE_INT multiple;
5555 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5556 && IN_RANGE (multiple, 0, 63));
5557 }
5558
5559 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5560 of MODE. */
5561
5562 bool
5563 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5564 {
5565 HOST_WIDE_INT multiple;
5566 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5567 && IN_RANGE (multiple, -64, 63));
5568 }
5569
5570 /* Return true if OFFSET is a signed 9-bit value. */
5571
5572 bool
5573 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5574 poly_int64 offset)
5575 {
5576 HOST_WIDE_INT const_offset;
5577 return (offset.is_constant (&const_offset)
5578 && IN_RANGE (const_offset, -256, 255));
5579 }
5580
5581 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5582 of MODE. */
5583
5584 static inline bool
5585 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5586 {
5587 HOST_WIDE_INT multiple;
5588 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5589 && IN_RANGE (multiple, -256, 255));
5590 }
5591
5592 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5593 of MODE. */
5594
5595 static inline bool
5596 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5597 {
5598 HOST_WIDE_INT multiple;
5599 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5600 && IN_RANGE (multiple, 0, 4095));
5601 }
5602
5603 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
5604
5605 static sbitmap
5606 aarch64_get_separate_components (void)
5607 {
5608 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5609 bitmap_clear (components);
5610
5611 /* The registers we need saved to the frame. */
5612 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5613 if (aarch64_register_saved_on_entry (regno))
5614 {
5615 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5616 if (!frame_pointer_needed)
5617 offset += cfun->machine->frame.frame_size
5618 - cfun->machine->frame.hard_fp_offset;
5619 /* Check that we can access the stack slot of the register with one
5620 direct load with no adjustments needed. */
5621 if (offset_12bit_unsigned_scaled_p (DImode, offset))
5622 bitmap_set_bit (components, regno);
5623 }
5624
5625 /* Don't mess with the hard frame pointer. */
5626 if (frame_pointer_needed)
5627 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5628
5629 unsigned reg1 = cfun->machine->frame.wb_candidate1;
5630 unsigned reg2 = cfun->machine->frame.wb_candidate2;
5631 /* If registers have been chosen to be stored/restored with
5632 writeback, don't interfere with them, to avoid having to output explicit
5633 stack adjustment instructions. */
5634 if (reg2 != INVALID_REGNUM)
5635 bitmap_clear_bit (components, reg2);
5636 if (reg1 != INVALID_REGNUM)
5637 bitmap_clear_bit (components, reg1);
5638
5639 bitmap_clear_bit (components, LR_REGNUM);
5640 bitmap_clear_bit (components, SP_REGNUM);
5641
5642 return components;
5643 }
5644
5645 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
5646
5647 static sbitmap
5648 aarch64_components_for_bb (basic_block bb)
5649 {
5650 bitmap in = DF_LIVE_IN (bb);
5651 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5652 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5653 bool simd_function = aarch64_simd_decl_p (cfun->decl);
5654
5655 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5656 bitmap_clear (components);
5657
5658 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
5659 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5660 if ((!call_used_regs[regno]
5661 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5662 && (bitmap_bit_p (in, regno)
5663 || bitmap_bit_p (gen, regno)
5664 || bitmap_bit_p (kill, regno)))
5665 {
5666 unsigned regno2, offset, offset2;
5667 bitmap_set_bit (components, regno);
5668
5669 /* If there is a callee-save at an adjacent offset, add it as well,
5670 to increase the use of LDP/STP. */
5671 offset = cfun->machine->frame.reg_offset[regno];
5672 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5673
5674 if (regno2 <= LAST_SAVED_REGNUM)
5675 {
5676 offset2 = cfun->machine->frame.reg_offset[regno2];
5677 if ((offset & ~8) == (offset2 & ~8))
5678 bitmap_set_bit (components, regno2);
5679 }
5680 }
5681
5682 return components;
5683 }
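/* Illustrative note (an addition to this listing, not part of the GCC
   source): the pairing heuristic above works on 16-byte granules.  If,
   say, x22 is live in the block and its save slot is at offset 40
   (offset & 8 is nonzero), the candidate partner is x21; if x21's slot is
   at offset 32, the two slots share a granule ((offset & ~8) is 32 for
   both), so x21 is added to the component set too and the pair can later
   be handled with a single STP/LDP.  The register numbers and offsets
   here are hypothetical.  */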
5684
5685 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5686 Nothing to do for aarch64. */
5687
5688 static void
5689 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5690 {
5691 }
5692
5693 /* Return the next set bit in BMP from START onwards. Return the total number
5694 of bits in BMP if no set bit is found at or after START. */
5695
5696 static unsigned int
5697 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5698 {
5699 unsigned int nbits = SBITMAP_SIZE (bmp);
5700 if (start == nbits)
5701 return start;
5702
5703 gcc_assert (start < nbits);
5704 for (unsigned int i = start; i < nbits; i++)
5705 if (bitmap_bit_p (bmp, i))
5706 return i;
5707
5708 return nbits;
5709 }
5710
5711 /* Do the work for aarch64_emit_prologue_components and
5712 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
5713 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5714 for these components or the epilogue sequence. That is, it determines
5715 whether we should emit stores or loads and what kind of CFA notes to attach
5716 to the insns. Otherwise the logic for the two sequences is very
5717 similar. */
5718
5719 static void
5720 aarch64_process_components (sbitmap components, bool prologue_p)
5721 {
5722 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5723 ? HARD_FRAME_POINTER_REGNUM
5724 : STACK_POINTER_REGNUM);
5725
5726 unsigned last_regno = SBITMAP_SIZE (components);
5727 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5728 rtx_insn *insn = NULL;
5729
5730 while (regno != last_regno)
5731 {
5732 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5733 so DFmode for the vector registers is enough. For simd functions
5734 we want to save the low 128 bits. */
5735 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5736
5737 rtx reg = gen_rtx_REG (mode, regno);
5738 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5739 if (!frame_pointer_needed)
5740 offset += cfun->machine->frame.frame_size
5741 - cfun->machine->frame.hard_fp_offset;
5742 rtx addr = plus_constant (Pmode, ptr_reg, offset);
5743 rtx mem = gen_frame_mem (mode, addr);
5744
5745 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5746 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5747 /* No more registers to handle after REGNO.
5748 Emit a single save/restore and exit. */
5749 if (regno2 == last_regno)
5750 {
5751 insn = emit_insn (set);
5752 RTX_FRAME_RELATED_P (insn) = 1;
5753 if (prologue_p)
5754 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5755 else
5756 add_reg_note (insn, REG_CFA_RESTORE, reg);
5757 break;
5758 }
5759
5760 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5761 /* The next register is not of the same class or its offset is not
5762 mergeable with the current one into a pair. */
5763 if (!satisfies_constraint_Ump (mem)
5764 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5765 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5766 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5767 GET_MODE_SIZE (mode)))
5768 {
5769 insn = emit_insn (set);
5770 RTX_FRAME_RELATED_P (insn) = 1;
5771 if (prologue_p)
5772 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5773 else
5774 add_reg_note (insn, REG_CFA_RESTORE, reg);
5775
5776 regno = regno2;
5777 continue;
5778 }
5779
5780 /* REGNO2 can be saved/restored in a pair with REGNO. */
5781 rtx reg2 = gen_rtx_REG (mode, regno2);
5782 if (!frame_pointer_needed)
5783 offset2 += cfun->machine->frame.frame_size
5784 - cfun->machine->frame.hard_fp_offset;
5785 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5786 rtx mem2 = gen_frame_mem (mode, addr2);
5787 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5788 : gen_rtx_SET (reg2, mem2);
5789
5790 if (prologue_p)
5791 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5792 else
5793 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5794
5795 RTX_FRAME_RELATED_P (insn) = 1;
5796 if (prologue_p)
5797 {
5798 add_reg_note (insn, REG_CFA_OFFSET, set);
5799 add_reg_note (insn, REG_CFA_OFFSET, set2);
5800 }
5801 else
5802 {
5803 add_reg_note (insn, REG_CFA_RESTORE, reg);
5804 add_reg_note (insn, REG_CFA_RESTORE, reg2);
5805 }
5806
5807 regno = aarch64_get_next_set_bit (components, regno2 + 1);
5808 }
5809 }
5810
5811 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
5812
5813 static void
5814 aarch64_emit_prologue_components (sbitmap components)
5815 {
5816 aarch64_process_components (components, true);
5817 }
5818
5819 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
5820
5821 static void
5822 aarch64_emit_epilogue_components (sbitmap components)
5823 {
5824 aarch64_process_components (components, false);
5825 }
5826
5827 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
5828
5829 static void
5830 aarch64_set_handled_components (sbitmap components)
5831 {
5832 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5833 if (bitmap_bit_p (components, regno))
5834 cfun->machine->reg_is_wrapped_separately[regno] = true;
5835 }
5836
5837 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
5838 determine the probe offset for alloca. */
5839
5840 static HOST_WIDE_INT
5841 aarch64_stack_clash_protection_alloca_probe_range (void)
5842 {
5843 return STACK_CLASH_CALLER_GUARD;
5844 }
5845
5846
5847 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5848 registers. If POLY_SIZE is not large enough to require a probe this function
5849 will only adjust the stack. When allocating the stack space
5850 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5851 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5852 arguments. If we are then we ensure that any allocation larger than the ABI
5853 defined buffer needs a probe so that the invariant of having a 1KB buffer is
5854 maintained.
5855
5856 We emit barriers after each stack adjustment to prevent optimizations from
5857 breaking the invariant that we never drop the stack more than a page. This
5858 invariant is needed to make it easier to correctly handle asynchronous
5859 events: if we allowed the stack to be dropped by more than a page before the
5860 corresponding probes were emitted and a signal arrived somewhere in between,
5861 the signal handler would not know the state of the stack and could make no
5862 assumptions about which pages have been probed. */
5863
5864 static void
5865 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5866 poly_int64 poly_size,
5867 bool frame_related_p,
5868 bool final_adjustment_p)
5869 {
5870 HOST_WIDE_INT guard_size
5871 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5872 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5873 /* When doing the final adjustment for the outgoing argument size we can't
5874 assume that LR was saved at position 0. So subtract its offset from the
5875 ABI safe buffer so that we don't accidentally allow an adjustment that
5876 would result in an allocation larger than the ABI buffer without
5877 probing. */
5878 HOST_WIDE_INT min_probe_threshold
5879 = final_adjustment_p
5880 ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5881 : guard_size - guard_used_by_caller;
5882
5883 poly_int64 frame_size = cfun->machine->frame.frame_size;
5884
5885 /* We should always have a positive probe threshold. */
5886 gcc_assert (min_probe_threshold > 0);
5887
5888 if (flag_stack_clash_protection && !final_adjustment_p)
5889 {
5890 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5891 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5892
5893 if (known_eq (frame_size, 0))
5894 {
5895 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5896 }
5897 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5898 && known_lt (final_adjust, guard_used_by_caller))
5899 {
5900 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5901 }
5902 }
5903
5904 /* If SIZE is not large enough to require probing, just adjust the stack and
5905 exit. */
5906 if (known_lt (poly_size, min_probe_threshold)
5907 || !flag_stack_clash_protection)
5908 {
5909 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5910 return;
5911 }
5912
5913 HOST_WIDE_INT size;
5914 /* Handle the SVE non-constant case first. */
5915 if (!poly_size.is_constant (&size))
5916 {
5917 if (dump_file)
5918 {
5919 fprintf (dump_file, "Stack clash SVE prologue: ");
5920 print_dec (poly_size, dump_file);
5921 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5922 }
5923
5924 /* First calculate the amount of bytes we're actually spilling. */
5925 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5926 poly_size, temp1, temp2, false, true);
5927
5928 rtx_insn *insn = get_last_insn ();
5929
5930 if (frame_related_p)
5931 {
5932 /* This is done to provide unwinding information for the stack
5933 adjustments we're about to do, however to prevent the optimizers
5934 from removing the R11 move and leaving the CFA note (which would be
5935 very wrong) we tie the old and new stack pointer together.
5936 The tie will expand to nothing but the optimizers will not touch
5937 the instruction. */
5938 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5939 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5940 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5941
5942 /* We want the CFA independent of the stack pointer for the
5943 duration of the loop. */
5944 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5945 RTX_FRAME_RELATED_P (insn) = 1;
5946 }
5947
5948 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5949 rtx guard_const = gen_int_mode (guard_size, Pmode);
5950
5951 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5952 stack_pointer_rtx, temp1,
5953 probe_const, guard_const));
5954
5955 /* Now reset the CFA register if needed. */
5956 if (frame_related_p)
5957 {
5958 add_reg_note (insn, REG_CFA_DEF_CFA,
5959 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5960 gen_int_mode (poly_size, Pmode)));
5961 RTX_FRAME_RELATED_P (insn) = 1;
5962 }
5963
5964 return;
5965 }
5966
5967 if (dump_file)
5968 fprintf (dump_file,
5969 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5970 " bytes, probing will be required.\n", size);
5971
5972 /* Round size to the nearest multiple of guard_size, and calculate the
5973 residual as the difference between the original size and the rounded
5974 size. */
5975 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5976 HOST_WIDE_INT residual = size - rounded_size;
5977
5978 /* We can handle a small number of allocations/probes inline. Otherwise
5979 punt to a loop. */
5980 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5981 {
5982 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5983 {
5984 aarch64_sub_sp (NULL, temp2, guard_size, true);
5985 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5986 guard_used_by_caller));
5987 emit_insn (gen_blockage ());
5988 }
5989 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5990 }
5991 else
5992 {
5993 /* Compute the ending address. */
5994 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5995 temp1, NULL, false, true);
5996 rtx_insn *insn = get_last_insn ();
5997
5998 /* For the initial allocation, we don't have a frame pointer
5999 set up, so we always need CFI notes. If we're doing the
6000 final allocation, then we may have a frame pointer, in which
6001 case it is the CFA, otherwise we need CFI notes.
6002
6003 We can determine which allocation we are doing by looking at
6004 the value of FRAME_RELATED_P since the final allocations are not
6005 frame related. */
6006 if (frame_related_p)
6007 {
6008 /* We want the CFA independent of the stack pointer for the
6009 duration of the loop. */
6010 add_reg_note (insn, REG_CFA_DEF_CFA,
6011 plus_constant (Pmode, temp1, rounded_size));
6012 RTX_FRAME_RELATED_P (insn) = 1;
6013 }
6014
6015 /* This allocates and probes the stack. Note that this re-uses some of
6016 the existing Ada stack protection code. However we are guaranteed not
6017 to enter the non-loop or residual branches of that code.
6018
6019 The non-loop part won't be entered because if our allocation amount
6020 doesn't require a loop, the case above would handle it.
6021
6022 The residual amount won't be entered because TEMP1 is a multiple of
6023 the allocation size. The residual will always be 0. As such, the only
6024 part we are actually using from that code is the loop setup. The
6025 actual probing is done in aarch64_output_probe_stack_range. */
6026 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
6027 stack_pointer_rtx, temp1));
6028
6029 /* Now reset the CFA register if needed. */
6030 if (frame_related_p)
6031 {
6032 add_reg_note (insn, REG_CFA_DEF_CFA,
6033 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
6034 RTX_FRAME_RELATED_P (insn) = 1;
6035 }
6036
6037 emit_insn (gen_blockage ());
6038 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
6039 }
6040
6041 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
6042 be probed. This maintains the requirement that each page is probed at
6043 least once. For initial probing we probe only if the allocation is
6044 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
6045 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
6046 GUARD_SIZE. This means that any allocation large enough to trigger a probe
6047 here gets at least one, while any allocation small enough that this code
6048 emits nothing for it has already had its page probed by the saving of FP/LR,
6049 either in this function or in a callee. If we don't have any callees then
6050 we won't have more stack adjustments and so are still safe. */
6052 if (residual)
6053 {
6054 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
6055 /* If we're doing final adjustments, and we've done any full page
6056 allocations then any residual needs to be probed. */
6057 if (final_adjustment_p && rounded_size != 0)
6058 min_probe_threshold = 0;
6059 /* If doing a small final adjustment, we always probe at offset 0.
6060 This is done to avoid issues when LR is not at position 0 or when
6061 the final adjustment is smaller than the probing offset. */
6062 else if (final_adjustment_p && rounded_size == 0)
6063 residual_probe_offset = 0;
6064
6065 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
6066 if (residual >= min_probe_threshold)
6067 {
6068 if (dump_file)
6069 fprintf (dump_file,
6070 "Stack clash AArch64 prologue residuals: "
6071 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
6072 "\n", residual);
6073
6074 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
6075 residual_probe_offset));
6076 emit_insn (gen_blockage ());
6077 }
6078 }
6079 }
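/* Illustrative note (an addition to this listing, not part of the GCC
   source): with the default 64KiB guard and the 1KiB caller buffer, a
   constant initial adjustment of 147456 bytes (144KiB) exceeds
   min_probe_threshold (64KiB - 1KiB), so it is split into
   rounded_size == 131072, which is allocated and probed one guard-sized
   page at a time (inline or via the probe loop, depending on
   STACK_CLASH_MAX_UNROLL_PAGES), and a residual of 16384 bytes.  The
   residual is below the threshold, so it is only subtracted from SP here;
   the subsequent FP/LR saves act as the probe for that final page.  */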
6080
6081 /* Return 1 if the register is used by the epilogue. We need to say the
6082 return register is used, but only after epilogue generation is complete.
6083 Note that in the case of sibcalls, the values "used by the epilogue" are
6084 considered live at the start of the called function.
6085
6086 For SIMD functions we need to return 1 for FP registers that are saved and
6087 restored by a function but are not zero in call_used_regs. If we do not do
6088 this, optimizations may remove the restore of the register. */
6089
6090 int
6091 aarch64_epilogue_uses (int regno)
6092 {
6093 if (epilogue_completed)
6094 {
6095 if (regno == LR_REGNUM)
6096 return 1;
6097 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
6098 return 1;
6099 }
6100 return 0;
6101 }
6102
6103 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6104 is saved at BASE + OFFSET. */
6105
6106 static void
6107 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
6108 rtx base, poly_int64 offset)
6109 {
6110 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
6111 add_reg_note (insn, REG_CFA_EXPRESSION,
6112 gen_rtx_SET (mem, regno_reg_rtx[reg]));
6113 }
6114
6115 /* AArch64 stack frames generated by this compiler look like:
6116
6117 +-------------------------------+
6118 | |
6119 | incoming stack arguments |
6120 | |
6121 +-------------------------------+
6122 | | <-- incoming stack pointer (aligned)
6123 | callee-allocated save area |
6124 | for register varargs |
6125 | |
6126 +-------------------------------+
6127 | local variables | <-- frame_pointer_rtx
6128 | |
6129 +-------------------------------+
6130 | padding | \
6131 +-------------------------------+ |
6132 | callee-saved registers | | frame.saved_regs_size
6133 +-------------------------------+ |
6134 | LR' | |
6135 +-------------------------------+ |
6136 | FP' | / <- hard_frame_pointer_rtx (aligned)
6137 +-------------------------------+
6138 | dynamic allocation |
6139 +-------------------------------+
6140 | padding |
6141 +-------------------------------+
6142 | outgoing stack arguments | <-- arg_pointer
6143 | |
6144 +-------------------------------+
6145 | | <-- stack_pointer_rtx (aligned)
6146
6147 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
6148 but leave frame_pointer_rtx and hard_frame_pointer_rtx
6149 unchanged.
6150
6151 By default for stack-clash we assume the guard is at least 64KB, but this
6152 value is configurable to either 4KB or 64KB. We also force the guard size to
6153 be the same as the probing interval and both values are kept in sync.
6154
6155 With those assumptions the callee can allocate up to 63KB (or 3KB depending
6156 on the guard size) of stack space without probing.
6157
6158 When probing is needed, we emit a probe at the start of the prologue
6159 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
6160
6161 We have to track how much space has been allocated; the only stores
6162 to the stack that we track as implicit probes are the FP/LR stores.
6163
6164 For outgoing arguments we probe if the size is larger than 1KB, such that
6165 the ABI specified buffer is maintained for the next callee.
6166
6167 The following registers are reserved during frame layout and should not be
6168 used for any other purpose:
6169
6170 - r11: Used by stack clash protection when SVE is enabled.
6171 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
6172 - r14 and r15: Used for speculation tracking.
6173 - r16(IP0), r17(IP1): Used by indirect tailcalls.
6174 - r30(LR), r29(FP): Used by standard frame layout.
6175
6176 These registers must be avoided in frame layout related code unless the
6177 explicit intention is to interact with one of the features listed above. */
6178
6179 /* Generate the prologue instructions for entry into a function.
6180 Establish the stack frame by decreasing the stack pointer with a
6181 properly calculated size and, if necessary, create a frame record
6182 filled with the values of LR and previous frame pointer. The
6183 current FP is also set up if it is in use. */
6184
6185 void
6186 aarch64_expand_prologue (void)
6187 {
6188 poly_int64 frame_size = cfun->machine->frame.frame_size;
6189 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6190 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6191 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6192 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6193 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6194 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6195 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
6196 rtx_insn *insn;
6197
6198 /* Sign return address for functions. */
6199 if (aarch64_return_address_signing_enabled ())
6200 {
6201 switch (aarch64_ra_sign_key)
6202 {
6203 case AARCH64_KEY_A:
6204 insn = emit_insn (gen_paciasp ());
6205 break;
6206 case AARCH64_KEY_B:
6207 insn = emit_insn (gen_pacibsp ());
6208 break;
6209 default:
6210 gcc_unreachable ();
6211 }
6212 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6213 RTX_FRAME_RELATED_P (insn) = 1;
6214 }
6215
6216 if (flag_stack_usage_info)
6217 current_function_static_stack_size = constant_lower_bound (frame_size);
6218
6219 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
6220 {
6221 if (crtl->is_leaf && !cfun->calls_alloca)
6222 {
6223 if (maybe_gt (frame_size, PROBE_INTERVAL)
6224 && maybe_gt (frame_size, get_stack_check_protect ()))
6225 aarch64_emit_probe_stack_range (get_stack_check_protect (),
6226 (frame_size
6227 - get_stack_check_protect ()));
6228 }
6229 else if (maybe_gt (frame_size, 0))
6230 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
6231 }
6232
6233 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6234 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6235
6236 /* In theory we should never have both an initial adjustment
6237 and a callee save adjustment. Verify that is the case since the
6238 code below does not handle it for -fstack-clash-protection. */
6239 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
6240
6241 /* Will only probe if the initial adjustment is larger than the guard
6242 less the amount of the guard reserved for use by the caller's
6243 outgoing args. */
6244 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
6245 true, false);
6246
6247 if (callee_adjust != 0)
6248 aarch64_push_regs (reg1, reg2, callee_adjust);
6249
6250 if (emit_frame_chain)
6251 {
6252 poly_int64 reg_offset = callee_adjust;
6253 if (callee_adjust == 0)
6254 {
6255 reg1 = R29_REGNUM;
6256 reg2 = R30_REGNUM;
6257 reg_offset = callee_offset;
6258 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
6259 }
6260 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
6261 stack_pointer_rtx, callee_offset,
6262 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
6263 if (frame_pointer_needed && !frame_size.is_constant ())
6264 {
6265 /* Variable-sized frames need to describe the save slot
6266 address using DW_CFA_expression rather than DW_CFA_offset.
6267 This means that, without taking further action, the
6268 locations of the registers that we've already saved would
6269 remain based on the stack pointer even after we redefine
6270 the CFA based on the frame pointer. We therefore need new
6271 DW_CFA_expressions to re-express the save slots with addresses
6272 based on the frame pointer. */
6273 rtx_insn *insn = get_last_insn ();
6274 gcc_assert (RTX_FRAME_RELATED_P (insn));
6275
6276 /* Add an explicit CFA definition if this was previously
6277 implicit. */
6278 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
6279 {
6280 rtx src = plus_constant (Pmode, stack_pointer_rtx,
6281 callee_offset);
6282 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6283 gen_rtx_SET (hard_frame_pointer_rtx, src));
6284 }
6285
6286 /* Change the save slot expressions for the registers that
6287 we've already saved. */
6288 reg_offset -= callee_offset;
6289 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
6290 reg_offset + UNITS_PER_WORD);
6291 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
6292 reg_offset);
6293 }
6294 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
6295 }
6296
6297 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6298 callee_adjust != 0 || emit_frame_chain);
6299 if (aarch64_simd_decl_p (cfun->decl))
6300 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6301 callee_adjust != 0 || emit_frame_chain);
6302 else
6303 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6304 callee_adjust != 0 || emit_frame_chain);
6305
6306 /* We may need to probe the final adjustment if it is larger than the guard
6307 that is assumed by the callee. */
6308 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
6309 !frame_pointer_needed, true);
6310 }
6311
6312 /* Return TRUE if we can use a simple_return insn.
6313
6314 This function checks whether the callee saved stack is empty, which
6315 means no restore actions are needed. The pro_and_epilogue pass will use
6316 this to check whether the shrink-wrapping optimization is feasible.
6317
6318 bool
6319 aarch64_use_return_insn_p (void)
6320 {
6321 if (!reload_completed)
6322 return false;
6323
6324 if (crtl->profile)
6325 return false;
6326
6327 return known_eq (cfun->machine->frame.frame_size, 0);
6328 }
6329
6330 /* Return false for non-leaf SIMD functions in order to avoid
6331 shrink-wrapping them, which would lose the necessary
6332 save/restore of FP registers. */
6333
6334 bool
6335 aarch64_use_simple_return_insn_p (void)
6336 {
6337 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
6338 return false;
6339
6340 return true;
6341 }
6342
6343 /* Generate the epilogue instructions for returning from a function.
6344 This is almost exactly the reverse of the prologue sequence, except
6345 that we need to insert barriers to avoid scheduling loads that read
6346 from a deallocated stack, and we optimize the unwind records by
6347 emitting them all together if possible. */
6348 void
6349 aarch64_expand_epilogue (bool for_sibcall)
6350 {
6351 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
6352 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
6353 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
6354 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
6355 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6356 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6357 rtx cfi_ops = NULL;
6358 rtx_insn *insn;
6359 /* A stack clash protection prologue may not have left EP0_REGNUM or
6360 EP1_REGNUM in a usable state. The same is true for allocations
6361 with an SVE component, since we then need both temporary registers
6362 for each allocation. For stack clash we are in a usable state if
6363 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
6364 HOST_WIDE_INT guard_size
6365 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
6366 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
6367
6368 /* We can re-use the registers when the allocation amount is smaller than
6369 guard_size - guard_used_by_caller because we won't be doing any probes
6370 then. In such situations the register should remain live with the correct
6371 value. */
6372 bool can_inherit_p = (initial_adjust.is_constant ()
6373 && final_adjust.is_constant ())
6374 && (!flag_stack_clash_protection
6375 || known_lt (initial_adjust,
6376 guard_size - guard_used_by_caller));
6377
6378 /* We need to add memory barrier to prevent read from deallocated stack. */
6379 bool need_barrier_p
6380 = maybe_ne (get_frame_size ()
6381 + cfun->machine->frame.saved_varargs_size, 0);
6382
6383 /* Emit a barrier to prevent loads from a deallocated stack. */
6384 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
6385 || cfun->calls_alloca
6386 || crtl->calls_eh_return)
6387 {
6388 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6389 need_barrier_p = false;
6390 }
6391
6392 /* Restore the stack pointer from the frame pointer if it may not
6393 be the same as the stack pointer. */
6394 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
6395 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
6396 if (frame_pointer_needed
6397 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
6398 /* If writeback is used when restoring callee-saves, the CFA
6399 is restored on the instruction doing the writeback. */
6400 aarch64_add_offset (Pmode, stack_pointer_rtx,
6401 hard_frame_pointer_rtx, -callee_offset,
6402 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
6403 else
6404 /* The case where we need to re-use the register here is very rare, so
6405 avoid the complicated condition and just always emit a move if the
6406 immediate doesn't fit. */
6407 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
6408
6409 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
6410 callee_adjust != 0, &cfi_ops);
6411 if (aarch64_simd_decl_p (cfun->decl))
6412 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6413 callee_adjust != 0, &cfi_ops);
6414 else
6415 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
6416 callee_adjust != 0, &cfi_ops);
6417
6418 if (need_barrier_p)
6419 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
6420
6421 if (callee_adjust != 0)
6422 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
6423
6424 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
6425 {
6426 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
6427 insn = get_last_insn ();
6428 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
6429 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
6430 RTX_FRAME_RELATED_P (insn) = 1;
6431 cfi_ops = NULL;
6432 }
6433
6434 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
6435 restrict the emit_move optimization to leaf functions. */
6436 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
6437 (!can_inherit_p || !crtl->is_leaf
6438 || df_regs_ever_live_p (EP0_REGNUM)));
6439
6440 if (cfi_ops)
6441 {
6442 /* Emit delayed restores and reset the CFA to be SP. */
6443 insn = get_last_insn ();
6444 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
6445 REG_NOTES (insn) = cfi_ops;
6446 RTX_FRAME_RELATED_P (insn) = 1;
6447 }
6448
6449 /* We prefer to emit the combined return/authenticate instruction RETAA,
6450 however there are three cases in which we must instead emit an explicit
6451 authentication instruction.
6452
6453 1) Sibcalls don't return in a normal way, so if we're about to call one
6454 we must authenticate.
6455
6456 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
6457 generating code for !TARGET_ARMV8_3 we can't use it and must
6458 explicitly authenticate.
6459
6460 3) On an eh_return path we make extra stack adjustments to update the
6461 canonical frame address to be the exception handler's CFA. We want
6462 to authenticate using the CFA of the function which calls eh_return.
6463 */
6464 if (aarch64_return_address_signing_enabled ()
6465 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
6466 {
6467 switch (aarch64_ra_sign_key)
6468 {
6469 case AARCH64_KEY_A:
6470 insn = emit_insn (gen_autiasp ());
6471 break;
6472 case AARCH64_KEY_B:
6473 insn = emit_insn (gen_autibsp ());
6474 break;
6475 default:
6476 gcc_unreachable ();
6477 }
6478 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
6479 RTX_FRAME_RELATED_P (insn) = 1;
6480 }
6481
6482 /* Stack adjustment for exception handler. */
6483 if (crtl->calls_eh_return && !for_sibcall)
6484 {
6485 /* We need to unwind the stack by the offset computed by
6486 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
6487 to be SP; letting the CFA move during this adjustment
6488 is just as correct as retaining the CFA from the body
6489 of the function. Therefore, do nothing special. */
6490 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
6491 }
6492
6493 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
6494 if (!for_sibcall)
6495 emit_jump_insn (ret_rtx);
6496 }
6497
6498 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
6499 normally or return to a previous frame after unwinding.
6500
6501 An EH return uses a single shared return sequence. The epilogue is
6502 exactly like a normal epilogue except that it has an extra input
6503 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
6504 that must be applied after the frame has been destroyed. An extra label
6505 is inserted before the epilogue which initializes this register to zero,
6506 and this is the entry point for a normal return.
6507
6508 An actual EH return updates the return address, initializes the stack
6509 adjustment and jumps directly into the epilogue (bypassing the zeroing
6510 of the adjustment). Since the return address is typically saved on the
6511 stack when a function makes a call, the saved LR must be updated outside
6512 the epilogue.
6513
6514 This poses problems as the store is generated well before the epilogue,
6515 so the offset of LR is not known yet. Also optimizations will remove the
6516 store as it appears dead, even after the epilogue is generated (as the
6517 base or offset for loading LR is different in many cases).
6518
6519 To avoid these problems this implementation forces the frame pointer
6520 in eh_return functions so that the location of LR is fixed and known early.
6521 It also marks the store volatile, so no optimization is permitted to
6522 remove the store. */
6523 rtx
6524 aarch64_eh_return_handler_rtx (void)
6525 {
6526 rtx tmp = gen_frame_mem (Pmode,
6527 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
6528
6529 /* Mark the store volatile, so no optimization is permitted to remove it. */
6530 MEM_VOLATILE_P (tmp) = true;
6531 return tmp;
6532 }
6533
6534 /* Output code to add DELTA to the first argument, and then jump
6535 to FUNCTION. Used for C++ multiple inheritance. */
6536 static void
6537 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
6538 HOST_WIDE_INT delta,
6539 HOST_WIDE_INT vcall_offset,
6540 tree function)
6541 {
6542 /* The this pointer is always in x0. Note that this differs from
6543 Arm where the this pointer may be bumped to r1 if r0 is required
6544 to return a pointer to an aggregate. On AArch64 a result value
6545 pointer will be in x8. */
6546 int this_regno = R0_REGNUM;
6547 rtx this_rtx, temp0, temp1, addr, funexp;
6548 rtx_insn *insn;
6549 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
6550
6551 if (aarch64_bti_enabled ())
6552 emit_insn (gen_bti_c ());
6553
6554 reload_completed = 1;
6555 emit_note (NOTE_INSN_PROLOGUE_END);
6556
6557 this_rtx = gen_rtx_REG (Pmode, this_regno);
6558 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
6559 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
6560
6561 if (vcall_offset == 0)
6562 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
6563 else
6564 {
6565 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6566
6567 addr = this_rtx;
6568 if (delta != 0)
6569 {
6570 if (delta >= -256 && delta < 256)
6571 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6572 plus_constant (Pmode, this_rtx, delta));
6573 else
6574 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6575 temp1, temp0, false);
6576 }
6577
6578 if (Pmode == ptr_mode)
6579 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6580 else
6581 aarch64_emit_move (temp0,
6582 gen_rtx_ZERO_EXTEND (Pmode,
6583 gen_rtx_MEM (ptr_mode, addr)));
6584
6585 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6586 addr = plus_constant (Pmode, temp0, vcall_offset);
6587 else
6588 {
6589 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6590 Pmode);
6591 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6592 }
6593
6594 if (Pmode == ptr_mode)
6595 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
6596 else
6597 aarch64_emit_move (temp1,
6598 gen_rtx_SIGN_EXTEND (Pmode,
6599 gen_rtx_MEM (ptr_mode, addr)));
6600
6601 emit_insn (gen_add2_insn (this_rtx, temp1));
6602 }
6603
6604 /* Generate a tail call to the target function. */
6605 if (!TREE_USED (function))
6606 {
6607 assemble_external (function);
6608 TREE_USED (function) = 1;
6609 }
6610 funexp = XEXP (DECL_RTL (function), 0);
6611 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6612 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6613 SIBLING_CALL_P (insn) = 1;
6614
6615 insn = get_insns ();
6616 shorten_branches (insn);
6617
6618 assemble_start_function (thunk, fnname);
6619 final_start_function (insn, file, 1);
6620 final (insn, file, 1);
6621 final_end_function ();
6622 assemble_end_function (thunk, fnname);
6623
6624 /* Stop pretending to be a post-reload pass. */
6625 reload_completed = 0;
6626 }
6627
6628 static bool
6629 aarch64_tls_referenced_p (rtx x)
6630 {
6631 if (!TARGET_HAVE_TLS)
6632 return false;
6633 subrtx_iterator::array_type array;
6634 FOR_EACH_SUBRTX (iter, array, x, ALL)
6635 {
6636 const_rtx x = *iter;
6637 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6638 return true;
6639 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6640 TLS offsets, not real symbol references. */
6641 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6642 iter.skip_subrtxes ();
6643 }
6644 return false;
6645 }
6646
6647
6648 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6649 a left shift of 0 or 12 bits. */
6650 bool
6651 aarch64_uimm12_shift (HOST_WIDE_INT val)
6652 {
6653 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6654 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6655 );
6656 }
6657
6658 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6659 that can be created with a left shift of 0 or 12. */
6660 static HOST_WIDE_INT
6661 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6662 {
6663 /* Check to see if the value fits in 24 bits, as that is the maximum we can
6664 handle correctly. */
6665 gcc_assert ((val & 0xffffff) == val);
6666
6667 if (((val & 0xfff) << 0) == val)
6668 return val;
6669
6670 return val & (0xfff << 12);
6671 }
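/* Illustrative note (an addition to this listing, not part of the GCC
   source): aarch64_uimm12_shift accepts 0xabc (fits in the low 12 bits)
   and 0x123000 (a 12-bit value shifted left by 12), but rejects 0x1234,
   which needs both fields.  For such a value
   aarch64_clamp_to_uimm12_shift (0x1234) drops the low field and
   returns 0x1000.  */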
6672
6673 /* Return true if val is an immediate that can be loaded into a
6674 register by a MOVZ instruction. */
6675 static bool
6676 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6677 {
6678 if (GET_MODE_SIZE (mode) > 4)
6679 {
6680 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6681 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6682 return 1;
6683 }
6684 else
6685 {
6686 /* Ignore sign extension. */
6687 val &= (HOST_WIDE_INT) 0xffffffff;
6688 }
6689 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6690 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6691 }
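/* Illustrative note (an addition to this listing, not part of the GCC
   source): in DImode, 0xffff0000 is accepted (a single 16-bit chunk
   shifted left by 16, so a MOVZ can materialize it), as is
   0xabcd000000000000 (chunk shifted by 48), whereas 0x12345678 spans two
   16-bit chunks and is rejected.  */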
6692
6693 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
6694 64-bit (DImode) integer. */
6695
6696 static unsigned HOST_WIDE_INT
6697 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6698 {
6699 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6700 while (size < 64)
6701 {
6702 val &= (HOST_WIDE_INT_1U << size) - 1;
6703 val |= val << size;
6704 size *= 2;
6705 }
6706 return val;
6707 }
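/* Illustrative note (an addition to this listing, not part of the GCC
   source): for a mode with 8-bit elements,
   aarch64_replicate_bitmask_imm (0xab, mode) doubles the pattern until
   it fills 64 bits, producing 0xabababababababab.  */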
6708
6709 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
6710
6711 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6712 {
6713 0x0000000100000001ull,
6714 0x0001000100010001ull,
6715 0x0101010101010101ull,
6716 0x1111111111111111ull,
6717 0x5555555555555555ull,
6718 };
6719
6720
6721 /* Return true if val is a valid bitmask immediate. */
6722
6723 bool
6724 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6725 {
6726 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6727 int bits;
6728
6729 /* Check for a single sequence of one bits and return quickly if so.
6730 The special cases of all ones and all zeroes returns false. */
6731 val = aarch64_replicate_bitmask_imm (val_in, mode);
6732 tmp = val + (val & -val);
6733
6734 if (tmp == (tmp & -tmp))
6735 return (val + 1) > 1;
6736
6737 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
6738 if (mode == SImode)
6739 val = (val << 32) | (val & 0xffffffff);
6740
6741 /* Invert if the immediate doesn't start with a zero bit - this means we
6742 only need to search for sequences of one bits. */
6743 if (val & 1)
6744 val = ~val;
6745
6746 /* Find the first set bit and set tmp to val with the first sequence of one
6747 bits removed. Return success if there is a single sequence of ones. */
6748 first_one = val & -val;
6749 tmp = val & (val + first_one);
6750
6751 if (tmp == 0)
6752 return true;
6753
6754 /* Find the next set bit and compute the difference in bit position. */
6755 next_one = tmp & -tmp;
6756 bits = clz_hwi (first_one) - clz_hwi (next_one);
6757 mask = val ^ tmp;
6758
6759 /* Check the bit position difference is a power of 2, and that the first
6760 sequence of one bits fits within 'bits' bits. */
6761 if ((mask >> bits) != 0 || bits != (bits & -bits))
6762 return false;
6763
6764 /* Check the sequence of one bits is repeated 64/bits times. */
6765 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6766 }
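/* Illustrative note (an addition to this listing, not part of the GCC
   source): 0xff00 passes the quick test above (a single contiguous run
   of ones), while 0x00ff00ff00ff00ff passes the full check: after
   inverting the value (it starts with a one bit) the code finds a run of
   eight ones repeating every 16 bits, so the inverted value equals
   mask * bitmask_imm_mul[1] with mask == 0xff00 and bits == 16.  A value
   such as 0x1234, whose set bits do not form a repeated contiguous run,
   is rejected.  */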
6767
6768 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6769 Assumed precondition: VAL_IN is not zero. */
6770
6771 unsigned HOST_WIDE_INT
6772 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6773 {
6774 int lowest_bit_set = ctz_hwi (val_in);
6775 int highest_bit_set = floor_log2 (val_in);
6776 gcc_assert (val_in != 0);
6777
6778 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6779 (HOST_WIDE_INT_1U << lowest_bit_set));
6780 }
6781
6782 /* Create a constant in which the bits outside the range from the lowest set
6783 bit to the highest set bit of VAL_IN are set to 1. */
6784
6785 unsigned HOST_WIDE_INT
6786 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6787 {
6788 return val_in | ~aarch64_and_split_imm1 (val_in);
6789 }
6790
6791 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
6792
6793 bool
6794 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6795 {
6796 scalar_int_mode int_mode;
6797 if (!is_a <scalar_int_mode> (mode, &int_mode))
6798 return false;
6799
6800 if (aarch64_bitmask_imm (val_in, int_mode))
6801 return false;
6802
6803 if (aarch64_move_imm (val_in, int_mode))
6804 return false;
6805
6806 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6807
6808 return aarch64_bitmask_imm (imm2, int_mode);
6809 }
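/* Illustrative note (an addition to this listing, not part of the GCC
   source): VAL_IN == 0x00f0ff00 contains two runs of ones and is neither
   a bitmask immediate nor a MOV immediate, but aarch64_and_split_imm1
   gives 0x00ffff00 (ones covering the whole span) and
   aarch64_and_split_imm2 gives 0xfffffffffff0ffff.  Since
   imm1 & imm2 == VAL_IN and both values are bitmask immediates, an AND
   with VAL_IN can be implemented as two ANDs, so
   aarch64_and_bitmask_imm returns true for it.  */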
6810
6811 /* Return true if val is an immediate that can be loaded into a
6812 register in a single instruction. */
6813 bool
6814 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6815 {
6816 scalar_int_mode int_mode;
6817 if (!is_a <scalar_int_mode> (mode, &int_mode))
6818 return false;
6819
6820 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6821 return true;
6822 return aarch64_bitmask_imm (val, int_mode);
6823 }
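/* For example, in DImode 0xffff0000 can be loaded with a single MOVZ
   (16-bit immediate shifted left by 16) and 0xfffffffffffffffe with a
   single MOVN, so both return true; 0x0000000012345678 has two non-zero
   halfwords and is not a bitmask immediate, so it returns false.  */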
6824
6825 static bool
6826 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6827 {
6828 rtx base, offset;
6829
6830 if (GET_CODE (x) == HIGH)
6831 return true;
6832
6833 /* There's no way to calculate VL-based values using relocations. */
6834 subrtx_iterator::array_type array;
6835 FOR_EACH_SUBRTX (iter, array, x, ALL)
6836 if (GET_CODE (*iter) == CONST_POLY_INT)
6837 return true;
6838
6839 split_const (x, &base, &offset);
6840 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6841 {
6842 if (aarch64_classify_symbol (base, INTVAL (offset))
6843 != SYMBOL_FORCE_TO_MEM)
6844 return true;
6845 else
6846 /* Avoid generating a 64-bit relocation in ILP32; leave it
6847 to aarch64_expand_mov_immediate to handle properly. */
6848 return mode != ptr_mode;
6849 }
6850
6851 return aarch64_tls_referenced_p (x);
6852 }
6853
6854 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6855 The expansion for a table switch is quite expensive due to the number
6856 of instructions, the table lookup and the hard-to-predict indirect jump.
6857 When optimizing for speed at -O3 or higher, use the per-core tuning if
6858 set; otherwise use tables for more than 16 cases as a tradeoff between
6859 size and performance. When optimizing for size, use the default setting. */
6860
6861 static unsigned int
6862 aarch64_case_values_threshold (void)
6863 {
6864 /* Use the specified limit for the number of cases before using jump
6865 tables at higher optimization levels. */
6866 if (optimize > 2
6867 && selected_cpu->tune->max_case_values != 0)
6868 return selected_cpu->tune->max_case_values;
6869 else
6870 return optimize_size ? default_case_values_threshold () : 17;
6871 }
6872
6873 /* Return true if register REGNO is a valid index register.
6874 STRICT_P is true if REG_OK_STRICT is in effect. */
6875
6876 bool
6877 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6878 {
6879 if (!HARD_REGISTER_NUM_P (regno))
6880 {
6881 if (!strict_p)
6882 return true;
6883
6884 if (!reg_renumber)
6885 return false;
6886
6887 regno = reg_renumber[regno];
6888 }
6889 return GP_REGNUM_P (regno);
6890 }
6891
6892 /* Return true if register REGNO is a valid base register.
6893 STRICT_P is true if REG_OK_STRICT is in effect. */
6894
6895 bool
6896 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6897 {
6898 if (!HARD_REGISTER_NUM_P (regno))
6899 {
6900 if (!strict_p)
6901 return true;
6902
6903 if (!reg_renumber)
6904 return false;
6905
6906 regno = reg_renumber[regno];
6907 }
6908
6909 /* The fake registers will be eliminated to either the stack or
6910 hard frame pointer, both of which are usually valid base registers.
6911 Reload deals with the cases where the eliminated form isn't valid. */
6912 return (GP_REGNUM_P (regno)
6913 || regno == SP_REGNUM
6914 || regno == FRAME_POINTER_REGNUM
6915 || regno == ARG_POINTER_REGNUM);
6916 }
6917
6918 /* Return true if X is a valid base register.
6919 STRICT_P is true if REG_OK_STRICT is in effect. */
6920
6921 static bool
6922 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6923 {
6924 if (!strict_p
6925 && GET_CODE (x) == SUBREG
6926 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6927 x = SUBREG_REG (x);
6928
6929 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6930 }
6931
6932 /* Return true if the address offset X is a valid index for MODE. If it is,
6933 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
6934
6935 static bool
6936 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6937 machine_mode mode, bool strict_p)
6938 {
6939 enum aarch64_address_type type;
6940 rtx index;
6941 int shift;
6942
6943 /* (reg:P) */
6944 if ((REG_P (x) || GET_CODE (x) == SUBREG)
6945 && GET_MODE (x) == Pmode)
6946 {
6947 type = ADDRESS_REG_REG;
6948 index = x;
6949 shift = 0;
6950 }
6951 /* (sign_extend:DI (reg:SI)) */
6952 else if ((GET_CODE (x) == SIGN_EXTEND
6953 || GET_CODE (x) == ZERO_EXTEND)
6954 && GET_MODE (x) == DImode
6955 && GET_MODE (XEXP (x, 0)) == SImode)
6956 {
6957 type = (GET_CODE (x) == SIGN_EXTEND)
6958 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6959 index = XEXP (x, 0);
6960 shift = 0;
6961 }
6962 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6963 else if (GET_CODE (x) == MULT
6964 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6965 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6966 && GET_MODE (XEXP (x, 0)) == DImode
6967 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6968 && CONST_INT_P (XEXP (x, 1)))
6969 {
6970 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6971 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6972 index = XEXP (XEXP (x, 0), 0);
6973 shift = exact_log2 (INTVAL (XEXP (x, 1)));
6974 }
6975 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6976 else if (GET_CODE (x) == ASHIFT
6977 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6978 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6979 && GET_MODE (XEXP (x, 0)) == DImode
6980 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6981 && CONST_INT_P (XEXP (x, 1)))
6982 {
6983 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6984 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6985 index = XEXP (XEXP (x, 0), 0);
6986 shift = INTVAL (XEXP (x, 1));
6987 }
6988 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6989 else if ((GET_CODE (x) == SIGN_EXTRACT
6990 || GET_CODE (x) == ZERO_EXTRACT)
6991 && GET_MODE (x) == DImode
6992 && GET_CODE (XEXP (x, 0)) == MULT
6993 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6994 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6995 {
6996 type = (GET_CODE (x) == SIGN_EXTRACT)
6997 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6998 index = XEXP (XEXP (x, 0), 0);
6999 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7000 if (INTVAL (XEXP (x, 1)) != 32 + shift
7001 || INTVAL (XEXP (x, 2)) != 0)
7002 shift = -1;
7003 }
7004 /* (and:DI (mult:DI (reg:DI) (const_int scale))
7005 (const_int 0xffffffff<<shift)) */
7006 else if (GET_CODE (x) == AND
7007 && GET_MODE (x) == DImode
7008 && GET_CODE (XEXP (x, 0)) == MULT
7009 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7010 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7011 && CONST_INT_P (XEXP (x, 1)))
7012 {
7013 type = ADDRESS_REG_UXTW;
7014 index = XEXP (XEXP (x, 0), 0);
7015 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
7016 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7017 shift = -1;
7018 }
7019 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
7020 else if ((GET_CODE (x) == SIGN_EXTRACT
7021 || GET_CODE (x) == ZERO_EXTRACT)
7022 && GET_MODE (x) == DImode
7023 && GET_CODE (XEXP (x, 0)) == ASHIFT
7024 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7025 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
7026 {
7027 type = (GET_CODE (x) == SIGN_EXTRACT)
7028 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
7029 index = XEXP (XEXP (x, 0), 0);
7030 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7031 if (INTVAL (XEXP (x, 1)) != 32 + shift
7032 || INTVAL (XEXP (x, 2)) != 0)
7033 shift = -1;
7034 }
7035 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
7036 (const_int 0xffffffff<<shift)) */
7037 else if (GET_CODE (x) == AND
7038 && GET_MODE (x) == DImode
7039 && GET_CODE (XEXP (x, 0)) == ASHIFT
7040 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
7041 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7042 && CONST_INT_P (XEXP (x, 1)))
7043 {
7044 type = ADDRESS_REG_UXTW;
7045 index = XEXP (XEXP (x, 0), 0);
7046 shift = INTVAL (XEXP (XEXP (x, 0), 1));
7047 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
7048 shift = -1;
7049 }
7050 /* (mult:P (reg:P) (const_int scale)) */
7051 else if (GET_CODE (x) == MULT
7052 && GET_MODE (x) == Pmode
7053 && GET_MODE (XEXP (x, 0)) == Pmode
7054 && CONST_INT_P (XEXP (x, 1)))
7055 {
7056 type = ADDRESS_REG_REG;
7057 index = XEXP (x, 0);
7058 shift = exact_log2 (INTVAL (XEXP (x, 1)));
7059 }
7060 /* (ashift:P (reg:P) (const_int shift)) */
7061 else if (GET_CODE (x) == ASHIFT
7062 && GET_MODE (x) == Pmode
7063 && GET_MODE (XEXP (x, 0)) == Pmode
7064 && CONST_INT_P (XEXP (x, 1)))
7065 {
7066 type = ADDRESS_REG_REG;
7067 index = XEXP (x, 0);
7068 shift = INTVAL (XEXP (x, 1));
7069 }
7070 else
7071 return false;
7072
7073 if (!strict_p
7074 && GET_CODE (index) == SUBREG
7075 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
7076 index = SUBREG_REG (index);
7077
7078 if (aarch64_sve_data_mode_p (mode))
7079 {
7080 if (type != ADDRESS_REG_REG
7081 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
7082 return false;
7083 }
7084 else
7085 {
7086 if (shift != 0
7087 && !(IN_RANGE (shift, 1, 3)
7088 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
7089 return false;
7090 }
7091
7092 if (REG_P (index)
7093 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
7094 {
7095 info->type = type;
7096 info->offset = index;
7097 info->shift = shift;
7098 return true;
7099 }
7100
7101 return false;
7102 }
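/* For example, with MODE == DImode an index of the form
   (mult:DI (reg:DI xN) (const_int 8)) is classified as ADDRESS_REG_REG
   with shift 3, corresponding to the [Xn, Xm, lsl #3] addressing form.  */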
7103
7104 /* Return true if MODE is one of the modes for which we
7105 support LDP/STP operations. */
7106
7107 static bool
7108 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
7109 {
7110 return mode == SImode || mode == DImode
7111 || mode == SFmode || mode == DFmode
7112 || (aarch64_vector_mode_supported_p (mode)
7113 && (known_eq (GET_MODE_SIZE (mode), 8)
7114 || (known_eq (GET_MODE_SIZE (mode), 16)
7115 && (aarch64_tune_params.extra_tuning_flags
7116 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
7117 }
7118
7119 /* Return true if REGNO is a virtual pointer register, or an eliminable
7120 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
7121 include stack_pointer or hard_frame_pointer. */
7122 static bool
7123 virt_or_elim_regno_p (unsigned regno)
7124 {
7125 return ((regno >= FIRST_VIRTUAL_REGISTER
7126 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
7127 || regno == FRAME_POINTER_REGNUM
7128 || regno == ARG_POINTER_REGNUM);
7129 }
7130
7131 /* Return true if X is a valid address of type TYPE for machine mode MODE.
7132 If it is, fill in INFO appropriately. STRICT_P is true if
7133 REG_OK_STRICT is in effect. */
7134
7135 bool
7136 aarch64_classify_address (struct aarch64_address_info *info,
7137 rtx x, machine_mode mode, bool strict_p,
7138 aarch64_addr_query_type type)
7139 {
7140 enum rtx_code code = GET_CODE (x);
7141 rtx op0, op1;
7142 poly_int64 offset;
7143
7144 HOST_WIDE_INT const_size;
7145
7146 /* On BE, we use load/store pair for all large int mode load/stores.
7147 TI/TFmode may also use a load/store pair. */
7148 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7149 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
7150 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
7151 || type == ADDR_QUERY_LDP_STP_N
7152 || mode == TImode
7153 || mode == TFmode
7154 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
7155
7156 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
7157 corresponds to the full size of the memory being loaded/stored and
7158 the mode used to classify the address is half of that size. */
7159 if (type == ADDR_QUERY_LDP_STP_N
7160 && known_eq (GET_MODE_SIZE (mode), 16))
7161 mode = DFmode;
7162
7163 bool allow_reg_index_p = (!load_store_pair_p
7164 && (known_lt (GET_MODE_SIZE (mode), 16)
7165 || vec_flags == VEC_ADVSIMD
7166 || vec_flags & VEC_SVE_DATA));
7167
7168 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
7169 [Rn, #offset, MUL VL]. */
7170 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
7171 && (code != REG && code != PLUS))
7172 return false;
7173
7174 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
7175 REG addressing. */
7176 if (advsimd_struct_p
7177 && !BYTES_BIG_ENDIAN
7178 && (code != POST_INC && code != REG))
7179 return false;
7180
7181 gcc_checking_assert (GET_MODE (x) == VOIDmode
7182 || SCALAR_INT_MODE_P (GET_MODE (x)));
7183
7184 switch (code)
7185 {
7186 case REG:
7187 case SUBREG:
7188 info->type = ADDRESS_REG_IMM;
7189 info->base = x;
7190 info->offset = const0_rtx;
7191 info->const_offset = 0;
7192 return aarch64_base_register_rtx_p (x, strict_p);
7193
7194 case PLUS:
7195 op0 = XEXP (x, 0);
7196 op1 = XEXP (x, 1);
7197
7198 if (! strict_p
7199 && REG_P (op0)
7200 && virt_or_elim_regno_p (REGNO (op0))
7201 && poly_int_rtx_p (op1, &offset))
7202 {
7203 info->type = ADDRESS_REG_IMM;
7204 info->base = op0;
7205 info->offset = op1;
7206 info->const_offset = offset;
7207
7208 return true;
7209 }
7210
7211 if (maybe_ne (GET_MODE_SIZE (mode), 0)
7212 && aarch64_base_register_rtx_p (op0, strict_p)
7213 && poly_int_rtx_p (op1, &offset))
7214 {
7215 info->type = ADDRESS_REG_IMM;
7216 info->base = op0;
7217 info->offset = op1;
7218 info->const_offset = offset;
7219
7220 /* TImode and TFmode values are allowed in both pairs of X
7221 registers and individual Q registers. The available
7222 address modes are:
7223 X,X: 7-bit signed scaled offset
7224 Q: 9-bit signed offset
7225 We conservatively require an offset representable in either mode.
7226 When performing the check for pairs of X registers i.e. LDP/STP
7227 pass down DImode since that is the natural size of the LDP/STP
7228 instruction memory accesses. */
7229 if (mode == TImode || mode == TFmode)
7230 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
7231 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7232 || offset_12bit_unsigned_scaled_p (mode, offset)));
7233
7234 /* A 7-bit offset check because OImode will emit an ldp/stp
7235 instruction (only big endian will get here).
7236 For ldp/stp instructions, the offset is scaled for the size of a
7237 single element of the pair. */
7238 if (mode == OImode)
7239 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
7240
7241 /* Three 9/12-bit offset checks because CImode will emit three
7242 ldr/str instructions (only big endian will get here). */
7243 if (mode == CImode)
7244 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7245 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
7246 offset + 32)
7247 || offset_12bit_unsigned_scaled_p (V16QImode,
7248 offset + 32)));
7249
7250 /* Two 7-bit offset checks because XImode will emit two ldp/stp
7251 instructions (only big endian will get here). */
7252 if (mode == XImode)
7253 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
7254 && aarch64_offset_7bit_signed_scaled_p (TImode,
7255 offset + 32));
7256
7257 /* Make "m" use the LD1 offset range for SVE data modes, so
7258 that pre-RTL optimizers like ivopts will work to that range
7259 instead of the wider LDR/STR range. */
7260 if (vec_flags == VEC_SVE_DATA)
7261 return (type == ADDR_QUERY_M
7262 ? offset_4bit_signed_scaled_p (mode, offset)
7263 : offset_9bit_signed_scaled_p (mode, offset));
7264
7265 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
7266 {
7267 poly_int64 end_offset = (offset
7268 + GET_MODE_SIZE (mode)
7269 - BYTES_PER_SVE_VECTOR);
7270 return (type == ADDR_QUERY_M
7271 ? offset_4bit_signed_scaled_p (mode, offset)
7272 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
7273 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
7274 end_offset)));
7275 }
7276
7277 if (vec_flags == VEC_SVE_PRED)
7278 return offset_9bit_signed_scaled_p (mode, offset);
7279
7280 if (load_store_pair_p)
7281 return ((known_eq (GET_MODE_SIZE (mode), 4)
7282 || known_eq (GET_MODE_SIZE (mode), 8)
7283 || known_eq (GET_MODE_SIZE (mode), 16))
7284 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7285 else
7286 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
7287 || offset_12bit_unsigned_scaled_p (mode, offset));
7288 }
7289
7290 if (allow_reg_index_p)
7291 {
7292 /* Look for base + (scaled/extended) index register. */
7293 if (aarch64_base_register_rtx_p (op0, strict_p)
7294 && aarch64_classify_index (info, op1, mode, strict_p))
7295 {
7296 info->base = op0;
7297 return true;
7298 }
7299 if (aarch64_base_register_rtx_p (op1, strict_p)
7300 && aarch64_classify_index (info, op0, mode, strict_p))
7301 {
7302 info->base = op1;
7303 return true;
7304 }
7305 }
7306
7307 return false;
7308
7309 case POST_INC:
7310 case POST_DEC:
7311 case PRE_INC:
7312 case PRE_DEC:
7313 info->type = ADDRESS_REG_WB;
7314 info->base = XEXP (x, 0);
7315 info->offset = NULL_RTX;
7316 return aarch64_base_register_rtx_p (info->base, strict_p);
7317
7318 case POST_MODIFY:
7319 case PRE_MODIFY:
7320 info->type = ADDRESS_REG_WB;
7321 info->base = XEXP (x, 0);
7322 if (GET_CODE (XEXP (x, 1)) == PLUS
7323 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
7324 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
7325 && aarch64_base_register_rtx_p (info->base, strict_p))
7326 {
7327 info->offset = XEXP (XEXP (x, 1), 1);
7328 info->const_offset = offset;
7329
7330 /* TImode and TFmode values are allowed in both pairs of X
7331 registers and individual Q registers. The available
7332 address modes are:
7333 X,X: 7-bit signed scaled offset
7334 Q: 9-bit signed offset
7335 We conservatively require an offset representable in either mode.
7336 */
7337 if (mode == TImode || mode == TFmode)
7338 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
7339 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
7340
7341 if (load_store_pair_p)
7342 return ((known_eq (GET_MODE_SIZE (mode), 4)
7343 || known_eq (GET_MODE_SIZE (mode), 8)
7344 || known_eq (GET_MODE_SIZE (mode), 16))
7345 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
7346 else
7347 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
7348 }
7349 return false;
7350
7351 case CONST:
7352 case SYMBOL_REF:
7353 case LABEL_REF:
7354 /* Load literal: PC-relative constant pool entry. Only supported
7355 for SImode or larger. */
7356 info->type = ADDRESS_SYMBOLIC;
7357
7358 if (!load_store_pair_p
7359 && GET_MODE_SIZE (mode).is_constant (&const_size)
7360 && const_size >= 4)
7361 {
7362 rtx sym, addend;
7363
7364 split_const (x, &sym, &addend);
7365 return ((GET_CODE (sym) == LABEL_REF
7366 || (GET_CODE (sym) == SYMBOL_REF
7367 && CONSTANT_POOL_ADDRESS_P (sym)
7368 && aarch64_pcrelative_literal_loads)));
7369 }
7370 return false;
7371
7372 case LO_SUM:
7373 info->type = ADDRESS_LO_SUM;
7374 info->base = XEXP (x, 0);
7375 info->offset = XEXP (x, 1);
7376 if (allow_reg_index_p
7377 && aarch64_base_register_rtx_p (info->base, strict_p))
7378 {
7379 rtx sym, offs;
7380 split_const (info->offset, &sym, &offs);
7381 if (GET_CODE (sym) == SYMBOL_REF
7382 && (aarch64_classify_symbol (sym, INTVAL (offs))
7383 == SYMBOL_SMALL_ABSOLUTE))
7384 {
7385 /* The symbol and offset must be aligned to the access size. */
7386 unsigned int align;
7387
7388 if (CONSTANT_POOL_ADDRESS_P (sym))
7389 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
7390 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
7391 {
7392 tree exp = SYMBOL_REF_DECL (sym);
7393 align = TYPE_ALIGN (TREE_TYPE (exp));
7394 align = aarch64_constant_alignment (exp, align);
7395 }
7396 else if (SYMBOL_REF_DECL (sym))
7397 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
7398 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
7399 && SYMBOL_REF_BLOCK (sym) != NULL)
7400 align = SYMBOL_REF_BLOCK (sym)->alignment;
7401 else
7402 align = BITS_PER_UNIT;
7403
7404 poly_int64 ref_size = GET_MODE_SIZE (mode);
7405 if (known_eq (ref_size, 0))
7406 ref_size = GET_MODE_SIZE (DImode);
7407
7408 return (multiple_p (INTVAL (offs), ref_size)
7409 && multiple_p (align / BITS_PER_UNIT, ref_size));
7410 }
7411 }
7412 return false;
7413
7414 default:
7415 return false;
7416 }
7417 }
7418
7419 /* Return true if the address X is valid for a PRFM instruction.
7420 STRICT_P is true if we should do strict checking with
7421 aarch64_classify_address. */
7422
7423 bool
7424 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
7425 {
7426 struct aarch64_address_info addr;
7427
7428 /* PRFM accepts the same addresses as DImode... */
7429 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
7430 if (!res)
7431 return false;
7432
7433 /* ... except writeback forms. */
7434 return addr.type != ADDRESS_REG_WB;
7435 }
7436
7437 bool
7438 aarch64_symbolic_address_p (rtx x)
7439 {
7440 rtx offset;
7441
7442 split_const (x, &x, &offset);
7443 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
7444 }
7445
7446 /* Classify the base of symbolic expression X. */
7447
7448 enum aarch64_symbol_type
7449 aarch64_classify_symbolic_expression (rtx x)
7450 {
7451 rtx offset;
7452
7453 split_const (x, &x, &offset);
7454 return aarch64_classify_symbol (x, INTVAL (offset));
7455 }
7456
7457
7458 /* Return TRUE if X is a legitimate address for accessing memory in
7459 mode MODE. */
7460 static bool
7461 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
7462 {
7463 struct aarch64_address_info addr;
7464
7465 return aarch64_classify_address (&addr, x, mode, strict_p);
7466 }
7467
7468 /* Return TRUE if X is a legitimate address of type TYPE for accessing
7469 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
7470 bool
7471 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
7472 aarch64_addr_query_type type)
7473 {
7474 struct aarch64_address_info addr;
7475
7476 return aarch64_classify_address (&addr, x, mode, strict_p, type);
7477 }
7478
7479 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
7480
7481 static bool
7482 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
7483 poly_int64 orig_offset,
7484 machine_mode mode)
7485 {
7486 HOST_WIDE_INT size;
7487 if (GET_MODE_SIZE (mode).is_constant (&size))
7488 {
7489 HOST_WIDE_INT const_offset, second_offset;
7490
7491 /* A general SVE offset is A * VQ + B. Remove the A component from
7492 coefficient 0 in order to get the constant B. */
7493 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
7494
7495 /* Split an out-of-range address displacement into a base and
7496 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
7497 range otherwise to increase opportunities for sharing the base
7498 address of different sizes. Unaligned accesses use the signed
7499 9-bit range, TImode/TFmode use the intersection of signed
7500 scaled 7-bit and signed 9-bit offset. */
7501 if (mode == TImode || mode == TFmode)
7502 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
7503 else if ((const_offset & (size - 1)) != 0)
7504 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
7505 else
7506 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
7507
7508 if (second_offset == 0 || known_eq (orig_offset, second_offset))
7509 return false;
7510
7511 /* Split the offset into second_offset and the rest. */
7512 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7513 *offset2 = gen_int_mode (second_offset, Pmode);
7514 return true;
7515 }
7516 else
7517 {
7518 /* Get the mode we should use as the basis of the range. For structure
7519 modes this is the mode of one vector. */
7520 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7521 machine_mode step_mode
7522 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
7523
7524 /* Get the "mul vl" multiplier we'd like to use. */
7525 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
7526 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
7527 if (vec_flags & VEC_SVE_DATA)
7528 /* LDR supports a 9-bit range, but the move patterns for
7529 structure modes require all vectors to be in range of the
7530 same base. The simplest way of accommodating that while still
7531 promoting reuse of anchor points between different modes is
7532 to use an 8-bit range unconditionally. */
7533 vnum = ((vnum + 128) & 255) - 128;
7534 else
7535 /* Predicates are only handled singly, so we might as well use
7536 the full range. */
7537 vnum = ((vnum + 256) & 511) - 256;
7538 if (vnum == 0)
7539 return false;
7540
7541 /* Convert the "mul vl" multiplier into a byte offset. */
7542 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
7543 if (known_eq (second_offset, orig_offset))
7544 return false;
7545
7546 /* Split the offset into second_offset and the rest. */
7547 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
7548 *offset2 = gen_int_mode (second_offset, Pmode);
7549 return true;
7550 }
7551 }
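/* For example, a DImode access at constant offset 0x10010 is split into
   0x10000 and 0x10, so that the large part can be added to the base
   register while the remaining 0x10 stays within the scaled 12-bit
   immediate range.  */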
7552
7553 /* Return the binary representation of floating point constant VALUE in INTVAL.
7554 If the value cannot be converted, return false without setting INTVAL.
7555 The conversion is done in the mode of VALUE. */
7556 bool
7557 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
7558 {
7559
7560 /* We make a general exception for 0. */
7561 if (aarch64_float_const_zero_rtx_p (value))
7562 {
7563 *intval = 0;
7564 return true;
7565 }
7566
7567 scalar_float_mode mode;
7568 if (GET_CODE (value) != CONST_DOUBLE
7569 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7570 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7571 /* Only support up to DF mode. */
7572 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7573 return false;
7574
7575 unsigned HOST_WIDE_INT ival = 0;
7576
7577 long res[2];
7578 real_to_target (res,
7579 CONST_DOUBLE_REAL_VALUE (value),
7580 REAL_MODE_FORMAT (mode));
7581
7582 if (mode == DFmode)
7583 {
7584 int order = BYTES_BIG_ENDIAN ? 1 : 0;
7585 ival = zext_hwi (res[order], 32);
7586 ival |= (zext_hwi (res[1 - order], 32) << 32);
7587 }
7588 else
7589 ival = zext_hwi (res[0], 32);
7590
7591 *intval = ival;
7592 return true;
7593 }
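/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0f as 0x3f800000.  */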
7594
7595 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7596 single MOV(+MOVK) followed by an FMOV. */
7597 bool
7598 aarch64_float_const_rtx_p (rtx x)
7599 {
7600 machine_mode mode = GET_MODE (x);
7601 if (mode == VOIDmode)
7602 return false;
7603
7604 /* Determine whether it's cheaper to write float constants as
7605 mov/movk pairs than as ldr/adrp pairs. */
7606 unsigned HOST_WIDE_INT ival;
7607
7608 if (GET_CODE (x) == CONST_DOUBLE
7609 && SCALAR_FLOAT_MODE_P (mode)
7610 && aarch64_reinterpret_float_as_int (x, &ival))
7611 {
7612 scalar_int_mode imode = (mode == HFmode
7613 ? SImode
7614 : int_mode_for_mode (mode).require ());
7615 int num_instr = aarch64_internal_mov_immediate
7616 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
7617 return num_instr < 3;
7618 }
7619
7620 return false;
7621 }
7622
7623 /* Return TRUE if rtx X is the immediate constant 0.0. */
7624 bool
7625 aarch64_float_const_zero_rtx_p (rtx x)
7626 {
7627 if (GET_MODE (x) == VOIDmode)
7628 return false;
7629
7630 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7631 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7632 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7633 }
7634
7635 /* Return TRUE if rtx X is an immediate constant that fits in a single
7636 MOVI immediate operation. */
7637 bool
7638 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7639 {
7640 if (!TARGET_SIMD)
7641 return false;
7642
7643 machine_mode vmode;
7644 scalar_int_mode imode;
7645 unsigned HOST_WIDE_INT ival;
7646
7647 if (GET_CODE (x) == CONST_DOUBLE
7648 && SCALAR_FLOAT_MODE_P (mode))
7649 {
7650 if (!aarch64_reinterpret_float_as_int (x, &ival))
7651 return false;
7652
7653 /* We make a general exception for 0. */
7654 if (aarch64_float_const_zero_rtx_p (x))
7655 return true;
7656
7657 imode = int_mode_for_mode (mode).require ();
7658 }
7659 else if (GET_CODE (x) == CONST_INT
7660 && is_a <scalar_int_mode> (mode, &imode))
7661 ival = INTVAL (x);
7662 else
7663 return false;
7664
7665 /* Use a 64-bit vector mode for everything except DImode/DFmode, where
7666 we use a 128-bit vector mode. */
7667 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7668
7669 vmode = aarch64_simd_container_mode (imode, width);
7670 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7671
7672 return aarch64_simd_valid_immediate (v_op, NULL);
7673 }
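/* For example, the SFmode constant 2.0f (bit pattern 0x40000000) should be
   accepted, since MOVI can materialise 0x40 shifted left by 24 in every
   32-bit lane, whereas an arbitrary pattern such as 0x3eaaaaab (roughly
   0.333f) is rejected.  */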
7674
7675
7676 /* Return the fixed registers used for condition codes. */
7677
7678 static bool
7679 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7680 {
7681 *p1 = CC_REGNUM;
7682 *p2 = INVALID_REGNUM;
7683 return true;
7684 }
7685
7686 /* This function is used by the call expanders of the machine description.
7687 RESULT is the register in which the result is returned. It's NULL for
7688 "call" and "sibcall".
7689 MEM is the location of the function call.
7690 SIBCALL indicates whether this function call is a normal call or a sibling
7691 call; a different pattern is generated accordingly. */
7692
7693 void
7694 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7695 {
7696 rtx call, callee, tmp;
7697 rtvec vec;
7698 machine_mode mode;
7699
7700 gcc_assert (MEM_P (mem));
7701 callee = XEXP (mem, 0);
7702 mode = GET_MODE (callee);
7703 gcc_assert (mode == Pmode);
7704
7705 /* Decide if we should generate indirect calls by loading the
7706 address of the callee into a register before performing
7707 the branch-and-link. */
7708 if (SYMBOL_REF_P (callee)
7709 ? (aarch64_is_long_call_p (callee)
7710 || aarch64_is_noplt_call_p (callee))
7711 : !REG_P (callee))
7712 XEXP (mem, 0) = force_reg (mode, callee);
7713
7714 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7715
7716 if (result != NULL_RTX)
7717 call = gen_rtx_SET (result, call);
7718
7719 if (sibcall)
7720 tmp = ret_rtx;
7721 else
7722 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7723
7724 vec = gen_rtvec (2, call, tmp);
7725 call = gen_rtx_PARALLEL (VOIDmode, vec);
7726
7727 aarch64_emit_call_insn (call);
7728 }
7729
7730 /* Emit call insn with PAT and do aarch64-specific handling. */
7731
7732 void
7733 aarch64_emit_call_insn (rtx pat)
7734 {
7735 rtx insn = emit_call_insn (pat);
7736
7737 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7738 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7739 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7740 }
7741
7742 machine_mode
7743 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7744 {
7745 machine_mode mode_x = GET_MODE (x);
7746 rtx_code code_x = GET_CODE (x);
7747
7748 /* Floating-point comparisons return CCFPmode if the comparison need not
7749 raise an exception for unordered operands, and CCFPEmode otherwise. */
7750 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7751 {
7752 switch (code)
7753 {
7754 case EQ:
7755 case NE:
7756 case UNORDERED:
7757 case ORDERED:
7758 case UNLT:
7759 case UNLE:
7760 case UNGT:
7761 case UNGE:
7762 case UNEQ:
7763 return CCFPmode;
7764
7765 case LT:
7766 case LE:
7767 case GT:
7768 case GE:
7769 case LTGT:
7770 return CCFPEmode;
7771
7772 default:
7773 gcc_unreachable ();
7774 }
7775 }
7776
7777 /* Equality comparisons of short modes against zero can be performed
7778 using the TST instruction with the appropriate bitmask. */
7779 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7780 && (code == EQ || code == NE)
7781 && (mode_x == HImode || mode_x == QImode))
7782 return CC_NZmode;
7783
7784 /* Similarly, comparisons of zero_extends from shorter modes can
7785 be performed using an ANDS with an immediate mask. */
7786 if (y == const0_rtx && code_x == ZERO_EXTEND
7787 && (mode_x == SImode || mode_x == DImode)
7788 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7789 && (code == EQ || code == NE))
7790 return CC_NZmode;
7791
7792 if ((mode_x == SImode || mode_x == DImode)
7793 && y == const0_rtx
7794 && (code == EQ || code == NE || code == LT || code == GE)
7795 && (code_x == PLUS || code_x == MINUS || code_x == AND
7796 || code_x == NEG
7797 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7798 && CONST_INT_P (XEXP (x, 2)))))
7799 return CC_NZmode;
7800
7801 /* A compare with a shifted operand. Because of canonicalization,
7802 the comparison will have to be swapped when we emit the assembly
7803 code. */
7804 if ((mode_x == SImode || mode_x == DImode)
7805 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7806 && (code_x == ASHIFT || code_x == ASHIFTRT
7807 || code_x == LSHIFTRT
7808 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7809 return CC_SWPmode;
7810
7811 /* Similarly for a negated operand, but we can only do this for
7812 equalities. */
7813 if ((mode_x == SImode || mode_x == DImode)
7814 && (REG_P (y) || GET_CODE (y) == SUBREG)
7815 && (code == EQ || code == NE)
7816 && code_x == NEG)
7817 return CC_Zmode;
7818
7819 /* A test for unsigned overflow from an addition. */
7820 if ((mode_x == DImode || mode_x == TImode)
7821 && (code == LTU || code == GEU)
7822 && code_x == PLUS
7823 && rtx_equal_p (XEXP (x, 0), y))
7824 return CC_Cmode;
7825
7826 /* A test for unsigned overflow from an add with carry. */
7827 if ((mode_x == DImode || mode_x == TImode)
7828 && (code == LTU || code == GEU)
7829 && code_x == PLUS
7830 && CONST_SCALAR_INT_P (y)
7831 && (rtx_mode_t (y, mode_x)
7832 == (wi::shwi (1, mode_x)
7833 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7834 return CC_ADCmode;
7835
7836 /* A test for signed overflow. */
7837 if ((mode_x == DImode || mode_x == TImode)
7838 && code == NE
7839 && code_x == PLUS
7840 && GET_CODE (y) == SIGN_EXTEND)
7841 return CC_Vmode;
7842
7843 /* For everything else, return CCmode. */
7844 return CCmode;
7845 }
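/* For example, comparing (ashift:DI (reg) (const_int 2)) against another
   register selects CC_SWPmode, while an EQ/NE/LT/GE comparison of a
   DImode PLUS against zero selects CC_NZmode so that the flags set by an
   ADDS can be reused.  */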
7846
7847 static int
7848 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7849
7850 int
7851 aarch64_get_condition_code (rtx x)
7852 {
7853 machine_mode mode = GET_MODE (XEXP (x, 0));
7854 enum rtx_code comp_code = GET_CODE (x);
7855
7856 if (GET_MODE_CLASS (mode) != MODE_CC)
7857 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7858 return aarch64_get_condition_code_1 (mode, comp_code);
7859 }
7860
7861 static int
7862 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7863 {
7864 switch (mode)
7865 {
7866 case E_CCFPmode:
7867 case E_CCFPEmode:
7868 switch (comp_code)
7869 {
7870 case GE: return AARCH64_GE;
7871 case GT: return AARCH64_GT;
7872 case LE: return AARCH64_LS;
7873 case LT: return AARCH64_MI;
7874 case NE: return AARCH64_NE;
7875 case EQ: return AARCH64_EQ;
7876 case ORDERED: return AARCH64_VC;
7877 case UNORDERED: return AARCH64_VS;
7878 case UNLT: return AARCH64_LT;
7879 case UNLE: return AARCH64_LE;
7880 case UNGT: return AARCH64_HI;
7881 case UNGE: return AARCH64_PL;
7882 default: return -1;
7883 }
7884 break;
7885
7886 case E_CCmode:
7887 switch (comp_code)
7888 {
7889 case NE: return AARCH64_NE;
7890 case EQ: return AARCH64_EQ;
7891 case GE: return AARCH64_GE;
7892 case GT: return AARCH64_GT;
7893 case LE: return AARCH64_LE;
7894 case LT: return AARCH64_LT;
7895 case GEU: return AARCH64_CS;
7896 case GTU: return AARCH64_HI;
7897 case LEU: return AARCH64_LS;
7898 case LTU: return AARCH64_CC;
7899 default: return -1;
7900 }
7901 break;
7902
7903 case E_CC_SWPmode:
7904 switch (comp_code)
7905 {
7906 case NE: return AARCH64_NE;
7907 case EQ: return AARCH64_EQ;
7908 case GE: return AARCH64_LE;
7909 case GT: return AARCH64_LT;
7910 case LE: return AARCH64_GE;
7911 case LT: return AARCH64_GT;
7912 case GEU: return AARCH64_LS;
7913 case GTU: return AARCH64_CC;
7914 case LEU: return AARCH64_CS;
7915 case LTU: return AARCH64_HI;
7916 default: return -1;
7917 }
7918 break;
7919
7920 case E_CC_NZCmode:
7921 switch (comp_code)
7922 {
7923 case NE: return AARCH64_NE; /* = any */
7924 case EQ: return AARCH64_EQ; /* = none */
7925 case GE: return AARCH64_PL; /* = nfrst */
7926 case LT: return AARCH64_MI; /* = first */
7927 case GEU: return AARCH64_CS; /* = nlast */
7928 case GTU: return AARCH64_HI; /* = pmore */
7929 case LEU: return AARCH64_LS; /* = plast */
7930 case LTU: return AARCH64_CC; /* = last */
7931 default: return -1;
7932 }
7933 break;
7934
7935 case E_CC_NZmode:
7936 switch (comp_code)
7937 {
7938 case NE: return AARCH64_NE;
7939 case EQ: return AARCH64_EQ;
7940 case GE: return AARCH64_PL;
7941 case LT: return AARCH64_MI;
7942 default: return -1;
7943 }
7944 break;
7945
7946 case E_CC_Zmode:
7947 switch (comp_code)
7948 {
7949 case NE: return AARCH64_NE;
7950 case EQ: return AARCH64_EQ;
7951 default: return -1;
7952 }
7953 break;
7954
7955 case E_CC_Cmode:
7956 switch (comp_code)
7957 {
7958 case LTU: return AARCH64_CS;
7959 case GEU: return AARCH64_CC;
7960 default: return -1;
7961 }
7962 break;
7963
7964 case E_CC_ADCmode:
7965 switch (comp_code)
7966 {
7967 case GEU: return AARCH64_CS;
7968 case LTU: return AARCH64_CC;
7969 default: return -1;
7970 }
7971 break;
7972
7973 case E_CC_Vmode:
7974 switch (comp_code)
7975 {
7976 case NE: return AARCH64_VS;
7977 case EQ: return AARCH64_VC;
7978 default: return -1;
7979 }
7980 break;
7981
7982 default:
7983 return -1;
7984 }
7985
7986 return -1;
7987 }
7988
7989 bool
7990 aarch64_const_vec_all_same_in_range_p (rtx x,
7991 HOST_WIDE_INT minval,
7992 HOST_WIDE_INT maxval)
7993 {
7994 rtx elt;
7995 return (const_vec_duplicate_p (x, &elt)
7996 && CONST_INT_P (elt)
7997 && IN_RANGE (INTVAL (elt), minval, maxval));
7998 }
7999
8000 bool
8001 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
8002 {
8003 return aarch64_const_vec_all_same_in_range_p (x, val, val);
8004 }
8005
8006 /* Return true if VEC is a constant in which every element is in the range
8007 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
8008
8009 static bool
8010 aarch64_const_vec_all_in_range_p (rtx vec,
8011 HOST_WIDE_INT minval,
8012 HOST_WIDE_INT maxval)
8013 {
8014 if (GET_CODE (vec) != CONST_VECTOR
8015 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
8016 return false;
8017
8018 int nunits;
8019 if (!CONST_VECTOR_STEPPED_P (vec))
8020 nunits = const_vector_encoded_nelts (vec);
8021 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
8022 return false;
8023
8024 for (int i = 0; i < nunits; i++)
8025 {
8026 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
8027 if (!CONST_INT_P (vec_elem)
8028 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
8029 return false;
8030 }
8031 return true;
8032 }
8033
8034 /* N Z C V. */
8035 #define AARCH64_CC_V 1
8036 #define AARCH64_CC_C (1 << 1)
8037 #define AARCH64_CC_Z (1 << 2)
8038 #define AARCH64_CC_N (1 << 3)
8039
8040 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
8041 static const int aarch64_nzcv_codes[] =
8042 {
8043 0, /* EQ, Z == 1. */
8044 AARCH64_CC_Z, /* NE, Z == 0. */
8045 0, /* CS, C == 1. */
8046 AARCH64_CC_C, /* CC, C == 0. */
8047 0, /* MI, N == 1. */
8048 AARCH64_CC_N, /* PL, N == 0. */
8049 0, /* VS, V == 1. */
8050 AARCH64_CC_V, /* VC, V == 0. */
8051 0, /* HI, C == 1 && Z == 0. */
8052 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
8053 AARCH64_CC_V, /* GE, N == V. */
8054 0, /* LT, N != V. */
8055 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
8056 0, /* LE, !(Z == 0 && N == V). */
8057 0, /* AL, Any. */
8058 0 /* NV, Any. */
8059 };
8060
8061 /* Print floating-point vector immediate operand X to F, negating it
8062 first if NEGATE is true. Return true on success, false if it isn't
8063 a constant we can handle. */
8064
8065 static bool
8066 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
8067 {
8068 rtx elt;
8069
8070 if (!const_vec_duplicate_p (x, &elt))
8071 return false;
8072
8073 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
8074 if (negate)
8075 r = real_value_negate (&r);
8076
8077 /* We only handle the SVE single-bit immediates here. */
8078 if (real_equal (&r, &dconst0))
8079 asm_fprintf (f, "0.0");
8080 else if (real_equal (&r, &dconst1))
8081 asm_fprintf (f, "1.0");
8082 else if (real_equal (&r, &dconsthalf))
8083 asm_fprintf (f, "0.5");
8084 else
8085 return false;
8086
8087 return true;
8088 }
8089
8090 /* Return the equivalent letter for size. */
8091 static char
8092 sizetochar (int size)
8093 {
8094 switch (size)
8095 {
8096 case 64: return 'd';
8097 case 32: return 's';
8098 case 16: return 'h';
8099 case 8 : return 'b';
8100 default: gcc_unreachable ();
8101 }
8102 }
8103
8104 /* Print operand X to file F in a target specific manner according to CODE.
8105 The acceptable formatting commands given by CODE are:
8106 'c': An integer or symbol address without a preceding #
8107 sign.
8108 'C': Take the duplicated element in a vector constant
8109 and print it in hex.
8110 'D': Take the duplicated element in a vector constant
8111 and print it as an unsigned integer, in decimal.
8112 'e': Print the sign/zero-extend size as a character 8->b,
8113 16->h, 32->w.
8114 'p': Prints N such that 2^N == X (X must be a power of 2 and
8115 a const_int).
8116 'P': Print the number of non-zero bits in X (a const_int).
8117 'H': Print the higher numbered register of a pair (TImode)
8118 of regs.
8119 'm': Print a condition (eq, ne, etc).
8120 'M': Same as 'm', but invert condition.
8121 'N': Take the duplicated element in a vector constant
8122 and print the negative of it in decimal.
8123 'b/h/s/d/q': Print a scalar FP/SIMD register name.
8124 'S/T/U/V': Print a FP/SIMD register name for a register list.
8125 The register printed is the FP/SIMD register name
8126 of X + 0/1/2/3 for S/T/U/V.
8127 'R': Print a scalar FP/SIMD register name + 1.
8128 'X': Print bottom 16 bits of integer constant in hex.
8129 'w/x': Print a general register name or the zero register
8130 (32-bit or 64-bit).
8131 '0': Print a normal operand; if it's a general register,
8132 then we assume DImode.
8133 'k': Print NZCV for conditional compare instructions.
8134 'A': Output address constant representing the first
8135 argument of X, specifying a relocation offset
8136 if appropriate.
8137 'L': Output constant address specified by X
8138 with a relocation offset if appropriate.
8139 'G': Prints address of X, specifying a PC relative
8140 relocation mode if appropriate.
8141 'y': Output address of LDP or STP - this is used for
8142 some LDP/STPs which don't use a PARALLEL in their
8143 pattern (so the mode needs to be adjusted).
8144 'z': Output address of a typical LDP or STP. */
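/* For example, if operand 0 is a DImode general register (say x0) and
   operand 1 is the constant 16, a template such as "add\t%x0, %x0, %1"
   prints "add x0, x0, 16", and "%w0" would print "w0" instead of "x0".  */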
8145
8146 static void
8147 aarch64_print_operand (FILE *f, rtx x, int code)
8148 {
8149 rtx elt;
8150 switch (code)
8151 {
8152 case 'c':
8153 switch (GET_CODE (x))
8154 {
8155 case CONST_INT:
8156 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8157 break;
8158
8159 case SYMBOL_REF:
8160 output_addr_const (f, x);
8161 break;
8162
8163 case CONST:
8164 if (GET_CODE (XEXP (x, 0)) == PLUS
8165 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
8166 {
8167 output_addr_const (f, x);
8168 break;
8169 }
8170 /* Fall through. */
8171
8172 default:
8173 output_operand_lossage ("unsupported operand for code '%c'", code);
8174 }
8175 break;
8176
8177 case 'e':
8178 {
8179 int n;
8180
8181 if (!CONST_INT_P (x)
8182 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
8183 {
8184 output_operand_lossage ("invalid operand for '%%%c'", code);
8185 return;
8186 }
8187
8188 switch (n)
8189 {
8190 case 3:
8191 fputc ('b', f);
8192 break;
8193 case 4:
8194 fputc ('h', f);
8195 break;
8196 case 5:
8197 fputc ('w', f);
8198 break;
8199 default:
8200 output_operand_lossage ("invalid operand for '%%%c'", code);
8201 return;
8202 }
8203 }
8204 break;
8205
8206 case 'p':
8207 {
8208 int n;
8209
8210 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
8211 {
8212 output_operand_lossage ("invalid operand for '%%%c'", code);
8213 return;
8214 }
8215
8216 asm_fprintf (f, "%d", n);
8217 }
8218 break;
8219
8220 case 'P':
8221 if (!CONST_INT_P (x))
8222 {
8223 output_operand_lossage ("invalid operand for '%%%c'", code);
8224 return;
8225 }
8226
8227 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
8228 break;
8229
8230 case 'H':
8231 if (x == const0_rtx)
8232 {
8233 asm_fprintf (f, "xzr");
8234 break;
8235 }
8236
8237 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
8238 {
8239 output_operand_lossage ("invalid operand for '%%%c'", code);
8240 return;
8241 }
8242
8243 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
8244 break;
8245
8246 case 'M':
8247 case 'm':
8248 {
8249 int cond_code;
8250 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
8251 if (x == const_true_rtx)
8252 {
8253 if (code == 'M')
8254 fputs ("nv", f);
8255 return;
8256 }
8257
8258 if (!COMPARISON_P (x))
8259 {
8260 output_operand_lossage ("invalid operand for '%%%c'", code);
8261 return;
8262 }
8263
8264 cond_code = aarch64_get_condition_code (x);
8265 gcc_assert (cond_code >= 0);
8266 if (code == 'M')
8267 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
8268 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
8269 fputs (aarch64_sve_condition_codes[cond_code], f);
8270 else
8271 fputs (aarch64_condition_codes[cond_code], f);
8272 }
8273 break;
8274
8275 case 'N':
8276 if (!const_vec_duplicate_p (x, &elt))
8277 {
8278 output_operand_lossage ("invalid vector constant");
8279 return;
8280 }
8281
8282 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8283 asm_fprintf (f, "%wd", -INTVAL (elt));
8284 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8285 && aarch64_print_vector_float_operand (f, x, true))
8286 ;
8287 else
8288 {
8289 output_operand_lossage ("invalid vector constant");
8290 return;
8291 }
8292 break;
8293
8294 case 'b':
8295 case 'h':
8296 case 's':
8297 case 'd':
8298 case 'q':
8299 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8300 {
8301 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8302 return;
8303 }
8304 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
8305 break;
8306
8307 case 'S':
8308 case 'T':
8309 case 'U':
8310 case 'V':
8311 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8312 {
8313 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8314 return;
8315 }
8316 asm_fprintf (f, "%c%d",
8317 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
8318 REGNO (x) - V0_REGNUM + (code - 'S'));
8319 break;
8320
8321 case 'R':
8322 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
8323 {
8324 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
8325 return;
8326 }
8327 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
8328 break;
8329
8330 case 'X':
8331 if (!CONST_INT_P (x))
8332 {
8333 output_operand_lossage ("invalid operand for '%%%c'", code);
8334 return;
8335 }
8336 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
8337 break;
8338
8339 case 'C':
8340 {
8341 /* Print a replicated constant in hex. */
8342 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8343 {
8344 output_operand_lossage ("invalid operand for '%%%c'", code);
8345 return;
8346 }
8347 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8348 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8349 }
8350 break;
8351
8352 case 'D':
8353 {
8354 /* Print a replicated constant in decimal, treating it as
8355 unsigned. */
8356 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
8357 {
8358 output_operand_lossage ("invalid operand for '%%%c'", code);
8359 return;
8360 }
8361 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
8362 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
8363 }
8364 break;
8365
8366 case 'w':
8367 case 'x':
8368 if (x == const0_rtx
8369 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
8370 {
8371 asm_fprintf (f, "%czr", code);
8372 break;
8373 }
8374
8375 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
8376 {
8377 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
8378 break;
8379 }
8380
8381 if (REG_P (x) && REGNO (x) == SP_REGNUM)
8382 {
8383 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
8384 break;
8385 }
8386
8387 /* Fall through */
8388
8389 case 0:
8390 if (x == NULL)
8391 {
8392 output_operand_lossage ("missing operand");
8393 return;
8394 }
8395
8396 switch (GET_CODE (x))
8397 {
8398 case REG:
8399 if (aarch64_sve_data_mode_p (GET_MODE (x)))
8400 {
8401 if (REG_NREGS (x) == 1)
8402 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
8403 else
8404 {
8405 char suffix
8406 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
8407 asm_fprintf (f, "{z%d.%c - z%d.%c}",
8408 REGNO (x) - V0_REGNUM, suffix,
8409 END_REGNO (x) - V0_REGNUM - 1, suffix);
8410 }
8411 }
8412 else
8413 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
8414 break;
8415
8416 case MEM:
8417 output_address (GET_MODE (x), XEXP (x, 0));
8418 break;
8419
8420 case LABEL_REF:
8421 case SYMBOL_REF:
8422 output_addr_const (asm_out_file, x);
8423 break;
8424
8425 case CONST_INT:
8426 asm_fprintf (f, "%wd", INTVAL (x));
8427 break;
8428
8429 case CONST:
8430 if (!VECTOR_MODE_P (GET_MODE (x)))
8431 {
8432 output_addr_const (asm_out_file, x);
8433 break;
8434 }
8435 /* fall through */
8436
8437 case CONST_VECTOR:
8438 if (!const_vec_duplicate_p (x, &elt))
8439 {
8440 output_operand_lossage ("invalid vector constant");
8441 return;
8442 }
8443
8444 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
8445 asm_fprintf (f, "%wd", INTVAL (elt));
8446 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
8447 && aarch64_print_vector_float_operand (f, x, false))
8448 ;
8449 else
8450 {
8451 output_operand_lossage ("invalid vector constant");
8452 return;
8453 }
8454 break;
8455
8456 case CONST_DOUBLE:
8457 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
8458 be getting CONST_DOUBLEs holding integers. */
8459 gcc_assert (GET_MODE (x) != VOIDmode);
8460 if (aarch64_float_const_zero_rtx_p (x))
8461 {
8462 fputc ('0', f);
8463 break;
8464 }
8465 else if (aarch64_float_const_representable_p (x))
8466 {
8467 #define buf_size 20
8468 char float_buf[buf_size] = {'\0'};
8469 real_to_decimal_for_mode (float_buf,
8470 CONST_DOUBLE_REAL_VALUE (x),
8471 buf_size, buf_size,
8472 1, GET_MODE (x));
8473 asm_fprintf (asm_out_file, "%s", float_buf);
8474 break;
8475 #undef buf_size
8476 }
8477 output_operand_lossage ("invalid constant");
8478 return;
8479 default:
8480 output_operand_lossage ("invalid operand");
8481 return;
8482 }
8483 break;
8484
8485 case 'A':
8486 if (GET_CODE (x) == HIGH)
8487 x = XEXP (x, 0);
8488
8489 switch (aarch64_classify_symbolic_expression (x))
8490 {
8491 case SYMBOL_SMALL_GOT_4G:
8492 asm_fprintf (asm_out_file, ":got:");
8493 break;
8494
8495 case SYMBOL_SMALL_TLSGD:
8496 asm_fprintf (asm_out_file, ":tlsgd:");
8497 break;
8498
8499 case SYMBOL_SMALL_TLSDESC:
8500 asm_fprintf (asm_out_file, ":tlsdesc:");
8501 break;
8502
8503 case SYMBOL_SMALL_TLSIE:
8504 asm_fprintf (asm_out_file, ":gottprel:");
8505 break;
8506
8507 case SYMBOL_TLSLE24:
8508 asm_fprintf (asm_out_file, ":tprel:");
8509 break;
8510
8511 case SYMBOL_TINY_GOT:
8512 gcc_unreachable ();
8513 break;
8514
8515 default:
8516 break;
8517 }
8518 output_addr_const (asm_out_file, x);
8519 break;
8520
8521 case 'L':
8522 switch (aarch64_classify_symbolic_expression (x))
8523 {
8524 case SYMBOL_SMALL_GOT_4G:
8525 asm_fprintf (asm_out_file, ":lo12:");
8526 break;
8527
8528 case SYMBOL_SMALL_TLSGD:
8529 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
8530 break;
8531
8532 case SYMBOL_SMALL_TLSDESC:
8533 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
8534 break;
8535
8536 case SYMBOL_SMALL_TLSIE:
8537 asm_fprintf (asm_out_file, ":gottprel_lo12:");
8538 break;
8539
8540 case SYMBOL_TLSLE12:
8541 asm_fprintf (asm_out_file, ":tprel_lo12:");
8542 break;
8543
8544 case SYMBOL_TLSLE24:
8545 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
8546 break;
8547
8548 case SYMBOL_TINY_GOT:
8549 asm_fprintf (asm_out_file, ":got:");
8550 break;
8551
8552 case SYMBOL_TINY_TLSIE:
8553 asm_fprintf (asm_out_file, ":gottprel:");
8554 break;
8555
8556 default:
8557 break;
8558 }
8559 output_addr_const (asm_out_file, x);
8560 break;
8561
8562 case 'G':
8563 switch (aarch64_classify_symbolic_expression (x))
8564 {
8565 case SYMBOL_TLSLE24:
8566 asm_fprintf (asm_out_file, ":tprel_hi12:");
8567 break;
8568 default:
8569 break;
8570 }
8571 output_addr_const (asm_out_file, x);
8572 break;
8573
8574 case 'k':
8575 {
8576 HOST_WIDE_INT cond_code;
8577
8578 if (!CONST_INT_P (x))
8579 {
8580 output_operand_lossage ("invalid operand for '%%%c'", code);
8581 return;
8582 }
8583
8584 cond_code = INTVAL (x);
8585 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8586 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8587 }
8588 break;
8589
8590 case 'y':
8591 case 'z':
8592 {
8593 machine_mode mode = GET_MODE (x);
8594
8595 if (GET_CODE (x) != MEM
8596 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8597 {
8598 output_operand_lossage ("invalid operand for '%%%c'", code);
8599 return;
8600 }
8601
8602 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8603 code == 'y'
8604 ? ADDR_QUERY_LDP_STP_N
8605 : ADDR_QUERY_LDP_STP))
8606 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8607 }
8608 break;
8609
8610 default:
8611 output_operand_lossage ("invalid operand prefix '%%%c'", code);
8612 return;
8613 }
8614 }
8615
8616 /* Print address 'x' of a memory access with mode 'mode'.
8617 'type' is the aarch64_addr_query_type context required by
8618 aarch64_classify_address; it distinguishes normal accesses from LDP/STP. */
8619 static bool
8620 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8621 aarch64_addr_query_type type)
8622 {
8623 struct aarch64_address_info addr;
8624 unsigned int size;
8625
8626 /* Check that all addresses are Pmode, including under ILP32. */
8627 if (GET_MODE (x) != Pmode
8628 && (!CONST_INT_P (x)
8629 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8630 {
8631 output_operand_lossage ("invalid address mode");
8632 return false;
8633 }
8634
8635 if (aarch64_classify_address (&addr, x, mode, true, type))
8636 switch (addr.type)
8637 {
8638 case ADDRESS_REG_IMM:
8639 if (known_eq (addr.const_offset, 0))
8640 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8641 else if (aarch64_sve_data_mode_p (mode))
8642 {
8643 HOST_WIDE_INT vnum
8644 = exact_div (addr.const_offset,
8645 BYTES_PER_SVE_VECTOR).to_constant ();
8646 asm_fprintf (f, "[%s, #%wd, mul vl]",
8647 reg_names[REGNO (addr.base)], vnum);
8648 }
8649 else if (aarch64_sve_pred_mode_p (mode))
8650 {
8651 HOST_WIDE_INT vnum
8652 = exact_div (addr.const_offset,
8653 BYTES_PER_SVE_PRED).to_constant ();
8654 asm_fprintf (f, "[%s, #%wd, mul vl]",
8655 reg_names[REGNO (addr.base)], vnum);
8656 }
8657 else
8658 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8659 INTVAL (addr.offset));
8660 return true;
8661
8662 case ADDRESS_REG_REG:
8663 if (addr.shift == 0)
8664 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8665 reg_names [REGNO (addr.offset)]);
8666 else
8667 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8668 reg_names [REGNO (addr.offset)], addr.shift);
8669 return true;
8670
8671 case ADDRESS_REG_UXTW:
8672 if (addr.shift == 0)
8673 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8674 REGNO (addr.offset) - R0_REGNUM);
8675 else
8676 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8677 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8678 return true;
8679
8680 case ADDRESS_REG_SXTW:
8681 if (addr.shift == 0)
8682 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8683 REGNO (addr.offset) - R0_REGNUM);
8684 else
8685 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8686 REGNO (addr.offset) - R0_REGNUM, addr.shift);
8687 return true;
8688
8689 case ADDRESS_REG_WB:
8690 /* Writeback is only supported for fixed-width modes. */
8691 size = GET_MODE_SIZE (mode).to_constant ();
8692 switch (GET_CODE (x))
8693 {
8694 case PRE_INC:
8695 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8696 return true;
8697 case POST_INC:
8698 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8699 return true;
8700 case PRE_DEC:
8701 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8702 return true;
8703 case POST_DEC:
8704 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8705 return true;
8706 case PRE_MODIFY:
8707 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8708 INTVAL (addr.offset));
8709 return true;
8710 case POST_MODIFY:
8711 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8712 INTVAL (addr.offset));
8713 return true;
8714 default:
8715 break;
8716 }
8717 break;
8718
8719 case ADDRESS_LO_SUM:
8720 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8721 output_addr_const (f, addr.offset);
8722 asm_fprintf (f, "]");
8723 return true;
8724
8725 case ADDRESS_SYMBOLIC:
8726 output_addr_const (f, x);
8727 return true;
8728 }
8729
8730 return false;
8731 }
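/* For example, (plus (reg x1) (const_int 16)) is printed as "[x1, 16]",
   (post_inc (reg x2)) for a DImode access as "[x2], 8", and an SVE data
   mode address at an offset of one vector as "[x1, #1, mul vl]".  */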
8732
8733 /* Print address 'x' of a memory access with mode 'mode'. */
8734 static void
8735 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8736 {
8737 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8738 output_addr_const (f, x);
8739 }
8740
8741 bool
8742 aarch64_label_mentioned_p (rtx x)
8743 {
8744 const char *fmt;
8745 int i;
8746
8747 if (GET_CODE (x) == LABEL_REF)
8748 return true;
8749
8750 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8751 referencing instruction, but they are constant offsets, not
8752 symbols. */
8753 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8754 return false;
8755
8756 fmt = GET_RTX_FORMAT (GET_CODE (x));
8757 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8758 {
8759 if (fmt[i] == 'E')
8760 {
8761 int j;
8762
8763 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8764 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8765 return true;
8766 }
8767 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8768 return true;
8769 }
8770
8771 return false;
8772 }
8773
8774 /* Implement REGNO_REG_CLASS. */
8775
8776 enum reg_class
8777 aarch64_regno_regclass (unsigned regno)
8778 {
8779 if (GP_REGNUM_P (regno))
8780 return GENERAL_REGS;
8781
8782 if (regno == SP_REGNUM)
8783 return STACK_REG;
8784
8785 if (regno == FRAME_POINTER_REGNUM
8786 || regno == ARG_POINTER_REGNUM)
8787 return POINTER_REGS;
8788
8789 if (FP_REGNUM_P (regno))
8790 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
8791 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
8792
8793 if (PR_REGNUM_P (regno))
8794 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8795
8796 return NO_REGS;
8797 }
8798
8799 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8800 If OFFSET is out of range, return an offset of an anchor point
8801 that is in range. Return 0 otherwise. */
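/* For example, a misaligned SImode access at offset 0x10001 is given the
anchor 0x10000, leaving a small residual offset of 1 for the access itself. */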
8802
8803 static HOST_WIDE_INT
8804 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8805 machine_mode mode)
8806 {
8807 /* Does it look like we'll need a 16-byte load/store-pair operation? */
8808 if (size > 16)
8809 return (offset + 0x400) & ~0x7f0;
8810
8811 /* For offsets that aren't a multiple of the access size, the limit is
8812 -256...255. */
8813 if (offset & (size - 1))
8814 {
8815 /* BLKmode typically uses LDP of X-registers. */
8816 if (mode == BLKmode)
8817 return (offset + 512) & ~0x3ff;
8818 return (offset + 0x100) & ~0x1ff;
8819 }
8820
8821 /* Small negative offsets are supported. */
8822 if (IN_RANGE (offset, -256, 0))
8823 return 0;
8824
8825 if (mode == TImode || mode == TFmode)
8826 return (offset + 0x100) & ~0x1ff;
8827
8828 /* Otherwise, use the unsigned 12-bit offset range, scaled by the access size. */
8829 return offset & (~0xfff * size);
8830 }
8831
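/* Implement TARGET_LEGITIMIZE_ADDRESS. */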
8832 static rtx
8833 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
8834 {
8835 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8836 where mask is selected by alignment and size of the offset.
8837 We try to pick as large a range for the offset as possible to
8838 maximize the chance of a CSE. However, for aligned addresses
8839 we limit the range to 4k so that structures with different sized
8840 elements are likely to use the same base. We need to be careful
8841 not to split a CONST for some forms of address expression, otherwise
8842 it will generate sub-optimal code. */
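/* For example, for an SImode access, (plus (reg X) (const_int 0x13004)) is
split so that the base becomes X + 0x10000 and the remaining offset 0x3004
fits in the scaled 12-bit immediate range. */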
8843
8844 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8845 {
8846 rtx base = XEXP (x, 0);
8847 rtx offset_rtx = XEXP (x, 1);
8848 HOST_WIDE_INT offset = INTVAL (offset_rtx);
8849
8850 if (GET_CODE (base) == PLUS)
8851 {
8852 rtx op0 = XEXP (base, 0);
8853 rtx op1 = XEXP (base, 1);
8854
8855 /* Force any scaling into a temp for CSE. */
8856 op0 = force_reg (Pmode, op0);
8857 op1 = force_reg (Pmode, op1);
8858
8859 /* Let the pointer register be in op0. */
8860 if (REG_POINTER (op1))
8861 std::swap (op0, op1);
8862
8863 /* If the pointer is virtual or frame related, then we know that
8864 virtual register instantiation or register elimination is going
8865 to apply a second constant. We want the two constants folded
8866 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
8867 if (virt_or_elim_regno_p (REGNO (op0)))
8868 {
8869 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8870 NULL_RTX, true, OPTAB_DIRECT);
8871 return gen_rtx_PLUS (Pmode, base, op1);
8872 }
8873
8874 /* Otherwise, in order to encourage CSE (and thence loop strength
8875 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
8876 base = expand_binop (Pmode, add_optab, op0, op1,
8877 NULL_RTX, true, OPTAB_DIRECT);
8878 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8879 }
8880
8881 HOST_WIDE_INT size;
8882 if (GET_MODE_SIZE (mode).is_constant (&size))
8883 {
8884 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8885 mode);
8886 if (base_offset != 0)
8887 {
8888 base = plus_constant (Pmode, base, base_offset);
8889 base = force_operand (base, NULL_RTX);
8890 return plus_constant (Pmode, base, offset - base_offset);
8891 }
8892 }
8893 }
8894
8895 return x;
8896 }
8897
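/* Implement TARGET_SECONDARY_RELOAD. */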
8898 static reg_class_t
8899 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8900 reg_class_t rclass,
8901 machine_mode mode,
8902 secondary_reload_info *sri)
8903 {
8904 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8905 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
8906 comment at the head of aarch64-sve.md for more details about the
8907 big-endian handling. */
8908 if (BYTES_BIG_ENDIAN
8909 && reg_class_subset_p (rclass, FP_REGS)
8910 && !((REG_P (x) && HARD_REGISTER_P (x))
8911 || aarch64_simd_valid_immediate (x, NULL))
8912 && aarch64_sve_data_mode_p (mode))
8913 {
8914 sri->icode = CODE_FOR_aarch64_sve_reload_be;
8915 return NO_REGS;
8916 }
8917
8918 /* If we have to disable direct literal pool loads and stores because the
8919 function is too big, then we need a scratch register. */
8920 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8921 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8922 || targetm.vector_mode_supported_p (GET_MODE (x)))
8923 && !aarch64_pcrelative_literal_loads)
8924 {
8925 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8926 return NO_REGS;
8927 }
8928
8929 /* Without the TARGET_SIMD instructions we cannot move a Q register
8930 to a Q register directly. We need a scratch. */
8931 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8932 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8933 && reg_class_subset_p (rclass, FP_REGS))
8934 {
8935 sri->icode = code_for_aarch64_reload_mov (mode);
8936 return NO_REGS;
8937 }
8938
8939 /* A TFmode or TImode memory access should be handled via FP_REGS
8940 because AArch64 has richer addressing modes for LDR/STR instructions
8941 than LDP/STP instructions. */
8942 if (TARGET_FLOAT && rclass == GENERAL_REGS
8943 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8944 return FP_REGS;
8945
8946 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
8947 return GENERAL_REGS;
8948
8949 return NO_REGS;
8950 }
8951
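/* Implement TARGET_CAN_ELIMINATE. Return true if register FROM can be
eliminated in favor of register TO. */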
8952 static bool
8953 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8954 {
8955 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8956
8957 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8958 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
8959 if (frame_pointer_needed)
8960 return to == HARD_FRAME_POINTER_REGNUM;
8961 return true;
8962 }
8963
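/* Implement INITIAL_ELIMINATION_OFFSET. Return the offset to add when
eliminating register FROM in favor of register TO. */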
8964 poly_int64
8965 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8966 {
8967 if (to == HARD_FRAME_POINTER_REGNUM)
8968 {
8969 if (from == ARG_POINTER_REGNUM)
8970 return cfun->machine->frame.hard_fp_offset;
8971
8972 if (from == FRAME_POINTER_REGNUM)
8973 return cfun->machine->frame.hard_fp_offset
8974 - cfun->machine->frame.locals_offset;
8975 }
8976
8977 if (to == STACK_POINTER_REGNUM)
8978 {
8979 if (from == FRAME_POINTER_REGNUM)
8980 return cfun->machine->frame.frame_size
8981 - cfun->machine->frame.locals_offset;
8982 }
8983
8984 return cfun->machine->frame.frame_size;
8985 }
8986
8987 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
8988 previous frame. */
8989
8990 rtx
8991 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8992 {
8993 if (count != 0)
8994 return const0_rtx;
8995 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8996 }
8997
8998
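/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE. Output the trampoline code:
load the jump target and the static chain from the data words that follow,
then branch to the target. */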
8999 static void
9000 aarch64_asm_trampoline_template (FILE *f)
9001 {
9002 int offset1 = 16;
9003 int offset2 = 20;
9004
9005 if (aarch64_bti_enabled ())
9006 {
9007 asm_fprintf (f, "\thint\t34 // bti c\n");
9008 offset1 -= 4;
9009 offset2 -= 4;
9010 }
9011
9012 if (TARGET_ILP32)
9013 {
9014 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
9015 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
9016 offset1);
9017 }
9018 else
9019 {
9020 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
9021 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
9022 offset2);
9023 }
9024 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
9025
9026 /* The trampoline needs an extra padding instruction. If BTI is enabled,
9027 the BTI instruction emitted at the beginning takes its place, so no
9028 extra padding is needed. */
9029 if (!aarch64_bti_enabled ())
9030 assemble_aligned_integer (4, const0_rtx);
9031
9032 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9033 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
9034 }
9035
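/* Implement TARGET_TRAMPOLINE_INIT. Copy the trampoline template into
M_TRAMP and fill in the target function address and the static chain
value CHAIN_VALUE. */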
9036 static void
9037 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
9038 {
9039 rtx fnaddr, mem, a_tramp;
9040 const int tramp_code_sz = 16;
9041
9042 /* We don't need to copy the trailing D-words; we fill those in below. */
9043 emit_block_move (m_tramp, assemble_trampoline_template (),
9044 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
9045 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
9046 fnaddr = XEXP (DECL_RTL (fndecl), 0);
9047 if (GET_MODE (fnaddr) != ptr_mode)
9048 fnaddr = convert_memory_address (ptr_mode, fnaddr);
9049 emit_move_insn (mem, fnaddr);
9050
9051 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
9052 emit_move_insn (mem, chain_value);
9053
9054 /* XXX We should really define a "clear_cache" pattern and use
9055 gen_clear_cache(). */
9056 a_tramp = XEXP (m_tramp, 0);
9057 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
9058 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
9059 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
9060 ptr_mode);
9061 }
9062
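/* Implement TARGET_CLASS_MAX_NREGS. */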
9063 static unsigned char
9064 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
9065 {
9066 /* ??? Logically we should only need to provide a value when
9067 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
9068 can hold MODE, but at the moment we need to handle all modes.
9069 Just ignore any runtime parts for registers that can't store them. */
9070 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
9071 unsigned int nregs;
9072 switch (regclass)
9073 {
9074 case TAILCALL_ADDR_REGS:
9075 case POINTER_REGS:
9076 case GENERAL_REGS:
9077 case ALL_REGS:
9078 case POINTER_AND_FP_REGS:
9079 case FP_REGS:
9080 case FP_LO_REGS:
9081 case FP_LO8_REGS:
9082 if (aarch64_sve_data_mode_p (mode)
9083 && constant_multiple_p (GET_MODE_SIZE (mode),
9084 BYTES_PER_SVE_VECTOR, &nregs))
9085 return nregs;
9086 return (aarch64_vector_data_mode_p (mode)
9087 ? CEIL (lowest_size, UNITS_PER_VREG)
9088 : CEIL (lowest_size, UNITS_PER_WORD));
9089 case STACK_REG:
9090 case PR_REGS:
9091 case PR_LO_REGS:
9092 case PR_HI_REGS:
9093 return 1;
9094
9095 case NO_REGS:
9096 return 0;
9097
9098 default:
9099 break;
9100 }
9101 gcc_unreachable ();
9102 }
9103
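/* Implement TARGET_PREFERRED_RELOAD_CLASS. */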
9104 static reg_class_t
9105 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
9106 {
9107 if (regclass == POINTER_REGS)
9108 return GENERAL_REGS;
9109
9110 if (regclass == STACK_REG)
9111 {
9112 if (REG_P (x)
9113 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
9114 return regclass;
9115
9116 return NO_REGS;
9117 }
9118
9119 /* Register elimination can result in a request for
9120 SP+constant->FP_REGS. We cannot support such operations, which
9121 use SP as the source and an FP_REG as the destination, so reject
9122 them outright. */
9123 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
9124 {
9125 rtx lhs = XEXP (x, 0);
9126
9127 /* Look through a possible SUBREG introduced by ILP32. */
9128 if (GET_CODE (lhs) == SUBREG)
9129 lhs = SUBREG_REG (lhs);
9130
9131 gcc_assert (REG_P (lhs));
9132 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
9133 POINTER_REGS));
9134 return NO_REGS;
9135 }
9136
9137 return regclass;
9138 }
9139
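/* Implement ASM_OUTPUT_LABELREF. Print the assembler reference for NAME
to F; %U supplies the user label prefix. */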
9140 void
9141 aarch64_asm_output_labelref (FILE* f, const char *name)
9142 {
9143 asm_fprintf (f, "%U%s", name);
9144 }
9145
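/* Implement TARGET_ASM_CONSTRUCTOR. Emit an .init_array entry for SYMBOL,
using a priority-specific section when PRIORITY is not the default. */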
9146 static void
9147 aarch64_elf_asm_constructor (rtx symbol, int priority)
9148 {
9149 if (priority == DEFAULT_INIT_PRIORITY)
9150 default_ctor_section_asm_out_constructor (symbol, priority);
9151 else
9152 {
9153 section *s;
9154 /* Although PRIORITY is known to be in the range [0, 65535], and so
9155 18 bytes would be enough, the compiler might not know that. To avoid
9156 a -Wformat-truncation false positive, use a larger buffer. */
9157 char buf[23];
9158 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
9159 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9160 switch_to_section (s);
9161 assemble_align (POINTER_SIZE);
9162 assemble_aligned_integer (POINTER_BYTES, symbol);
9163 }
9164 }
9165
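/* Implement TARGET_ASM_DESTRUCTOR. Emit a .fini_array entry for SYMBOL,
using a priority-specific section when PRIORITY is not the default. */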
9166 static void
9167 aarch64_elf_asm_destructor (rtx symbol, int priority)
9168 {
9169 if (priority == DEFAULT_INIT_PRIORITY)
9170 default_dtor_section_asm_out_destructor (symbol, priority);
9171 else
9172 {
9173 section *s;
9174 /* Although PRIORITY is known to be in the range [0, 65535], and so
9175 18 bytes would be enough, the compiler might not know that. To avoid
9176 a -Wformat-truncation false positive, use a larger buffer. */
9177 char buf[23];
9178 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
9179 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
9180 switch_to_section (s);
9181 assemble_align (POINTER_SIZE);
9182 assemble_aligned_integer (POINTER_BYTES, symbol);
9183 }
9184 }
9185
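/* Output the assembly for a casesi dispatch sequence: load the jump-table
entry selected by the index, add it (scaled by 4) to the address of a local
anchor label emitted after the branch, and branch to the result. */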
9186 const char*
9187 aarch64_output_casesi (rtx *operands)
9188 {
9189 char buf[100];
9190 char label[100];
9191 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
9192 int index;
9193 static const char *const patterns[4][2] =
9194 {
9195 {
9196 "ldrb\t%w3, [%0,%w1,uxtw]",
9197 "add\t%3, %4, %w3, sxtb #2"
9198 },
9199 {
9200 "ldrh\t%w3, [%0,%w1,uxtw #1]",
9201 "add\t%3, %4, %w3, sxth #2"
9202 },
9203 {
9204 "ldr\t%w3, [%0,%w1,uxtw #2]",
9205 "add\t%3, %4, %w3, sxtw #2"
9206 },
9207 /* We assume that DImode is only generated when not optimizing and
9208 that we don't really need 64-bit address offsets. That would
9209 imply an object file with 8GB of code in a single function! */
9210 {
9211 "ldr\t%w3, [%0,%w1,uxtw #2]",
9212 "add\t%3, %4, %w3, sxtw #2"
9213 }
9214 };
9215
9216 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
9217
9218 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
9219 index = exact_log2 (GET_MODE_SIZE (mode));
9220
9221 gcc_assert (index >= 0 && index <= 3);
9222
9223 /* Need to implement table size reduction, by changing the code below. */
9224 output_asm_insn (patterns[index][0], operands);
9225 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
9226 snprintf (buf, sizeof (buf),
9227 "adr\t%%4, %s", targetm.strip_name_encoding (label));
9228 output_asm_insn (buf, operands);
9229 output_asm_insn (patterns[index][1], operands);
9230 output_asm_insn ("br\t%3", operands);
9231 assemble_label (asm_out_file, label);
9232 return "";
9233 }
9234
9235
9236 /* Return size in bits of an arithmetic operand which is shifted/scaled and
9237 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
9238 operator. */
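/* For example, a SHIFT of 1 with a MASK of 0x1fe describes a byte operand
shifted left by one and yields 8, matching a UXTB extend. */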
9239
9240 int
9241 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
9242 {
9243 if (shift >= 0 && shift <= 3)
9244 {
9245 int size;
9246 for (size = 8; size <= 32; size *= 2)
9247 {
9248 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
9249 if (mask == bits << shift)
9250 return size;
9251 }
9252 }
9253 return 0;
9254 }
9255
9256 /* Constant pools are per function only when PC-relative
9257 literal loads are enabled or we are using the large memory
9258 model. */
9259
9260 static inline bool
9261 aarch64_can_use_per_function_literal_pools_p (void)
9262 {
9263 return (aarch64_pcrelative_literal_loads
9264 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
9265 }
9266
9267 static bool
9268 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
9269 {
9270 /* We can't use blocks for constants when we're using a per-function
9271 constant pool. */
9272 return !aarch64_can_use_per_function_literal_pools_p ();
9273 }
9274
9275 /* Select appropriate section for constants depending
9276 on where we place literal pools. */
9277
9278 static section *
9279 aarch64_select_rtx_section (machine_mode mode,
9280 rtx x,
9281 unsigned HOST_WIDE_INT align)
9282 {
9283 if (aarch64_can_use_per_function_literal_pools_p ())
9284 return function_section (current_function_decl);
9285
9286 return default_elf_select_rtx_section (mode, x, align);
9287 }
9288
9289 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
9290 void
9291 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
9292 HOST_WIDE_INT offset)
9293 {
9294 /* When using per-function literal pools, we must ensure that any code
9295 section is aligned to the minimal instruction length, lest we get
9296 errors from the assembler re "unaligned instructions". */
9297 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
9298 ASM_OUTPUT_ALIGN (f, 2);
9299 }
9300
9301 /* Costs. */
9302
9303 /* Helper function for rtx cost calculation. Strip a shift expression
9304 from X. Returns the inner operand if successful, or the original
9305 expression on failure. */
9306 static rtx
9307 aarch64_strip_shift (rtx x)
9308 {
9309 rtx op = x;
9310
9311 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
9312 we can convert both to ROR during final output. */
9313 if ((GET_CODE (op) == ASHIFT
9314 || GET_CODE (op) == ASHIFTRT
9315 || GET_CODE (op) == LSHIFTRT
9316 || GET_CODE (op) == ROTATERT
9317 || GET_CODE (op) == ROTATE)
9318 && CONST_INT_P (XEXP (op, 1)))
9319 return XEXP (op, 0);
9320
9321 if (GET_CODE (op) == MULT
9322 && CONST_INT_P (XEXP (op, 1))
9323 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
9324 return XEXP (op, 0);
9325
9326 return x;
9327 }
9328
9329 /* Helper function for rtx cost calculation. Strip an extend
9330 expression from X. Returns the inner operand if successful, or the
9331 original expression on failure. We deal with a number of possible
9332 canonicalization variations here. If STRIP_SHIFT is true, then
9333 we can strip off a shift also. */
9334 static rtx
9335 aarch64_strip_extend (rtx x, bool strip_shift)
9336 {
9337 scalar_int_mode mode;
9338 rtx op = x;
9339
9340 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
9341 return op;
9342
9343 /* Zero and sign extraction of a widened value. */
9344 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
9345 && XEXP (op, 2) == const0_rtx
9346 && GET_CODE (XEXP (op, 0)) == MULT
9347 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
9348 XEXP (op, 1)))
9349 return XEXP (XEXP (op, 0), 0);
9350
9351 /* It can also be represented (for zero-extend) as an AND with an
9352 immediate. */
9353 if (GET_CODE (op) == AND
9354 && GET_CODE (XEXP (op, 0)) == MULT
9355 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
9356 && CONST_INT_P (XEXP (op, 1))
9357 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
9358 INTVAL (XEXP (op, 1))) != 0)
9359 return XEXP (XEXP (op, 0), 0);
9360
9361 /* Now handle extended register, as this may also have an optional
9362 left shift by 1..4. */
9363 if (strip_shift
9364 && GET_CODE (op) == ASHIFT
9365 && CONST_INT_P (XEXP (op, 1))
9366 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
9367 op = XEXP (op, 0);
9368
9369 if (GET_CODE (op) == ZERO_EXTEND
9370 || GET_CODE (op) == SIGN_EXTEND)
9371 op = XEXP (op, 0);
9372
9373 if (op != x)
9374 return op;
9375
9376 return x;
9377 }
9378
9379 /* Return true iff CODE is a shift supported in combination
9380 with arithmetic instructions. */
9381
9382 static bool
9383 aarch64_shift_p (enum rtx_code code)
9384 {
9385 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
9386 }
9387
9388
9389 /* Return true iff X is a cheap shift without a sign extend. */
9390
9391 static bool
9392 aarch64_cheap_mult_shift_p (rtx x)
9393 {
9394 rtx op0, op1;
9395
9396 op0 = XEXP (x, 0);
9397 op1 = XEXP (x, 1);
9398
9399 if (!(aarch64_tune_params.extra_tuning_flags
9400 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
9401 return false;
9402
9403 if (GET_CODE (op0) == SIGN_EXTEND)
9404 return false;
9405
9406 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
9407 && UINTVAL (op1) <= 4)
9408 return true;
9409
9410 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
9411 return false;
9412
9413 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
9414
9415 if (l2 > 0 && l2 <= 4)
9416 return true;
9417
9418 return false;
9419 }
9420
9421 /* Helper function for rtx cost calculation. Calculate the cost of
9422 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
9423 Return the calculated cost of the expression, recursing manually in to
9424 operands where needed. */
9425
9426 static int
9427 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
9428 {
9429 rtx op0, op1;
9430 const struct cpu_cost_table *extra_cost
9431 = aarch64_tune_params.insn_extra_cost;
9432 int cost = 0;
9433 bool compound_p = (outer == PLUS || outer == MINUS);
9434 machine_mode mode = GET_MODE (x);
9435
9436 gcc_checking_assert (code == MULT);
9437
9438 op0 = XEXP (x, 0);
9439 op1 = XEXP (x, 1);
9440
9441 if (VECTOR_MODE_P (mode))
9442 mode = GET_MODE_INNER (mode);
9443
9444 /* Integer multiply/fma. */
9445 if (GET_MODE_CLASS (mode) == MODE_INT)
9446 {
9447 /* The multiply will be canonicalized as a shift, cost it as such. */
9448 if (aarch64_shift_p (GET_CODE (x))
9449 || (CONST_INT_P (op1)
9450 && exact_log2 (INTVAL (op1)) > 0))
9451 {
9452 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
9453 || GET_CODE (op0) == SIGN_EXTEND;
9454 if (speed)
9455 {
9456 if (compound_p)
9457 {
9458 /* If the shift is considered cheap,
9459 then don't add any cost. */
9460 if (aarch64_cheap_mult_shift_p (x))
9461 ;
9462 else if (REG_P (op1))
9463 /* ARITH + shift-by-register. */
9464 cost += extra_cost->alu.arith_shift_reg;
9465 else if (is_extend)
9466 /* ARITH + extended register. We don't have a cost field
9467 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
9468 cost += extra_cost->alu.extend_arith;
9469 else
9470 /* ARITH + shift-by-immediate. */
9471 cost += extra_cost->alu.arith_shift;
9472 }
9473 else
9474 /* LSL (immediate). */
9475 cost += extra_cost->alu.shift;
9476
9477 }
9478 /* Strip extends as we will have costed them in the case above. */
9479 if (is_extend)
9480 op0 = aarch64_strip_extend (op0, true);
9481
9482 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
9483
9484 return cost;
9485 }
9486
9487 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
9488 compound and let the below cases handle it. After all, MNEG is a
9489 special-case alias of MSUB. */
9490 if (GET_CODE (op0) == NEG)
9491 {
9492 op0 = XEXP (op0, 0);
9493 compound_p = true;
9494 }
9495
9496 /* Integer multiplies or FMAs have zero/sign extending variants. */
9497 if ((GET_CODE (op0) == ZERO_EXTEND
9498 && GET_CODE (op1) == ZERO_EXTEND)
9499 || (GET_CODE (op0) == SIGN_EXTEND
9500 && GET_CODE (op1) == SIGN_EXTEND))
9501 {
9502 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
9503 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
9504
9505 if (speed)
9506 {
9507 if (compound_p)
9508 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
9509 cost += extra_cost->mult[0].extend_add;
9510 else
9511 /* MUL/SMULL/UMULL. */
9512 cost += extra_cost->mult[0].extend;
9513 }
9514
9515 return cost;
9516 }
9517
9518 /* This is either an integer multiply or a MADD. In both cases
9519 we want to recurse and cost the operands. */
9520 cost += rtx_cost (op0, mode, MULT, 0, speed);
9521 cost += rtx_cost (op1, mode, MULT, 1, speed);
9522
9523 if (speed)
9524 {
9525 if (compound_p)
9526 /* MADD/MSUB. */
9527 cost += extra_cost->mult[mode == DImode].add;
9528 else
9529 /* MUL. */
9530 cost += extra_cost->mult[mode == DImode].simple;
9531 }
9532
9533 return cost;
9534 }
9535 else
9536 {
9537 if (speed)
9538 {
9539 /* Floating-point FMA/FMUL can also support negations of the
9540 operands, unless the rounding mode is upward or downward in
9541 which case FNMUL is different from FMUL with operand negation. */
9542 bool neg0 = GET_CODE (op0) == NEG;
9543 bool neg1 = GET_CODE (op1) == NEG;
9544 if (compound_p || !flag_rounding_math || (neg0 && neg1))
9545 {
9546 if (neg0)
9547 op0 = XEXP (op0, 0);
9548 if (neg1)
9549 op1 = XEXP (op1, 0);
9550 }
9551
9552 if (compound_p)
9553 /* FMADD/FNMADD/FNMSUB/FMSUB. */
9554 cost += extra_cost->fp[mode == DFmode].fma;
9555 else
9556 /* FMUL/FNMUL. */
9557 cost += extra_cost->fp[mode == DFmode].mult;
9558 }
9559
9560 cost += rtx_cost (op0, mode, MULT, 0, speed);
9561 cost += rtx_cost (op1, mode, MULT, 1, speed);
9562 return cost;
9563 }
9564 }
9565
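/* Implement TARGET_ADDRESS_COST. */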
9566 static int
9567 aarch64_address_cost (rtx x,
9568 machine_mode mode,
9569 addr_space_t as ATTRIBUTE_UNUSED,
9570 bool speed)
9571 {
9572 enum rtx_code c = GET_CODE (x);
9573 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
9574 struct aarch64_address_info info;
9575 int cost = 0;
9576 info.shift = 0;
9577
9578 if (!aarch64_classify_address (&info, x, mode, false))
9579 {
9580 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
9581 {
9582 /* This is a CONST or SYMBOL ref which will be split
9583 in a different way depending on the code model in use.
9584 Cost it through the generic infrastructure. */
9585 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
9586 /* Divide through by the cost of one instruction to
9587 bring it to the same units as the address costs. */
9588 cost_symbol_ref /= COSTS_N_INSNS (1);
9589 /* The cost is then the cost of preparing the address,
9590 followed by an immediate (possibly 0) offset. */
9591 return cost_symbol_ref + addr_cost->imm_offset;
9592 }
9593 else
9594 {
9595 /* This is most likely a jump table from a case
9596 statement. */
9597 return addr_cost->register_offset;
9598 }
9599 }
9600
9601 switch (info.type)
9602 {
9603 case ADDRESS_LO_SUM:
9604 case ADDRESS_SYMBOLIC:
9605 case ADDRESS_REG_IMM:
9606 cost += addr_cost->imm_offset;
9607 break;
9608
9609 case ADDRESS_REG_WB:
9610 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9611 cost += addr_cost->pre_modify;
9612 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9613 cost += addr_cost->post_modify;
9614 else
9615 gcc_unreachable ();
9616
9617 break;
9618
9619 case ADDRESS_REG_REG:
9620 cost += addr_cost->register_offset;
9621 break;
9622
9623 case ADDRESS_REG_SXTW:
9624 cost += addr_cost->register_sextend;
9625 break;
9626
9627 case ADDRESS_REG_UXTW:
9628 cost += addr_cost->register_zextend;
9629 break;
9630
9631 default:
9632 gcc_unreachable ();
9633 }
9634
9635
9636 if (info.shift > 0)
9637 {
9638 /* For the sake of calculating the cost of the shifted register
9639 component, we can treat same sized modes in the same way. */
9640 if (known_eq (GET_MODE_BITSIZE (mode), 16))
9641 cost += addr_cost->addr_scale_costs.hi;
9642 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9643 cost += addr_cost->addr_scale_costs.si;
9644 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9645 cost += addr_cost->addr_scale_costs.di;
9646 else
9647 /* We can't tell, or this is a 128-bit vector. */
9648 cost += addr_cost->addr_scale_costs.ti;
9649 }
9650
9651 return cost;
9652 }
9653
9654 /* Return the cost of a branch. If SPEED_P is true then the compiler is
9655 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
9656 to be taken. */
9657
9658 int
9659 aarch64_branch_cost (bool speed_p, bool predictable_p)
9660 {
9661 /* When optimizing for speed, use the cost of unpredictable branches. */
9662 const struct cpu_branch_cost *branch_costs =
9663 aarch64_tune_params.branch_costs;
9664
9665 if (!speed_p || predictable_p)
9666 return branch_costs->predictable;
9667 else
9668 return branch_costs->unpredictable;
9669 }
9670
9671 /* Return true if the RTX X in mode MODE is a zero or sign extract
9672 usable in an ADD or SUB (extended register) instruction. */
9673 static bool
9674 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9675 {
9676 /* Catch add with a sign extract.
9677 This is add_<optab><mode>_multp2. */
9678 if (GET_CODE (x) == SIGN_EXTRACT
9679 || GET_CODE (x) == ZERO_EXTRACT)
9680 {
9681 rtx op0 = XEXP (x, 0);
9682 rtx op1 = XEXP (x, 1);
9683 rtx op2 = XEXP (x, 2);
9684
9685 if (GET_CODE (op0) == MULT
9686 && CONST_INT_P (op1)
9687 && op2 == const0_rtx
9688 && CONST_INT_P (XEXP (op0, 1))
9689 && aarch64_is_extend_from_extract (mode,
9690 XEXP (op0, 1),
9691 op1))
9692 {
9693 return true;
9694 }
9695 }
9696 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9697 No shift. */
9698 else if (GET_CODE (x) == SIGN_EXTEND
9699 || GET_CODE (x) == ZERO_EXTEND)
9700 return REG_P (XEXP (x, 0));
9701
9702 return false;
9703 }
9704
9705 static bool
9706 aarch64_frint_unspec_p (unsigned int u)
9707 {
9708 switch (u)
9709 {
9710 case UNSPEC_FRINTZ:
9711 case UNSPEC_FRINTP:
9712 case UNSPEC_FRINTM:
9713 case UNSPEC_FRINTA:
9714 case UNSPEC_FRINTN:
9715 case UNSPEC_FRINTX:
9716 case UNSPEC_FRINTI:
9717 return true;
9718
9719 default:
9720 return false;
9721 }
9722 }
9723
9724 /* Return true iff X is an rtx that will match an extr instruction
9725 i.e. as described in the *extr<mode>5_insn family of patterns.
9726 OP0 and OP1 will be set to the operands of the shifts involved
9727 on success and will be NULL_RTX otherwise. */
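/* For example, in DImode (ior (ashift (reg A) (const_int 10))
(lshiftrt (reg B) (const_int 54))) matches, since the two shift
amounts sum to the mode bitsize. */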
9728
9729 static bool
9730 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9731 {
9732 rtx op0, op1;
9733 scalar_int_mode mode;
9734 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9735 return false;
9736
9737 *res_op0 = NULL_RTX;
9738 *res_op1 = NULL_RTX;
9739
9740 if (GET_CODE (x) != IOR)
9741 return false;
9742
9743 op0 = XEXP (x, 0);
9744 op1 = XEXP (x, 1);
9745
9746 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9747 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9748 {
9749 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
9750 if (GET_CODE (op1) == ASHIFT)
9751 std::swap (op0, op1);
9752
9753 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9754 return false;
9755
9756 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9757 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9758
9759 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9760 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9761 {
9762 *res_op0 = XEXP (op0, 0);
9763 *res_op1 = XEXP (op1, 0);
9764 return true;
9765 }
9766 }
9767
9768 return false;
9769 }
9770
9771 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9772 storing it in *COST. Result is true if the total cost of the operation
9773 has now been calculated. */
9774 static bool
9775 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9776 {
9777 rtx inner;
9778 rtx comparator;
9779 enum rtx_code cmpcode;
9780
9781 if (COMPARISON_P (op0))
9782 {
9783 inner = XEXP (op0, 0);
9784 comparator = XEXP (op0, 1);
9785 cmpcode = GET_CODE (op0);
9786 }
9787 else
9788 {
9789 inner = op0;
9790 comparator = const0_rtx;
9791 cmpcode = NE;
9792 }
9793
9794 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9795 {
9796 /* Conditional branch. */
9797 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9798 return true;
9799 else
9800 {
9801 if (cmpcode == NE || cmpcode == EQ)
9802 {
9803 if (comparator == const0_rtx)
9804 {
9805 /* TBZ/TBNZ/CBZ/CBNZ. */
9806 if (GET_CODE (inner) == ZERO_EXTRACT)
9807 /* TBZ/TBNZ. */
9808 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9809 ZERO_EXTRACT, 0, speed);
9810 else
9811 /* CBZ/CBNZ. */
9812 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9813
9814 return true;
9815 }
9816 }
9817 else if (cmpcode == LT || cmpcode == GE)
9818 {
9819 /* TBZ/TBNZ. */
9820 if (comparator == const0_rtx)
9821 return true;
9822 }
9823 }
9824 }
9825 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9826 {
9827 /* CCMP. */
9828 if (GET_CODE (op1) == COMPARE)
9829 {
9830 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
9831 if (XEXP (op1, 1) == const0_rtx)
9832 *cost += 1;
9833 if (speed)
9834 {
9835 machine_mode mode = GET_MODE (XEXP (op1, 0));
9836 const struct cpu_cost_table *extra_cost
9837 = aarch64_tune_params.insn_extra_cost;
9838
9839 if (GET_MODE_CLASS (mode) == MODE_INT)
9840 *cost += extra_cost->alu.arith;
9841 else
9842 *cost += extra_cost->fp[mode == DFmode].compare;
9843 }
9844 return true;
9845 }
9846
9847 /* It's a conditional operation based on the status flags,
9848 so it must be some flavor of CSEL. */
9849
9850 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
9851 if (GET_CODE (op1) == NEG
9852 || GET_CODE (op1) == NOT
9853 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9854 op1 = XEXP (op1, 0);
9855 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9856 {
9857 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
9858 op1 = XEXP (op1, 0);
9859 op2 = XEXP (op2, 0);
9860 }
9861
9862 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9863 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9864 return true;
9865 }
9866
9867 /* We don't know what this is, cost all operands. */
9868 return false;
9869 }
9870
9871 /* Check whether X is a bitfield operation of the form shift + extend that
9872 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
9873 operand to which the bitfield operation is applied. Otherwise return
9874 NULL_RTX. */
9875
9876 static rtx
9877 aarch64_extend_bitfield_pattern_p (rtx x)
9878 {
9879 rtx_code outer_code = GET_CODE (x);
9880 machine_mode outer_mode = GET_MODE (x);
9881
9882 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9883 && outer_mode != SImode && outer_mode != DImode)
9884 return NULL_RTX;
9885
9886 rtx inner = XEXP (x, 0);
9887 rtx_code inner_code = GET_CODE (inner);
9888 machine_mode inner_mode = GET_MODE (inner);
9889 rtx op = NULL_RTX;
9890
9891 switch (inner_code)
9892 {
9893 case ASHIFT:
9894 if (CONST_INT_P (XEXP (inner, 1))
9895 && (inner_mode == QImode || inner_mode == HImode))
9896 op = XEXP (inner, 0);
9897 break;
9898 case LSHIFTRT:
9899 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9900 && (inner_mode == QImode || inner_mode == HImode))
9901 op = XEXP (inner, 0);
9902 break;
9903 case ASHIFTRT:
9904 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9905 && (inner_mode == QImode || inner_mode == HImode))
9906 op = XEXP (inner, 0);
9907 break;
9908 default:
9909 break;
9910 }
9911
9912 return op;
9913 }
9914
9915 /* Return true if the mask and a shift amount from an RTX of the form
9916 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9917 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
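/* For example, in DImode a MASK of 0xff0 with a SHFT_AMNT of 4 is valid
(an 8-bit UBFIZ at position 4), whereas a MASK of 0xff1 is not, because
it has a bit set below the shift amount. */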
9918
9919 bool
9920 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9921 rtx shft_amnt)
9922 {
9923 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9924 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9925 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9926 && (INTVAL (mask)
9927 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9928 }
9929
9930 /* Return true if the masks and a shift amount from an RTX of the form
9931 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9932 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
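/* For example, in DImode MASK2 == 0xff00 with SHIFT_AMNT == 8 and
MASK1 == ~0xff00 is accepted: it inserts an 8-bit field at position 8. */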
9933
9934 bool
9935 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9936 unsigned HOST_WIDE_INT mask1,
9937 unsigned HOST_WIDE_INT shft_amnt,
9938 unsigned HOST_WIDE_INT mask2)
9939 {
9940 unsigned HOST_WIDE_INT t;
9941
9942 /* Verify that there is no overlap in what bits are set in the two masks. */
9943 if (mask1 != ~mask2)
9944 return false;
9945
9946 /* Verify that mask2 is not all zeros or ones. */
9947 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9948 return false;
9949
9950 /* The shift amount should always be less than the mode size. */
9951 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9952
9953 /* Verify that the mask being shifted is contiguous and would be in the
9954 least significant bits after shifting by shft_amnt. */
9955 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9956 return (t == (t & -t));
9957 }
9958
9959 /* Calculate the cost of calculating X, storing it in *COST. Result
9960 is true if the total cost of the operation has now been calculated. */
9961 static bool
9962 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9963 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9964 {
9965 rtx op0, op1, op2;
9966 const struct cpu_cost_table *extra_cost
9967 = aarch64_tune_params.insn_extra_cost;
9968 int code = GET_CODE (x);
9969 scalar_int_mode int_mode;
9970
9971 /* By default, assume that everything has equivalent cost to the
9972 cheapest instruction. Any additional costs are applied as a delta
9973 above this default. */
9974 *cost = COSTS_N_INSNS (1);
9975
9976 switch (code)
9977 {
9978 case SET:
9979 /* The cost depends entirely on the operands to SET. */
9980 *cost = 0;
9981 op0 = SET_DEST (x);
9982 op1 = SET_SRC (x);
9983
9984 switch (GET_CODE (op0))
9985 {
9986 case MEM:
9987 if (speed)
9988 {
9989 rtx address = XEXP (op0, 0);
9990 if (VECTOR_MODE_P (mode))
9991 *cost += extra_cost->ldst.storev;
9992 else if (GET_MODE_CLASS (mode) == MODE_INT)
9993 *cost += extra_cost->ldst.store;
9994 else if (mode == SFmode)
9995 *cost += extra_cost->ldst.storef;
9996 else if (mode == DFmode)
9997 *cost += extra_cost->ldst.stored;
9998
9999 *cost +=
10000 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10001 0, speed));
10002 }
10003
10004 *cost += rtx_cost (op1, mode, SET, 1, speed);
10005 return true;
10006
10007 case SUBREG:
10008 if (! REG_P (SUBREG_REG (op0)))
10009 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
10010
10011 /* Fall through. */
10012 case REG:
10013 /* The cost is one per vector-register copied. */
10014 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
10015 {
10016 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
10017 *cost = COSTS_N_INSNS (nregs);
10018 }
10019 /* const0_rtx is in general free, but we will use an
10020 instruction to set a register to 0. */
10021 else if (REG_P (op1) || op1 == const0_rtx)
10022 {
10023 /* The cost is 1 per register copied. */
10024 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
10025 *cost = COSTS_N_INSNS (nregs);
10026 }
10027 else
10028 /* Cost is just the cost of the RHS of the set. */
10029 *cost += rtx_cost (op1, mode, SET, 1, speed);
10030 return true;
10031
10032 case ZERO_EXTRACT:
10033 case SIGN_EXTRACT:
10034 /* Bit-field insertion. Strip any redundant widening of
10035 the RHS to meet the width of the target. */
10036 if (GET_CODE (op1) == SUBREG)
10037 op1 = SUBREG_REG (op1);
10038 if ((GET_CODE (op1) == ZERO_EXTEND
10039 || GET_CODE (op1) == SIGN_EXTEND)
10040 && CONST_INT_P (XEXP (op0, 1))
10041 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
10042 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
10043 op1 = XEXP (op1, 0);
10044
10045 if (CONST_INT_P (op1))
10046 {
10047 /* MOV immediate is assumed to always be cheap. */
10048 *cost = COSTS_N_INSNS (1);
10049 }
10050 else
10051 {
10052 /* BFM. */
10053 if (speed)
10054 *cost += extra_cost->alu.bfi;
10055 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
10056 }
10057
10058 return true;
10059
10060 default:
10061 /* We can't make sense of this, assume default cost. */
10062 *cost = COSTS_N_INSNS (1);
10063 return false;
10064 }
10065 return false;
10066
10067 case CONST_INT:
10068 /* If an instruction can incorporate a constant within the
10069 instruction, the instruction's expression avoids calling
10070 rtx_cost() on the constant. If rtx_cost() is called on a
10071 constant, then it is usually because the constant must be
10072 moved into a register by one or more instructions.
10073
10074 The exception is constant 0, which can be expressed
10075 as XZR/WZR and is therefore free. The exception to this is
10076 if we have (set (reg) (const0_rtx)) in which case we must cost
10077 the move. However, we can catch that when we cost the SET, so
10078 we don't need to consider that here. */
10079 if (x == const0_rtx)
10080 *cost = 0;
10081 else
10082 {
10083 /* To an approximation, building any other constant is
10084 proportionally expensive to the number of instructions
10085 required to build that constant. This is true whether we
10086 are compiling for SPEED or otherwise. */
10087 if (!is_a <scalar_int_mode> (mode, &int_mode))
10088 int_mode = word_mode;
10089 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
10090 (NULL_RTX, x, false, int_mode));
10091 }
10092 return true;
10093
10094 case CONST_DOUBLE:
10095
10096 /* First determine number of instructions to do the move
10097 as an integer constant. */
10098 if (!aarch64_float_const_representable_p (x)
10099 && !aarch64_can_const_movi_rtx_p (x, mode)
10100 && aarch64_float_const_rtx_p (x))
10101 {
10102 unsigned HOST_WIDE_INT ival;
10103 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
10104 gcc_assert (succeed);
10105
10106 scalar_int_mode imode = (mode == HFmode
10107 ? SImode
10108 : int_mode_for_mode (mode).require ());
10109 int ncost = aarch64_internal_mov_immediate
10110 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10111 *cost += COSTS_N_INSNS (ncost);
10112 return true;
10113 }
10114
10115 if (speed)
10116 {
10117 /* mov[df,sf]_aarch64. */
10118 if (aarch64_float_const_representable_p (x))
10119 /* FMOV (scalar immediate). */
10120 *cost += extra_cost->fp[mode == DFmode].fpconst;
10121 else if (!aarch64_float_const_zero_rtx_p (x))
10122 {
10123 /* This will be a load from memory. */
10124 if (mode == DFmode)
10125 *cost += extra_cost->ldst.loadd;
10126 else
10127 *cost += extra_cost->ldst.loadf;
10128 }
10129 else
10130 /* Otherwise this is +0.0. We get this using MOVI d0, #0
10131 or MOV v0.s[0], wzr - neither of which is modeled by the
10132 cost tables. Just use the default cost. */
10133 {
10134 }
10135 }
10136
10137 return true;
10138
10139 case MEM:
10140 if (speed)
10141 {
10142 /* For loads we want the base cost of a load, plus an
10143 approximation for the additional cost of the addressing
10144 mode. */
10145 rtx address = XEXP (x, 0);
10146 if (VECTOR_MODE_P (mode))
10147 *cost += extra_cost->ldst.loadv;
10148 else if (GET_MODE_CLASS (mode) == MODE_INT)
10149 *cost += extra_cost->ldst.load;
10150 else if (mode == SFmode)
10151 *cost += extra_cost->ldst.loadf;
10152 else if (mode == DFmode)
10153 *cost += extra_cost->ldst.loadd;
10154
10155 *cost +=
10156 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10157 0, speed));
10158 }
10159
10160 return true;
10161
10162 case NEG:
10163 op0 = XEXP (x, 0);
10164
10165 if (VECTOR_MODE_P (mode))
10166 {
10167 if (speed)
10168 {
10169 /* FNEG. */
10170 *cost += extra_cost->vect.alu;
10171 }
10172 return false;
10173 }
10174
10175 if (GET_MODE_CLASS (mode) == MODE_INT)
10176 {
10177 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10178 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10179 {
10180 /* CSETM. */
10181 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
10182 return true;
10183 }
10184
10185 /* Cost this as SUB wzr, X. */
10186 op0 = CONST0_RTX (mode);
10187 op1 = XEXP (x, 0);
10188 goto cost_minus;
10189 }
10190
10191 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10192 {
10193 /* Support (neg(fma...)) as a single instruction only if
10194 sign of zeros is unimportant. This matches the decision
10195 making in aarch64.md. */
10196 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
10197 {
10198 /* FNMADD. */
10199 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10200 return true;
10201 }
10202 if (GET_CODE (op0) == MULT)
10203 {
10204 /* FNMUL. */
10205 *cost = rtx_cost (op0, mode, NEG, 0, speed);
10206 return true;
10207 }
10208 if (speed)
10209 /* FNEG. */
10210 *cost += extra_cost->fp[mode == DFmode].neg;
10211 return false;
10212 }
10213
10214 return false;
10215
10216 case CLRSB:
10217 case CLZ:
10218 if (speed)
10219 {
10220 if (VECTOR_MODE_P (mode))
10221 *cost += extra_cost->vect.alu;
10222 else
10223 *cost += extra_cost->alu.clz;
10224 }
10225
10226 return false;
10227
10228 case COMPARE:
10229 op0 = XEXP (x, 0);
10230 op1 = XEXP (x, 1);
10231
10232 if (op1 == const0_rtx
10233 && GET_CODE (op0) == AND)
10234 {
10235 x = op0;
10236 mode = GET_MODE (op0);
10237 goto cost_logic;
10238 }
10239
10240 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
10241 {
10242 /* TODO: A write to the CC flags possibly costs extra, this
10243 needs encoding in the cost tables. */
10244
10245 mode = GET_MODE (op0);
10246 /* ANDS. */
10247 if (GET_CODE (op0) == AND)
10248 {
10249 x = op0;
10250 goto cost_logic;
10251 }
10252
10253 if (GET_CODE (op0) == PLUS)
10254 {
10255 /* ADDS (and CMN alias). */
10256 x = op0;
10257 goto cost_plus;
10258 }
10259
10260 if (GET_CODE (op0) == MINUS)
10261 {
10262 /* SUBS. */
10263 x = op0;
10264 goto cost_minus;
10265 }
10266
10267 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
10268 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
10269 && CONST_INT_P (XEXP (op0, 2)))
10270 {
10271 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
10272 Handle it here directly rather than going to cost_logic
10273 since we know the immediate generated for the TST is valid
10274 so we can avoid creating an intermediate rtx for it only
10275 for costing purposes. */
10276 if (speed)
10277 *cost += extra_cost->alu.logical;
10278
10279 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
10280 ZERO_EXTRACT, 0, speed);
10281 return true;
10282 }
10283
10284 if (GET_CODE (op1) == NEG)
10285 {
10286 /* CMN. */
10287 if (speed)
10288 *cost += extra_cost->alu.arith;
10289
10290 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
10291 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
10292 return true;
10293 }
10294
10295 /* CMP.
10296
10297 Compare can freely swap the order of operands, and
10298 canonicalization puts the more complex operation first.
10299 But the integer MINUS logic expects the shift/extend
10300 operation in op1. */
10301 if (! (REG_P (op0)
10302 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
10303 {
10304 op0 = XEXP (x, 1);
10305 op1 = XEXP (x, 0);
10306 }
10307 goto cost_minus;
10308 }
10309
10310 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
10311 {
10312 /* FCMP. */
10313 if (speed)
10314 *cost += extra_cost->fp[mode == DFmode].compare;
10315
10316 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
10317 {
10318 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
10319 /* FCMP supports constant 0.0 for no extra cost. */
10320 return true;
10321 }
10322 return false;
10323 }
10324
10325 if (VECTOR_MODE_P (mode))
10326 {
10327 /* Vector compare. */
10328 if (speed)
10329 *cost += extra_cost->vect.alu;
10330
10331 if (aarch64_float_const_zero_rtx_p (op1))
10332 {
10333 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
10334 cost. */
10335 return true;
10336 }
10337 return false;
10338 }
10339 return false;
10340
10341 case MINUS:
10342 {
10343 op0 = XEXP (x, 0);
10344 op1 = XEXP (x, 1);
10345
10346 cost_minus:
10347 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
10348
10349 /* Detect valid immediates. */
10350 if ((GET_MODE_CLASS (mode) == MODE_INT
10351 || (GET_MODE_CLASS (mode) == MODE_CC
10352 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
10353 && CONST_INT_P (op1)
10354 && aarch64_uimm12_shift (INTVAL (op1)))
10355 {
10356 if (speed)
10357 /* SUB(S) (immediate). */
10358 *cost += extra_cost->alu.arith;
10359 return true;
10360 }
10361
10362 /* Look for SUB (extended register). */
10363 if (is_a <scalar_int_mode> (mode, &int_mode)
10364 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
10365 {
10366 if (speed)
10367 *cost += extra_cost->alu.extend_arith;
10368
10369 op1 = aarch64_strip_extend (op1, true);
10370 *cost += rtx_cost (op1, VOIDmode,
10371 (enum rtx_code) GET_CODE (op1), 0, speed);
10372 return true;
10373 }
10374
10375 rtx new_op1 = aarch64_strip_extend (op1, false);
10376
10377 /* Cost this as an FMA-alike operation. */
10378 if ((GET_CODE (new_op1) == MULT
10379 || aarch64_shift_p (GET_CODE (new_op1)))
10380 && code != COMPARE)
10381 {
10382 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
10383 (enum rtx_code) code,
10384 speed);
10385 return true;
10386 }
10387
10388 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
10389
10390 if (speed)
10391 {
10392 if (VECTOR_MODE_P (mode))
10393 {
10394 /* Vector SUB. */
10395 *cost += extra_cost->vect.alu;
10396 }
10397 else if (GET_MODE_CLASS (mode) == MODE_INT)
10398 {
10399 /* SUB(S). */
10400 *cost += extra_cost->alu.arith;
10401 }
10402 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10403 {
10404 /* FSUB. */
10405 *cost += extra_cost->fp[mode == DFmode].addsub;
10406 }
10407 }
10408 return true;
10409 }
10410
10411 case PLUS:
10412 {
10413 rtx new_op0;
10414
10415 op0 = XEXP (x, 0);
10416 op1 = XEXP (x, 1);
10417
10418 cost_plus:
10419 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
10420 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
10421 {
10422 /* CSINC. */
10423 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
10424 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10425 return true;
10426 }
10427
10428 if (GET_MODE_CLASS (mode) == MODE_INT
10429 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
10430 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
10431 {
10432 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
10433
10434 if (speed)
10435 /* ADD (immediate). */
10436 *cost += extra_cost->alu.arith;
10437 return true;
10438 }
10439
10440 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
10441
10442 /* Look for ADD (extended register). */
10443 if (is_a <scalar_int_mode> (mode, &int_mode)
10444 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
10445 {
10446 if (speed)
10447 *cost += extra_cost->alu.extend_arith;
10448
10449 op0 = aarch64_strip_extend (op0, true);
10450 *cost += rtx_cost (op0, VOIDmode,
10451 (enum rtx_code) GET_CODE (op0), 0, speed);
10452 return true;
10453 }
10454
10455 /* Strip any extend, leaving shifts behind, as we will
10456 cost them through mult_cost. */
10457 new_op0 = aarch64_strip_extend (op0, false);
10458
10459 if (GET_CODE (new_op0) == MULT
10460 || aarch64_shift_p (GET_CODE (new_op0)))
10461 {
10462 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
10463 speed);
10464 return true;
10465 }
10466
10467 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
10468
10469 if (speed)
10470 {
10471 if (VECTOR_MODE_P (mode))
10472 {
10473 /* Vector ADD. */
10474 *cost += extra_cost->vect.alu;
10475 }
10476 else if (GET_MODE_CLASS (mode) == MODE_INT)
10477 {
10478 /* ADD. */
10479 *cost += extra_cost->alu.arith;
10480 }
10481 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10482 {
10483 /* FADD. */
10484 *cost += extra_cost->fp[mode == DFmode].addsub;
10485 }
10486 }
10487 return true;
10488 }
10489
10490 case BSWAP:
10491 *cost = COSTS_N_INSNS (1);
10492
10493 if (speed)
10494 {
10495 if (VECTOR_MODE_P (mode))
10496 *cost += extra_cost->vect.alu;
10497 else
10498 *cost += extra_cost->alu.rev;
10499 }
10500 return false;
10501
10502 case IOR:
10503 if (aarch_rev16_p (x))
10504 {
10505 *cost = COSTS_N_INSNS (1);
10506
10507 if (speed)
10508 {
10509 if (VECTOR_MODE_P (mode))
10510 *cost += extra_cost->vect.alu;
10511 else
10512 *cost += extra_cost->alu.rev;
10513 }
10514 return true;
10515 }
10516
10517 if (aarch64_extr_rtx_p (x, &op0, &op1))
10518 {
10519 *cost += rtx_cost (op0, mode, IOR, 0, speed);
10520 *cost += rtx_cost (op1, mode, IOR, 1, speed);
10521 if (speed)
10522 *cost += extra_cost->alu.shift;
10523
10524 return true;
10525 }
10526 /* Fall through. */
10527 case XOR:
10528 case AND:
10529 cost_logic:
10530 op0 = XEXP (x, 0);
10531 op1 = XEXP (x, 1);
10532
10533 if (VECTOR_MODE_P (mode))
10534 {
10535 if (speed)
10536 *cost += extra_cost->vect.alu;
10537 return true;
10538 }
10539
10540 if (code == AND
10541 && GET_CODE (op0) == MULT
10542 && CONST_INT_P (XEXP (op0, 1))
10543 && CONST_INT_P (op1)
10544 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
10545 INTVAL (op1)) != 0)
10546 {
10547 /* This is a UBFM/SBFM. */
10548 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
10549 if (speed)
10550 *cost += extra_cost->alu.bfx;
10551 return true;
10552 }
10553
10554 if (is_int_mode (mode, &int_mode))
10555 {
10556 if (CONST_INT_P (op1))
10557 {
10558 /* We have a mask + shift version of a UBFIZ
10559 i.e. the *andim_ashift<mode>_bfiz pattern. */
10560 if (GET_CODE (op0) == ASHIFT
10561 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
10562 XEXP (op0, 1)))
10563 {
10564 *cost += rtx_cost (XEXP (op0, 0), int_mode,
10565 (enum rtx_code) code, 0, speed);
10566 if (speed)
10567 *cost += extra_cost->alu.bfx;
10568
10569 return true;
10570 }
10571 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
10572 {
10573 /* We possibly get the immediate for free, this is not
10574 modelled. */
10575 *cost += rtx_cost (op0, int_mode,
10576 (enum rtx_code) code, 0, speed);
10577 if (speed)
10578 *cost += extra_cost->alu.logical;
10579
10580 return true;
10581 }
10582 }
10583 else
10584 {
10585 rtx new_op0 = op0;
10586
10587 /* Handle ORN, EON, or BIC. */
10588 if (GET_CODE (op0) == NOT)
10589 op0 = XEXP (op0, 0);
10590
10591 new_op0 = aarch64_strip_shift (op0);
10592
10593 /* If we had a shift on op0 then this is a logical-shift-
10594 by-register/immediate operation. Otherwise, this is just
10595 a logical operation. */
10596 if (speed)
10597 {
10598 if (new_op0 != op0)
10599 {
10600 /* Shift by immediate. */
10601 if (CONST_INT_P (XEXP (op0, 1)))
10602 *cost += extra_cost->alu.log_shift;
10603 else
10604 *cost += extra_cost->alu.log_shift_reg;
10605 }
10606 else
10607 *cost += extra_cost->alu.logical;
10608 }
10609
10610 /* In both cases we want to cost both operands. */
10611 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10612 0, speed);
10613 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10614 1, speed);
10615
10616 return true;
10617 }
10618 }
10619 return false;
10620
10621 case NOT:
10622 x = XEXP (x, 0);
10623 op0 = aarch64_strip_shift (x);
10624
10625 if (VECTOR_MODE_P (mode))
10626 {
10627 /* Vector NOT. */
10628 *cost += extra_cost->vect.alu;
10629 return false;
10630 }
10631
10632 /* MVN-shifted-reg. */
10633 if (op0 != x)
10634 {
10635 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10636
10637 if (speed)
10638 *cost += extra_cost->alu.log_shift;
10639
10640 return true;
10641 }
10642 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10643 Handle the second form here taking care that 'a' in the above can
10644 be a shift. */
10645 else if (GET_CODE (op0) == XOR)
10646 {
10647 rtx newop0 = XEXP (op0, 0);
10648 rtx newop1 = XEXP (op0, 1);
10649 rtx op0_stripped = aarch64_strip_shift (newop0);
10650
10651 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10652 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10653
10654 if (speed)
10655 {
10656 if (op0_stripped != newop0)
10657 *cost += extra_cost->alu.log_shift;
10658 else
10659 *cost += extra_cost->alu.logical;
10660 }
10661
10662 return true;
10663 }
10664 /* MVN. */
10665 if (speed)
10666 *cost += extra_cost->alu.logical;
10667
10668 return false;
10669
10670 case ZERO_EXTEND:
10671
10672 op0 = XEXP (x, 0);
10673 /* If a value is written in SI mode, then zero extended to DI
10674 mode, the operation will in general be free as a write to
10675 a 'w' register implicitly zeroes the upper bits of an 'x'
10676 register. However, if this is
10677
10678 (set (reg) (zero_extend (reg)))
10679
10680 we must cost the explicit register move. */
10681 if (mode == DImode
10682 && GET_MODE (op0) == SImode
10683 && outer == SET)
10684 {
10685 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10686
10687 /* If OP_COST is non-zero, then the cost of the zero extend
10688 is effectively the cost of the inner operation. Otherwise
10689 we have a MOV instruction and we take the cost from the MOV
10690 itself. This is true independently of whether we are
10691 optimizing for space or time. */
10692 if (op_cost)
10693 *cost = op_cost;
10694
10695 return true;
10696 }
10697 else if (MEM_P (op0))
10698 {
10699 /* All loads can zero extend to any size for free. */
10700 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10701 return true;
10702 }
10703
10704 op0 = aarch64_extend_bitfield_pattern_p (x);
10705 if (op0)
10706 {
10707 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10708 if (speed)
10709 *cost += extra_cost->alu.bfx;
10710 return true;
10711 }
10712
10713 if (speed)
10714 {
10715 if (VECTOR_MODE_P (mode))
10716 {
10717 /* UMOV. */
10718 *cost += extra_cost->vect.alu;
10719 }
10720 else
10721 {
10722 /* We generate an AND instead of UXTB/UXTH. */
10723 *cost += extra_cost->alu.logical;
10724 }
10725 }
10726 return false;
10727
10728 case SIGN_EXTEND:
10729 if (MEM_P (XEXP (x, 0)))
10730 {
10731 /* LDRSH. */
10732 if (speed)
10733 {
10734 rtx address = XEXP (XEXP (x, 0), 0);
10735 *cost += extra_cost->ldst.load_sign_extend;
10736
10737 *cost +=
10738 COSTS_N_INSNS (aarch64_address_cost (address, mode,
10739 0, speed));
10740 }
10741 return true;
10742 }
10743
10744 op0 = aarch64_extend_bitfield_pattern_p (x);
10745 if (op0)
10746 {
10747 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10748 if (speed)
10749 *cost += extra_cost->alu.bfx;
10750 return true;
10751 }
10752
10753 if (speed)
10754 {
10755 if (VECTOR_MODE_P (mode))
10756 *cost += extra_cost->vect.alu;
10757 else
10758 *cost += extra_cost->alu.extend;
10759 }
10760 return false;
10761
10762 case ASHIFT:
10763 op0 = XEXP (x, 0);
10764 op1 = XEXP (x, 1);
10765
10766 if (CONST_INT_P (op1))
10767 {
10768 if (speed)
10769 {
10770 if (VECTOR_MODE_P (mode))
10771 {
10772 /* Vector shift (immediate). */
10773 *cost += extra_cost->vect.alu;
10774 }
10775 else
10776 {
10777 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
10778 aliases. */
10779 *cost += extra_cost->alu.shift;
10780 }
10781 }
10782
10783 /* We can incorporate zero/sign extend for free. */
10784 if (GET_CODE (op0) == ZERO_EXTEND
10785 || GET_CODE (op0) == SIGN_EXTEND)
10786 op0 = XEXP (op0, 0);
10787
10788 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10789 return true;
10790 }
10791 else
10792 {
10793 if (VECTOR_MODE_P (mode))
10794 {
10795 if (speed)
10796 /* Vector shift (register). */
10797 *cost += extra_cost->vect.alu;
10798 }
10799 else
10800 {
10801 if (speed)
10802 /* LSLV. */
10803 *cost += extra_cost->alu.shift_reg;
10804
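/* A sketch of why this pattern is costed like a plain shift: the
   variable shift instructions only consume the low log2 (bitsize)
   bits of the shift register, so an AND of the shift amount with
   GET_MODE_BITSIZE (mode) - 1 is redundant and adds no cost.  */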
10805 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10806 && CONST_INT_P (XEXP (op1, 1))
10807 && known_eq (INTVAL (XEXP (op1, 1)),
10808 GET_MODE_BITSIZE (mode) - 1))
10809 {
10810 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10811 /* We already demanded XEXP (op1, 0) to be REG_P, so
10812 don't recurse into it. */
10813 return true;
10814 }
10815 }
10816 return false; /* All arguments need to be in registers. */
10817 }
10818
10819 case ROTATE:
10820 case ROTATERT:
10821 case LSHIFTRT:
10822 case ASHIFTRT:
10823 op0 = XEXP (x, 0);
10824 op1 = XEXP (x, 1);
10825
10826 if (CONST_INT_P (op1))
10827 {
10828 /* ASR (immediate) and friends. */
10829 if (speed)
10830 {
10831 if (VECTOR_MODE_P (mode))
10832 *cost += extra_cost->vect.alu;
10833 else
10834 *cost += extra_cost->alu.shift;
10835 }
10836
10837 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10838 return true;
10839 }
10840 else
10841 {
10842 if (VECTOR_MODE_P (mode))
10843 {
10844 if (speed)
10845 /* Vector shift (register). */
10846 *cost += extra_cost->vect.alu;
10847 }
10848 else
10849 {
10850 if (speed)
10851 /* ASR (register) and friends. */
10852 *cost += extra_cost->alu.shift_reg;
10853
10854 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10855 && CONST_INT_P (XEXP (op1, 1))
10856 && known_eq (INTVAL (XEXP (op1, 1)),
10857 GET_MODE_BITSIZE (mode) - 1))
10858 {
10859 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10860 /* We already demanded XEXP (op1, 0) to be REG_P, so
10861 don't recurse into it. */
10862 return true;
10863 }
10864 }
10865 return false; /* All arguments need to be in registers. */
10866 }
10867
10868 case SYMBOL_REF:
10869
10870 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10871 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10872 {
10873 /* LDR. */
10874 if (speed)
10875 *cost += extra_cost->ldst.load;
10876 }
10877 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10878 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10879 {
10880 /* ADRP, followed by ADD. */
10881 *cost += COSTS_N_INSNS (1);
10882 if (speed)
10883 *cost += 2 * extra_cost->alu.arith;
10884 }
10885 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10886 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10887 {
10888 /* ADR. */
10889 if (speed)
10890 *cost += extra_cost->alu.arith;
10891 }
10892
10893 if (flag_pic)
10894 {
10895 /* One extra load instruction, after accessing the GOT. */
10896 *cost += COSTS_N_INSNS (1);
10897 if (speed)
10898 *cost += extra_cost->ldst.load;
10899 }
10900 return true;
10901
10902 case HIGH:
10903 case LO_SUM:
10904 /* ADRP/ADD (immediate). */
10905 if (speed)
10906 *cost += extra_cost->alu.arith;
10907 return true;
10908
10909 case ZERO_EXTRACT:
10910 case SIGN_EXTRACT:
10911 /* UBFX/SBFX. */
10912 if (speed)
10913 {
10914 if (VECTOR_MODE_P (mode))
10915 *cost += extra_cost->vect.alu;
10916 else
10917 *cost += extra_cost->alu.bfx;
10918 }
10919
10920 /* We can trust that the immediates used will be correct (there
10921 are no by-register forms), so we need only cost op0. */
10922 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10923 return true;
10924
10925 case MULT:
10926 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10927 /* aarch64_rtx_mult_cost always handles recursion to its
10928 operands. */
10929 return true;
10930
10931 case MOD:
10932 /* We can expand signed mod by power of 2 using a NEGS, two parallel
10933 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
10934 that of an unconditional negate. This case should only ever be reached through
10935 the set_smod_pow2_cheap check in expmed.c. */
10936 if (CONST_INT_P (XEXP (x, 1))
10937 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10938 && (mode == SImode || mode == DImode))
10939 {
10940 /* We expand to 4 instructions. Reset the baseline. */
10941 *cost = COSTS_N_INSNS (4);
10942
10943 if (speed)
10944 *cost += 2 * extra_cost->alu.logical
10945 + 2 * extra_cost->alu.arith;
10946
10947 return true;
10948 }
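/* As an illustrative sketch (assuming SImode and a modulus of 4), the
   expansion described above is roughly:

       negs  w1, w0
       and   w0, w0, 3
       and   w1, w1, 3
       csneg w0, w0, w1, mi

   i.e. four instructions, matching the COSTS_N_INSNS (4) baseline and
   the two logical plus two arithmetic operations added for speed.  */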
10949
10950 /* Fall-through. */
10951 case UMOD:
10952 if (speed)
10953 {
10954 /* Slightly prefer UMOD over SMOD. */
10955 if (VECTOR_MODE_P (mode))
10956 *cost += extra_cost->vect.alu;
10957 else if (GET_MODE_CLASS (mode) == MODE_INT)
10958 *cost += (extra_cost->mult[mode == DImode].add
10959 + extra_cost->mult[mode == DImode].idiv
10960 + (code == MOD ? 1 : 0));
10961 }
10962 return false; /* All arguments need to be in registers. */
10963
10964 case DIV:
10965 case UDIV:
10966 case SQRT:
10967 if (speed)
10968 {
10969 if (VECTOR_MODE_P (mode))
10970 *cost += extra_cost->vect.alu;
10971 else if (GET_MODE_CLASS (mode) == MODE_INT)
10972 /* There is no integer SQRT, so only DIV and UDIV can get
10973 here. */
10974 *cost += (extra_cost->mult[mode == DImode].idiv
10975 /* Slightly prefer UDIV over SDIV. */
10976 + (code == DIV ? 1 : 0));
10977 else
10978 *cost += extra_cost->fp[mode == DFmode].div;
10979 }
10980 return false; /* All arguments need to be in registers. */
10981
10982 case IF_THEN_ELSE:
10983 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10984 XEXP (x, 2), cost, speed);
10985
10986 case EQ:
10987 case NE:
10988 case GT:
10989 case GTU:
10990 case LT:
10991 case LTU:
10992 case GE:
10993 case GEU:
10994 case LE:
10995 case LEU:
10996
10997 return false; /* All arguments must be in registers. */
10998
10999 case FMA:
11000 op0 = XEXP (x, 0);
11001 op1 = XEXP (x, 1);
11002 op2 = XEXP (x, 2);
11003
11004 if (speed)
11005 {
11006 if (VECTOR_MODE_P (mode))
11007 *cost += extra_cost->vect.alu;
11008 else
11009 *cost += extra_cost->fp[mode == DFmode].fma;
11010 }
11011
11012 /* FMSUB, FNMADD, and FNMSUB are free. */
11013 if (GET_CODE (op0) == NEG)
11014 op0 = XEXP (op0, 0);
11015
11016 if (GET_CODE (op2) == NEG)
11017 op2 = XEXP (op2, 0);
11018
11019 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
11020 and the by-element operand as operand 0. */
11021 if (GET_CODE (op1) == NEG)
11022 op1 = XEXP (op1, 0);
11023
11024 /* Catch vector-by-element operations. The by-element operand can
11025 either be (vec_duplicate (vec_select (x))) or just
11026 (vec_select (x)), depending on whether we are multiplying by
11027 a vector or a scalar.
11028
11029 Canonicalization is not very good in these cases: FMA4 will put the
11030 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
11031 if (GET_CODE (op0) == VEC_DUPLICATE)
11032 op0 = XEXP (op0, 0);
11033 else if (GET_CODE (op1) == VEC_DUPLICATE)
11034 op1 = XEXP (op1, 0);
11035
11036 if (GET_CODE (op0) == VEC_SELECT)
11037 op0 = XEXP (op0, 0);
11038 else if (GET_CODE (op1) == VEC_SELECT)
11039 op1 = XEXP (op1, 0);
11040
11041 /* If the remaining parameters are not registers,
11042 get the cost to put them into registers. */
11043 *cost += rtx_cost (op0, mode, FMA, 0, speed);
11044 *cost += rtx_cost (op1, mode, FMA, 1, speed);
11045 *cost += rtx_cost (op2, mode, FMA, 2, speed);
11046 return true;
11047
11048 case FLOAT:
11049 case UNSIGNED_FLOAT:
11050 if (speed)
11051 *cost += extra_cost->fp[mode == DFmode].fromint;
11052 return false;
11053
11054 case FLOAT_EXTEND:
11055 if (speed)
11056 {
11057 if (VECTOR_MODE_P (mode))
11058 {
11059 /* Vector widening conversion. */
11060 *cost += extra_cost->vect.alu;
11061 }
11062 else
11063 *cost += extra_cost->fp[mode == DFmode].widen;
11064 }
11065 return false;
11066
11067 case FLOAT_TRUNCATE:
11068 if (speed)
11069 {
11070 if (VECTOR_MODE_P (mode))
11071 {
11072 /* Vector conversion. */
11073 *cost += extra_cost->vect.alu;
11074 }
11075 else
11076 *cost += extra_cost->fp[mode == DFmode].narrow;
11077 }
11078 return false;
11079
11080 case FIX:
11081 case UNSIGNED_FIX:
11082 x = XEXP (x, 0);
11083 /* Strip the rounding part. They will all be implemented
11084 by the fcvt* family of instructions anyway. */
11085 if (GET_CODE (x) == UNSPEC)
11086 {
11087 unsigned int uns_code = XINT (x, 1);
11088
11089 if (uns_code == UNSPEC_FRINTA
11090 || uns_code == UNSPEC_FRINTM
11091 || uns_code == UNSPEC_FRINTN
11092 || uns_code == UNSPEC_FRINTP
11093 || uns_code == UNSPEC_FRINTZ)
11094 x = XVECEXP (x, 0, 0);
11095 }
11096
11097 if (speed)
11098 {
11099 if (VECTOR_MODE_P (mode))
11100 *cost += extra_cost->vect.alu;
11101 else
11102 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
11103 }
11104
11105 /* We can combine fmul by a power of 2 followed by a fcvt into a single
11106 fixed-point fcvt. */
11107 if (GET_CODE (x) == MULT
11108 && ((VECTOR_MODE_P (mode)
11109 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
11110 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
11111 {
11112 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
11113 0, speed);
11114 return true;
11115 }
11116
11117 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
11118 return true;
11119
11120 case ABS:
11121 if (VECTOR_MODE_P (mode))
11122 {
11123 /* ABS (vector). */
11124 if (speed)
11125 *cost += extra_cost->vect.alu;
11126 }
11127 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11128 {
11129 op0 = XEXP (x, 0);
11130
11131 /* FABD, which is analogous to FADD. */
11132 if (GET_CODE (op0) == MINUS)
11133 {
11134 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
11135 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
11136 if (speed)
11137 *cost += extra_cost->fp[mode == DFmode].addsub;
11138
11139 return true;
11140 }
11141 /* Simple FABS is analogous to FNEG. */
11142 if (speed)
11143 *cost += extra_cost->fp[mode == DFmode].neg;
11144 }
11145 else
11146 {
11147 /* Integer ABS will either be split to
11148 two arithmetic instructions, or will be an ABS
11149 (scalar), which we don't model. */
11150 *cost = COSTS_N_INSNS (2);
11151 if (speed)
11152 *cost += 2 * extra_cost->alu.arith;
11153 }
11154 return false;
11155
11156 case SMAX:
11157 case SMIN:
11158 if (speed)
11159 {
11160 if (VECTOR_MODE_P (mode))
11161 *cost += extra_cost->vect.alu;
11162 else
11163 {
11164 /* FMAXNM/FMINNM/FMAX/FMIN.
11165 TODO: This may not be accurate for all implementations, but
11166 we do not model this in the cost tables. */
11167 *cost += extra_cost->fp[mode == DFmode].addsub;
11168 }
11169 }
11170 return false;
11171
11172 case UNSPEC:
11173 /* The floating point round to integer frint* instructions. */
11174 if (aarch64_frint_unspec_p (XINT (x, 1)))
11175 {
11176 if (speed)
11177 *cost += extra_cost->fp[mode == DFmode].roundint;
11178
11179 return false;
11180 }
11181
11182 if (XINT (x, 1) == UNSPEC_RBIT)
11183 {
11184 if (speed)
11185 *cost += extra_cost->alu.rev;
11186
11187 return false;
11188 }
11189 break;
11190
11191 case TRUNCATE:
11192
11193 /* Decompose <su>muldi3_highpart. */
11194 if (/* (truncate:DI */
11195 mode == DImode
11196 /* (lshiftrt:TI */
11197 && GET_MODE (XEXP (x, 0)) == TImode
11198 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
11199 /* (mult:TI */
11200 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
11201 /* (ANY_EXTEND:TI (reg:DI))
11202 (ANY_EXTEND:TI (reg:DI))) */
11203 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
11204 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
11205 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
11206 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
11207 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
11208 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
11209 /* (const_int 64) */
11210 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
11211 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
11212 {
11213 /* UMULH/SMULH. */
11214 if (speed)
11215 *cost += extra_cost->mult[mode == DImode].extend;
11216 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
11217 mode, MULT, 0, speed);
11218 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
11219 mode, MULT, 1, speed);
11220 return true;
11221 }
11222
11223 /* Fall through. */
11224 default:
11225 break;
11226 }
11227
11228 if (dump_file
11229 && flag_aarch64_verbose_cost)
11230 fprintf (dump_file,
11231 "\nFailed to cost RTX. Assuming default cost.\n");
11232
11233 return true;
11234 }
11235
11236 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
11237 calculated for X. This cost is stored in *COST. Returns true
11238 if the total cost of X was calculated. */
11239 static bool
11240 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
11241 int param, int *cost, bool speed)
11242 {
11243 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
11244
11245 if (dump_file
11246 && flag_aarch64_verbose_cost)
11247 {
11248 print_rtl_single (dump_file, x);
11249 fprintf (dump_file, "\n%s cost: %d (%s)\n",
11250 speed ? "Hot" : "Cold",
11251 *cost, result ? "final" : "partial");
11252 }
11253
11254 return result;
11255 }
11256
11257 static int
11258 aarch64_register_move_cost (machine_mode mode,
11259 reg_class_t from_i, reg_class_t to_i)
11260 {
11261 enum reg_class from = (enum reg_class) from_i;
11262 enum reg_class to = (enum reg_class) to_i;
11263 const struct cpu_regmove_cost *regmove_cost
11264 = aarch64_tune_params.regmove_cost;
11265
11266 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
11267 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
11268 to = GENERAL_REGS;
11269
11270 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
11271 from = GENERAL_REGS;
11272
11273 /* Moving between GPR and stack cost is the same as GP2GP. */
11274 if ((from == GENERAL_REGS && to == STACK_REG)
11275 || (to == GENERAL_REGS && from == STACK_REG))
11276 return regmove_cost->GP2GP;
11277
11278 /* To/From the stack register, we move via the gprs. */
11279 if (to == STACK_REG || from == STACK_REG)
11280 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
11281 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
11282
11283 if (known_eq (GET_MODE_SIZE (mode), 16))
11284 {
11285 /* 128-bit operations on general registers require 2 instructions. */
11286 if (from == GENERAL_REGS && to == GENERAL_REGS)
11287 return regmove_cost->GP2GP * 2;
11288 else if (from == GENERAL_REGS)
11289 return regmove_cost->GP2FP * 2;
11290 else if (to == GENERAL_REGS)
11291 return regmove_cost->FP2GP * 2;
11292
11293 /* When AdvSIMD instructions are disabled it is not possible to move
11294 a 128-bit value directly between Q registers. This is handled in
11295 secondary reload. A general register is used as a scratch to move
11296 the upper DI value and the lower DI value is moved directly,
11297 hence the cost is the sum of three moves. */
11298 if (! TARGET_SIMD)
11299 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
11300
11301 return regmove_cost->FP2FP;
11302 }
11303
11304 if (from == GENERAL_REGS && to == GENERAL_REGS)
11305 return regmove_cost->GP2GP;
11306 else if (from == GENERAL_REGS)
11307 return regmove_cost->GP2FP;
11308 else if (to == GENERAL_REGS)
11309 return regmove_cost->FP2GP;
11310
11311 return regmove_cost->FP2FP;
11312 }
11313
11314 static int
11315 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
11316 reg_class_t rclass ATTRIBUTE_UNUSED,
11317 bool in ATTRIBUTE_UNUSED)
11318 {
11319 return aarch64_tune_params.memmov_cost;
11320 }
11321
11322 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
11323 to optimize 1.0/sqrt. */
11324
11325 static bool
11326 use_rsqrt_p (machine_mode mode)
11327 {
11328 return (!flag_trapping_math
11329 && flag_unsafe_math_optimizations
11330 && ((aarch64_tune_params.approx_modes->recip_sqrt
11331 & AARCH64_APPROX_MODE (mode))
11332 || flag_mrecip_low_precision_sqrt));
11333 }
11334
11335 /* Function to decide when to use the approximate reciprocal square root
11336 builtin. */
11337
11338 static tree
11339 aarch64_builtin_reciprocal (tree fndecl)
11340 {
11341 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
11342
11343 if (!use_rsqrt_p (mode))
11344 return NULL_TREE;
11345 return aarch64_builtin_rsqrt (DECL_MD_FUNCTION_CODE (fndecl));
11346 }
11347
11348 /* Emit instruction sequence to compute either the approximate square root
11349 or its approximate reciprocal, depending on the flag RECP, and return
11350 whether the sequence was emitted or not. */
11351
11352 bool
11353 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
11354 {
11355 machine_mode mode = GET_MODE (dst);
11356
11357 if (GET_MODE_INNER (mode) == HFmode)
11358 {
11359 gcc_assert (!recp);
11360 return false;
11361 }
11362
11363 if (!recp)
11364 {
11365 if (!(flag_mlow_precision_sqrt
11366 || (aarch64_tune_params.approx_modes->sqrt
11367 & AARCH64_APPROX_MODE (mode))))
11368 return false;
11369
11370 if (flag_finite_math_only
11371 || flag_trapping_math
11372 || !flag_unsafe_math_optimizations
11373 || optimize_function_for_size_p (cfun))
11374 return false;
11375 }
11376 else
11377 /* Caller assumes we cannot fail. */
11378 gcc_assert (use_rsqrt_p (mode));
11379
11380 machine_mode mmsk = mode_for_int_vector (mode).require ();
11381 rtx xmsk = gen_reg_rtx (mmsk);
11382 if (!recp)
11383 /* When calculating the approximate square root, compare the
11384 argument with 0.0 and create a mask. */
11385 emit_insn (gen_rtx_SET (xmsk,
11386 gen_rtx_NEG (mmsk,
11387 gen_rtx_EQ (mmsk, src,
11388 CONST0_RTX (mode)))));
11389
11390 /* Estimate the approximate reciprocal square root. */
11391 rtx xdst = gen_reg_rtx (mode);
11392 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
11393
11394 /* Iterate over the series twice for SF and thrice for DF. */
11395 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
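/* A rough sketch of why so few steps suffice: the FRSQRTE estimate is
   accurate to roughly 8 bits, and each Newton-Raphson step below,
   x' = x * (3 - d * x * x) / 2 (FRSQRTS supplies the (3 - a * b) / 2
   part), approximately doubles the number of correct bits, so two
   steps cover SFmode and three cover DFmode.  */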
11396
11397 /* Optionally iterate over the series once less for faster performance
11398 while sacrificing some accuracy. */
11399 if ((recp && flag_mrecip_low_precision_sqrt)
11400 || (!recp && flag_mlow_precision_sqrt))
11401 iterations--;
11402
11403 /* Iterate over the series to calculate the approximate reciprocal square
11404 root. */
11405 rtx x1 = gen_reg_rtx (mode);
11406 while (iterations--)
11407 {
11408 rtx x2 = gen_reg_rtx (mode);
11409 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
11410
11411 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
11412
11413 if (iterations > 0)
11414 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
11415 }
11416
11417 if (!recp)
11418 {
11419 /* Qualify the approximate reciprocal square root when the argument is
11420 0.0 by squashing the intermediary result to 0.0. */
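/* (The reason this matters: the reciprocal square root estimate of zero
   is infinite, and multiplying an infinity by a zero SRC below would
   give a NaN instead of the expected 0.0, so those lanes are zeroed via
   the mask first.)  */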
11421 rtx xtmp = gen_reg_rtx (mmsk);
11422 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
11423 gen_rtx_SUBREG (mmsk, xdst, 0)));
11424 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
11425
11426 /* Calculate the approximate square root. */
11427 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
11428 }
11429
11430 /* Finalize the approximation. */
11431 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
11432
11433 return true;
11434 }
11435
11436 /* Emit the instruction sequence to compute the approximation for the division
11437 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
11438
11439 bool
11440 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
11441 {
11442 machine_mode mode = GET_MODE (quo);
11443
11444 if (GET_MODE_INNER (mode) == HFmode)
11445 return false;
11446
11447 bool use_approx_division_p = (flag_mlow_precision_div
11448 || (aarch64_tune_params.approx_modes->division
11449 & AARCH64_APPROX_MODE (mode)));
11450
11451 if (!flag_finite_math_only
11452 || flag_trapping_math
11453 || !flag_unsafe_math_optimizations
11454 || optimize_function_for_size_p (cfun)
11455 || !use_approx_division_p)
11456 return false;
11457
11458 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
11459 return false;
11460
11461 /* Estimate the approximate reciprocal. */
11462 rtx xrcp = gen_reg_rtx (mode);
11463 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
11464
11465 /* Iterate over the series twice for SF and thrice for DF. */
11466 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
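/* As a rough sketch, the FRECPE estimate of 1/DEN is accurate to about
   8 bits, and each step below, x' = x * (2 - d * x) (FRECPS supplies
   the 2 - a * b part), approximately doubles the precision, hence two
   steps for SFmode and three for DFmode.  */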
11467
11468 /* Optionally iterate over the series once less for faster performance,
11469 while sacrificing some accuracy. */
11470 if (flag_mlow_precision_div)
11471 iterations--;
11472
11473 /* Iterate over the series to calculate the approximate reciprocal. */
11474 rtx xtmp = gen_reg_rtx (mode);
11475 while (iterations--)
11476 {
11477 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
11478
11479 if (iterations > 0)
11480 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
11481 }
11482
11483 if (num != CONST1_RTX (mode))
11484 {
11485 /* As the approximate reciprocal of DEN is already calculated, only
11486 calculate the approximate division when NUM is not 1.0. */
11487 rtx xnum = force_reg (mode, num);
11488 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
11489 }
11490
11491 /* Finalize the approximation. */
11492 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
11493 return true;
11494 }
11495
11496 /* Return the number of instructions that can be issued per cycle. */
11497 static int
11498 aarch64_sched_issue_rate (void)
11499 {
11500 return aarch64_tune_params.issue_rate;
11501 }
11502
11503 static int
11504 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
11505 {
11506 int issue_rate = aarch64_sched_issue_rate ();
11507
11508 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
11509 }
11510
11511
11512 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
11513 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
11514 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
11515
11516 static int
11517 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
11518 int ready_index)
11519 {
11520 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
11521 }
11522
11523
11524 /* Vectorizer cost model target hooks. */
11525
11526 /* Implement targetm.vectorize.builtin_vectorization_cost. */
11527 static int
11528 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
11529 tree vectype,
11530 int misalign ATTRIBUTE_UNUSED)
11531 {
11532 unsigned elements;
11533 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
11534 bool fp = false;
11535
11536 if (vectype != NULL)
11537 fp = FLOAT_TYPE_P (vectype);
11538
11539 switch (type_of_cost)
11540 {
11541 case scalar_stmt:
11542 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
11543
11544 case scalar_load:
11545 return costs->scalar_load_cost;
11546
11547 case scalar_store:
11548 return costs->scalar_store_cost;
11549
11550 case vector_stmt:
11551 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11552
11553 case vector_load:
11554 return costs->vec_align_load_cost;
11555
11556 case vector_store:
11557 return costs->vec_store_cost;
11558
11559 case vec_to_scalar:
11560 return costs->vec_to_scalar_cost;
11561
11562 case scalar_to_vec:
11563 return costs->scalar_to_vec_cost;
11564
11565 case unaligned_load:
11566 case vector_gather_load:
11567 return costs->vec_unalign_load_cost;
11568
11569 case unaligned_store:
11570 case vector_scatter_store:
11571 return costs->vec_unalign_store_cost;
11572
11573 case cond_branch_taken:
11574 return costs->cond_taken_branch_cost;
11575
11576 case cond_branch_not_taken:
11577 return costs->cond_not_taken_branch_cost;
11578
11579 case vec_perm:
11580 return costs->vec_permute_cost;
11581
11582 case vec_promote_demote:
11583 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
11584
11585 case vec_construct:
11586 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
11587 return elements / 2 + 1;
11588
11589 default:
11590 gcc_unreachable ();
11591 }
11592 }
11593
11594 /* Implement targetm.vectorize.add_stmt_cost. */
11595 static unsigned
11596 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11597 struct _stmt_vec_info *stmt_info, int misalign,
11598 enum vect_cost_model_location where)
11599 {
11600 unsigned *cost = (unsigned *) data;
11601 unsigned retval = 0;
11602
11603 if (flag_vect_cost_model)
11604 {
11605 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11606 int stmt_cost =
11607 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11608
11609 /* Statements in an inner loop relative to the loop being
11610 vectorized are weighted more heavily. The value here is
11611 arbitrary and could potentially be improved with analysis. */
11612 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11613 count *= 50; /* FIXME */
11614
11615 retval = (unsigned) (count * stmt_cost);
11616 cost[where] += retval;
11617 }
11618
11619 return retval;
11620 }
11621
11622 static void initialize_aarch64_code_model (struct gcc_options *);
11623
11624 /* Parse the TO_PARSE string and put the architecture struct that it
11625 selects into RES and the architectural features into ISA_FLAGS.
11626 Return an aarch64_parse_opt_result describing the parse result.
11627 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11628 When the TO_PARSE string contains an invalid extension,
11629 a copy of the string is created and stored to INVALID_EXTENSION. */
11630
11631 static enum aarch64_parse_opt_result
11632 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11633 uint64_t *isa_flags, std::string *invalid_extension)
11634 {
11635 const char *ext;
11636 const struct processor *arch;
11637 size_t len;
11638
11639 ext = strchr (to_parse, '+');
11640
11641 if (ext != NULL)
11642 len = ext - to_parse;
11643 else
11644 len = strlen (to_parse);
11645
11646 if (len == 0)
11647 return AARCH64_PARSE_MISSING_ARG;
11648
11649
11650 /* Loop through the list of supported ARCHes to find a match. */
11651 for (arch = all_architectures; arch->name != NULL; arch++)
11652 {
11653 if (strlen (arch->name) == len
11654 && strncmp (arch->name, to_parse, len) == 0)
11655 {
11656 uint64_t isa_temp = arch->flags;
11657
11658 if (ext != NULL)
11659 {
11660 /* TO_PARSE string contains at least one extension. */
11661 enum aarch64_parse_opt_result ext_res
11662 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11663
11664 if (ext_res != AARCH64_PARSE_OK)
11665 return ext_res;
11666 }
11667 /* Extension parsing was successful. Confirm the result
11668 arch and ISA flags. */
11669 *res = arch;
11670 *isa_flags = isa_temp;
11671 return AARCH64_PARSE_OK;
11672 }
11673 }
11674
11675 /* ARCH name not found in list. */
11676 return AARCH64_PARSE_INVALID_ARG;
11677 }
11678
11679 /* Parse the TO_PARSE string and put the result tuning in RES and the
11680 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
11681 describing the parse result. If there is an error parsing, RES and
11682 ISA_FLAGS are left unchanged.
11683 When the TO_PARSE string contains an invalid extension,
11684 a copy of the string is created and stored to INVALID_EXTENSION. */
11685
11686 static enum aarch64_parse_opt_result
11687 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11688 uint64_t *isa_flags, std::string *invalid_extension)
11689 {
11690 const char *ext;
11691 const struct processor *cpu;
11692 size_t len;
11693
11694 ext = strchr (to_parse, '+');
11695
11696 if (ext != NULL)
11697 len = ext - to_parse;
11698 else
11699 len = strlen (to_parse);
11700
11701 if (len == 0)
11702 return AARCH64_PARSE_MISSING_ARG;
11703
11704
11705 /* Loop through the list of supported CPUs to find a match. */
11706 for (cpu = all_cores; cpu->name != NULL; cpu++)
11707 {
11708 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11709 {
11710 uint64_t isa_temp = cpu->flags;
11711
11712
11713 if (ext != NULL)
11714 {
11715 /* TO_PARSE string contains at least one extension. */
11716 enum aarch64_parse_opt_result ext_res
11717 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11718
11719 if (ext_res != AARCH64_PARSE_OK)
11720 return ext_res;
11721 }
11722 /* Extension parsing was successful. Confirm the result
11723 cpu and ISA flags. */
11724 *res = cpu;
11725 *isa_flags = isa_temp;
11726 return AARCH64_PARSE_OK;
11727 }
11728 }
11729
11730 /* CPU name not found in list. */
11731 return AARCH64_PARSE_INVALID_ARG;
11732 }
11733
11734 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11735 Return an aarch64_parse_opt_result describing the parse result.
11736 If the parsing fails the RES does not change. */
11737
11738 static enum aarch64_parse_opt_result
11739 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11740 {
11741 const struct processor *cpu;
11742
11743 /* Loop through the list of supported CPUs to find a match. */
11744 for (cpu = all_cores; cpu->name != NULL; cpu++)
11745 {
11746 if (strcmp (cpu->name, to_parse) == 0)
11747 {
11748 *res = cpu;
11749 return AARCH64_PARSE_OK;
11750 }
11751 }
11752
11753 /* CPU name not found in list. */
11754 return AARCH64_PARSE_INVALID_ARG;
11755 }
11756
11757 /* Parse TOKEN, which has length LENGTH, to see if it is an option
11758 described in FLAG. If it is, return the index bit for that fusion type.
11759 If not, error (printing OPTION_NAME) and return zero. */
11760
11761 static unsigned int
11762 aarch64_parse_one_option_token (const char *token,
11763 size_t length,
11764 const struct aarch64_flag_desc *flag,
11765 const char *option_name)
11766 {
11767 for (; flag->name != NULL; flag++)
11768 {
11769 if (length == strlen (flag->name)
11770 && !strncmp (flag->name, token, length))
11771 return flag->flag;
11772 }
11773
11774 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11775 return 0;
11776 }
11777
11778 /* Parse OPTION which is a dot-separated list of flags to enable.
11779 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11780 default state we inherit from the CPU tuning structures. OPTION_NAME
11781 gives the top-level option we are parsing in the -moverride string,
11782 for use in error messages. */
11783
11784 static unsigned int
11785 aarch64_parse_boolean_options (const char *option,
11786 const struct aarch64_flag_desc *flags,
11787 unsigned int initial_state,
11788 const char *option_name)
11789 {
11790 const char separator = '.';
11791 const char* specs = option;
11792 const char* ntoken = option;
11793 unsigned int found_flags = initial_state;
11794
11795 while ((ntoken = strchr (specs, separator)))
11796 {
11797 size_t token_length = ntoken - specs;
11798 unsigned token_ops = aarch64_parse_one_option_token (specs,
11799 token_length,
11800 flags,
11801 option_name);
11802 /* If we find "none" (or, for simplicity's sake, an error) anywhere
11803 in the token stream, reset the supported operations. So:
11804
11805 adrp+add.cmp+branch.none.adrp+add
11806
11807 would have the result of turning on only adrp+add fusion. */
11808 if (!token_ops)
11809 found_flags = 0;
11810
11811 found_flags |= token_ops;
11812 specs = ++ntoken;
11813 }
11814
11815 /* We ended with a trailing separator; report the ill-formed string. */
11816 if (!(*specs))
11817 {
11818 error ("%s string ill-formed\n", option_name);
11819 return 0;
11820 }
11821
11822 /* We still have one more token to parse. */
11823 size_t token_length = strlen (specs);
11824 unsigned token_ops = aarch64_parse_one_option_token (specs,
11825 token_length,
11826 flags,
11827 option_name);
11828 if (!token_ops)
11829 found_flags = 0;
11830
11831 found_flags |= token_ops;
11832 return found_flags;
11833 }
11834
11835 /* Support for overriding instruction fusion. */
11836
11837 static void
11838 aarch64_parse_fuse_string (const char *fuse_string,
11839 struct tune_params *tune)
11840 {
11841 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11842 aarch64_fusible_pairs,
11843 tune->fusible_ops,
11844 "fuse=");
11845 }
11846
11847 /* Support for overriding other tuning flags. */
11848
11849 static void
11850 aarch64_parse_tune_string (const char *tune_string,
11851 struct tune_params *tune)
11852 {
11853 tune->extra_tuning_flags
11854 = aarch64_parse_boolean_options (tune_string,
11855 aarch64_tuning_flags,
11856 tune->extra_tuning_flags,
11857 "tune=");
11858 }
11859
11860 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11861 Accept the valid SVE vector widths allowed by
11862 aarch64_sve_vector_bits_enum and use it to override sve_width
11863 in TUNE. */
11864
11865 static void
11866 aarch64_parse_sve_width_string (const char *tune_string,
11867 struct tune_params *tune)
11868 {
11869 int width = -1;
11870
11871 int n = sscanf (tune_string, "%d", &width);
11872 if (n == EOF)
11873 {
11874 error ("invalid format for sve_width");
11875 return;
11876 }
11877 switch (width)
11878 {
11879 case SVE_128:
11880 case SVE_256:
11881 case SVE_512:
11882 case SVE_1024:
11883 case SVE_2048:
11884 break;
11885 default:
11886 error ("invalid sve_width value: %d", width);
11887 }
11888 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11889 }
11890
11891 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
11892 we understand. If it is, extract the option string and hand it off to
11893 the appropriate function. */
11894
11895 void
11896 aarch64_parse_one_override_token (const char* token,
11897 size_t length,
11898 struct tune_params *tune)
11899 {
11900 const struct aarch64_tuning_override_function *fn
11901 = aarch64_tuning_override_functions;
11902
11903 const char *option_part = strchr (token, '=');
11904 if (!option_part)
11905 {
11906 error ("tuning string missing in option (%s)", token);
11907 return;
11908 }
11909
11910 /* Get the length of the option name. */
11911 length = option_part - token;
11912 /* Skip the '=' to get to the option string. */
11913 option_part++;
11914
11915 for (; fn->name != NULL; fn++)
11916 {
11917 if (!strncmp (fn->name, token, length))
11918 {
11919 fn->parse_override (option_part, tune);
11920 return;
11921 }
11922 }
11923
11924 error ("unknown tuning option (%s)",token);
11925 return;
11926 }
11927
11928 /* Check the TLS size and clamp it to what the code model in OPTS allows. */
11929
11930 static void
11931 initialize_aarch64_tls_size (struct gcc_options *opts)
11932 {
11933 if (aarch64_tls_size == 0)
11934 aarch64_tls_size = 24;
11935
11936 switch (opts->x_aarch64_cmodel_var)
11937 {
11938 case AARCH64_CMODEL_TINY:
11939 /* Both the default and maximum TLS size allowed under tiny are 1M, which
11940 needs two instructions to address, so we clamp the size to 24. */
11941 if (aarch64_tls_size > 24)
11942 aarch64_tls_size = 24;
11943 break;
11944 case AARCH64_CMODEL_SMALL:
11945 /* The maximum TLS size allowed under small is 4G. */
11946 if (aarch64_tls_size > 32)
11947 aarch64_tls_size = 32;
11948 break;
11949 case AARCH64_CMODEL_LARGE:
11950 /* The maximum TLS size allowed under large is 16E.
11951 FIXME: 16E needs a 64-bit offset, but we only support 48-bit offsets now. */
11952 if (aarch64_tls_size > 48)
11953 aarch64_tls_size = 48;
11954 break;
11955 default:
11956 gcc_unreachable ();
11957 }
11958
11959 return;
11960 }
11961
11962 /* Parse STRING looking for options in the format:
11963 string :: option:string
11964 option :: name=substring
11965 name :: {a-z}
11966 substring :: defined by option. */
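/* For instance (an illustrative value, not a recommendation), the string
   "fuse=adrp+add.cmp+branch:sve_width=256" is split on ':' into two
   options; "adrp+add.cmp+branch" is then handed to the fuse= handler,
   which splits the boolean flags on '.', and "256" goes to the
   sve_width= handler.  */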
11967
11968 static void
11969 aarch64_parse_override_string (const char* input_string,
11970 struct tune_params* tune)
11971 {
11972 const char separator = ':';
11973 size_t string_length = strlen (input_string) + 1;
11974 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11975 char *string = string_root;
11976 strncpy (string, input_string, string_length);
11977 string[string_length - 1] = '\0';
11978
11979 char* ntoken = string;
11980
11981 while ((ntoken = strchr (string, separator)))
11982 {
11983 size_t token_length = ntoken - string;
11984 /* Make this substring look like a string. */
11985 *ntoken = '\0';
11986 aarch64_parse_one_override_token (string, token_length, tune);
11987 string = ++ntoken;
11988 }
11989
11990 /* One last option to parse. */
11991 aarch64_parse_one_override_token (string, strlen (string), tune);
11992 free (string_root);
11993 }
11994
11995
11996 static void
11997 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11998 {
11999 if (accepted_branch_protection_string)
12000 {
12001 opts->x_aarch64_branch_protection_string
12002 = xstrdup (accepted_branch_protection_string);
12003 }
12004
12005 /* PR 70044: We have to be careful about being called multiple times for the
12006 same function. This means all changes should be repeatable. */
12007
12008 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
12009 Disable the frame pointer flag so the mid-end will not use a frame
12010 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
12011 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
12012 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
12013 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
12014 if (opts->x_flag_omit_frame_pointer == 0)
12015 opts->x_flag_omit_frame_pointer = 2;
12016
12017 /* If not optimizing for size, set the default
12018 alignment to what the target wants. */
12019 if (!opts->x_optimize_size)
12020 {
12021 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
12022 opts->x_str_align_loops = aarch64_tune_params.loop_align;
12023 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
12024 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
12025 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
12026 opts->x_str_align_functions = aarch64_tune_params.function_align;
12027 }
12028
12029 /* We default to no pc-relative literal loads. */
12030
12031 aarch64_pcrelative_literal_loads = false;
12032
12033 /* If -mpc-relative-literal-loads is set on the command line, this
12034 implies that the user asked for PC relative literal loads. */
12035 if (opts->x_pcrelative_literal_loads == 1)
12036 aarch64_pcrelative_literal_loads = true;
12037
12038 /* In the tiny memory model it makes no sense to disallow PC relative
12039 literal pool loads. */
12040 if (aarch64_cmodel == AARCH64_CMODEL_TINY
12041 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12042 aarch64_pcrelative_literal_loads = true;
12043
12044 /* When enabling the lower precision Newton series for the square root, also
12045 enable it for the reciprocal square root, since the latter is an
12046 intermediary step for the former. */
12047 if (flag_mlow_precision_sqrt)
12048 flag_mrecip_low_precision_sqrt = true;
12049 }
12050
12051 /* 'Unpack' the internal tuning structs and update the options
12052 in OPTS. The caller must have set up selected_tune and selected_arch
12053 as all the other target-specific codegen decisions are
12054 derived from them. */
12055
12056 void
12057 aarch64_override_options_internal (struct gcc_options *opts)
12058 {
12059 aarch64_tune_flags = selected_tune->flags;
12060 aarch64_tune = selected_tune->sched_core;
12061 /* Make a copy of the tuning parameters attached to the core, which
12062 we may later overwrite. */
12063 aarch64_tune_params = *(selected_tune->tune);
12064 aarch64_architecture_version = selected_arch->architecture_version;
12065
12066 if (opts->x_aarch64_override_tune_string)
12067 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
12068 &aarch64_tune_params);
12069
12070 /* This target defaults to strict volatile bitfields. */
12071 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
12072 opts->x_flag_strict_volatile_bitfields = 1;
12073
12074 if (aarch64_stack_protector_guard == SSP_GLOBAL
12075 && opts->x_aarch64_stack_protector_guard_offset_str)
12076 {
12077 error ("incompatible options %<-mstack-protector-guard=global%> and "
12078 "%<-mstack-protector-guard-offset=%s%>",
12079 aarch64_stack_protector_guard_offset_str);
12080 }
12081
12082 if (aarch64_stack_protector_guard == SSP_SYSREG
12083 && !(opts->x_aarch64_stack_protector_guard_offset_str
12084 && opts->x_aarch64_stack_protector_guard_reg_str))
12085 {
12086 error ("both %<-mstack-protector-guard-offset%> and "
12087 "%<-mstack-protector-guard-reg%> must be used "
12088 "with %<-mstack-protector-guard=sysreg%>");
12089 }
12090
12091 if (opts->x_aarch64_stack_protector_guard_reg_str)
12092 {
12093 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
12094 error ("specify a system register with a small string length.");
12095 }
12096
12097 if (opts->x_aarch64_stack_protector_guard_offset_str)
12098 {
12099 char *end;
12100 const char *str = aarch64_stack_protector_guard_offset_str;
12101 errno = 0;
12102 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
12103 if (!*str || *end || errno)
12104 error ("%qs is not a valid offset in %qs", str,
12105 "-mstack-protector-guard-offset=");
12106 aarch64_stack_protector_guard_offset = offs;
12107 }
12108
12109 initialize_aarch64_code_model (opts);
12110 initialize_aarch64_tls_size (opts);
12111
12112 int queue_depth = 0;
12113 switch (aarch64_tune_params.autoprefetcher_model)
12114 {
12115 case tune_params::AUTOPREFETCHER_OFF:
12116 queue_depth = -1;
12117 break;
12118 case tune_params::AUTOPREFETCHER_WEAK:
12119 queue_depth = 0;
12120 break;
12121 case tune_params::AUTOPREFETCHER_STRONG:
12122 queue_depth = max_insn_queue_index + 1;
12123 break;
12124 default:
12125 gcc_unreachable ();
12126 }
12127
12128 /* We don't mind passing in global_options_set here as we don't use
12129 the *options_set structs anyway. */
12130 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
12131 queue_depth,
12132 opts->x_param_values,
12133 global_options_set.x_param_values);
12134
12135 /* Set up parameters to be used in prefetching algorithm. Do not
12136 override the defaults unless we are tuning for a core we have
12137 researched values for. */
12138 if (aarch64_tune_params.prefetch->num_slots > 0)
12139 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
12140 aarch64_tune_params.prefetch->num_slots,
12141 opts->x_param_values,
12142 global_options_set.x_param_values);
12143 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
12144 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
12145 aarch64_tune_params.prefetch->l1_cache_size,
12146 opts->x_param_values,
12147 global_options_set.x_param_values);
12148 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
12149 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
12150 aarch64_tune_params.prefetch->l1_cache_line_size,
12151 opts->x_param_values,
12152 global_options_set.x_param_values);
12153 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
12154 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
12155 aarch64_tune_params.prefetch->l2_cache_size,
12156 opts->x_param_values,
12157 global_options_set.x_param_values);
12158 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
12159 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
12160 0,
12161 opts->x_param_values,
12162 global_options_set.x_param_values);
12163 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
12164 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
12165 aarch64_tune_params.prefetch->minimum_stride,
12166 opts->x_param_values,
12167 global_options_set.x_param_values);
12168
12169 /* Use the alternative scheduling-pressure algorithm by default. */
12170 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
12171 opts->x_param_values,
12172 global_options_set.x_param_values);
12173
12174 /* If the user hasn't changed it via configure then set the default to 64 KB
12175 for the backend. */
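/* The parameter value is the log2 of the guard size in bytes, so the
   16 below corresponds to the 64 KB default mentioned above.  */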
12176 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
12177 DEFAULT_STK_CLASH_GUARD_SIZE == 0
12178 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
12179 opts->x_param_values,
12180 global_options_set.x_param_values);
12181
12182 /* Validate the guard size. */
12183 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
12184
12185 /* Enforce that the probing interval is the same as the guard size so the
12186 mid-end does the right thing. */
12187 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
12188 guard_size,
12189 opts->x_param_values,
12190 global_options_set.x_param_values);
12191
12192 /* The maybe_set calls won't update the value if the user has explicitly set
12193 one. Which means we need to validate that probing interval and guard size
12194 are equal. */
12195 int probe_interval
12196 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
12197 if (guard_size != probe_interval)
12198 error ("stack clash guard size %<%d%> must be equal to probing interval "
12199 "%<%d%>", guard_size, probe_interval);
12200
12201 /* Enable software prefetching at the specified optimization level for
12202 CPUs that have prefetch. Lower the optimization level threshold by 1
12203 when profiling is enabled. */
12204 if (opts->x_flag_prefetch_loop_arrays < 0
12205 && !opts->x_optimize_size
12206 && aarch64_tune_params.prefetch->default_opt_level >= 0
12207 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
12208 opts->x_flag_prefetch_loop_arrays = 1;
12209
12210 if (opts->x_aarch64_arch_string == NULL)
12211 opts->x_aarch64_arch_string = selected_arch->name;
12212 if (opts->x_aarch64_cpu_string == NULL)
12213 opts->x_aarch64_cpu_string = selected_cpu->name;
12214 if (opts->x_aarch64_tune_string == NULL)
12215 opts->x_aarch64_tune_string = selected_tune->name;
12216
12217 aarch64_override_options_after_change_1 (opts);
12218 }
12219
12220 /* Print a hint with a suggestion for a core or architecture name that
12221 most closely resembles what the user passed in STR. ARCH is true if
12222 the user is asking for an architecture name. ARCH is false if the user
12223 is asking for a core name. */
12224
12225 static void
12226 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
12227 {
12228 auto_vec<const char *> candidates;
12229 const struct processor *entry = arch ? all_architectures : all_cores;
12230 for (; entry->name != NULL; entry++)
12231 candidates.safe_push (entry->name);
12232
12233 #ifdef HAVE_LOCAL_CPU_DETECT
12234 /* Add also "native" as possible value. */
12235 if (arch)
12236 candidates.safe_push ("native");
12237 #endif
12238
12239 char *s;
12240 const char *hint = candidates_list_and_hint (str, s, candidates);
12241 if (hint)
12242 inform (input_location, "valid arguments are: %s;"
12243 " did you mean %qs?", s, hint);
12244 else
12245 inform (input_location, "valid arguments are: %s", s);
12246
12247 XDELETEVEC (s);
12248 }
12249
12250 /* Print a hint with a suggestion for a core name that most closely resembles
12251 what the user passed in STR. */
12252
12253 inline static void
12254 aarch64_print_hint_for_core (const char *str)
12255 {
12256 aarch64_print_hint_for_core_or_arch (str, false);
12257 }
12258
12259 /* Print a hint with a suggestion for an architecture name that most closely
12260 resembles what the user passed in STR. */
12261
12262 inline static void
12263 aarch64_print_hint_for_arch (const char *str)
12264 {
12265 aarch64_print_hint_for_core_or_arch (str, true);
12266 }
12267
12268
12269 /* Print a hint with a suggestion for an extension name
12270 that most closely resembles what the user passed in STR. */
12271
12272 void
12273 aarch64_print_hint_for_extensions (const std::string &str)
12274 {
12275 auto_vec<const char *> candidates;
12276 aarch64_get_all_extension_candidates (&candidates);
12277 char *s;
12278 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
12279 if (hint)
12280 inform (input_location, "valid arguments are: %s;"
12281 " did you mean %qs?", s, hint);
12282 else
12283 inform (input_location, "valid arguments are: %s;", s);
12284
12285 XDELETEVEC (s);
12286 }
12287
12288 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
12289 specified in STR and throw errors if appropriate. Put the results if
12290 they are valid in RES and ISA_FLAGS. Return whether the option is
12291 valid. */
12292
12293 static bool
12294 aarch64_validate_mcpu (const char *str, const struct processor **res,
12295 uint64_t *isa_flags)
12296 {
12297 std::string invalid_extension;
12298 enum aarch64_parse_opt_result parse_res
12299 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
12300
12301 if (parse_res == AARCH64_PARSE_OK)
12302 return true;
12303
12304 switch (parse_res)
12305 {
12306 case AARCH64_PARSE_MISSING_ARG:
12307 error ("missing cpu name in %<-mcpu=%s%>", str);
12308 break;
12309 case AARCH64_PARSE_INVALID_ARG:
12310 error ("unknown value %qs for %<-mcpu%>", str);
12311 aarch64_print_hint_for_core (str);
12312 break;
12313 case AARCH64_PARSE_INVALID_FEATURE:
12314 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
12315 invalid_extension.c_str (), str);
12316 aarch64_print_hint_for_extensions (invalid_extension);
12317 break;
12318 default:
12319 gcc_unreachable ();
12320 }
12321
12322 return false;
12323 }
12324
12325 /* Parses CONST_STR for branch protection features specified in
12326 aarch64_branch_protect_types, and set any global variables required. Returns
12327 the parsing result and assigns LAST_STR to the last processed token from
12328 CONST_STR so that it can be used for error reporting. */
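/* As an example of the expected shape (assuming the type names defined in
   aarch64_branch_protect_types, e.g. "pac-ret" with a "leaf" subtype and
   "bti"), a string such as "pac-ret+leaf+bti" is split on '+'; each token
   is matched first against the top-level types and then against the
   current type's subtypes.  */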
12329
12330 static enum
12331 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
12332 char** last_str)
12333 {
12334 char *str_root = xstrdup (const_str);
12335 char* token_save = NULL;
12336 char *str = strtok_r (str_root, "+", &token_save);
12337 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
12338 if (!str)
12339 res = AARCH64_PARSE_MISSING_ARG;
12340 else
12341 {
12342 char *next_str = strtok_r (NULL, "+", &token_save);
12343 /* Reset the branch protection features to their defaults. */
12344 aarch64_handle_no_branch_protection (NULL, NULL);
12345
12346 while (str && res == AARCH64_PARSE_OK)
12347 {
12348 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
12349 bool found = false;
12350 /* Search for this type. */
12351 while (type && type->name && !found && res == AARCH64_PARSE_OK)
12352 {
12353 if (strcmp (str, type->name) == 0)
12354 {
12355 found = true;
12356 res = type->handler (str, next_str);
12357 str = next_str;
12358 next_str = strtok_r (NULL, "+", &token_save);
12359 }
12360 else
12361 type++;
12362 }
12363 if (found && res == AARCH64_PARSE_OK)
12364 {
12365 bool found_subtype = true;
12366 /* Loop through each token until we find one that isn't a
12367 subtype. */
12368 while (found_subtype)
12369 {
12370 found_subtype = false;
12371 const aarch64_branch_protect_type *subtype = type->subtypes;
12372 /* Search for the subtype. */
12373 while (str && subtype && subtype->name && !found_subtype
12374 && res == AARCH64_PARSE_OK)
12375 {
12376 if (strcmp (str, subtype->name) == 0)
12377 {
12378 found_subtype = true;
12379 res = subtype->handler (str, next_str);
12380 str = next_str;
12381 next_str = strtok_r (NULL, "+", &token_save);
12382 }
12383 else
12384 subtype++;
12385 }
12386 }
12387 }
12388 else if (!found)
12389 res = AARCH64_PARSE_INVALID_ARG;
12390 }
12391 }
12392 /* Copy the last processed token into the argument to pass it back.
12393 Used by option and attribute validation to print the offending token. */
12394 if (last_str)
12395 {
12396 if (str) strcpy (*last_str, str);
12397 else *last_str = NULL;
12398 }
12399 if (res == AARCH64_PARSE_OK)
12400 {
12401 /* If needed, alloc the accepted string then copy in const_str.
12402 Used by aarch64_override_options_after_change_1. */
12403 if (!accepted_branch_protection_string)
12404 accepted_branch_protection_string = (char *) xmalloc (
12405 BRANCH_PROTECT_STR_MAX
12406 + 1);
12407 strncpy (accepted_branch_protection_string, const_str,
12408 BRANCH_PROTECT_STR_MAX + 1);
12409 /* Forcibly null-terminate. */
12410 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
12411 }
12412 return res;
12413 }
12414
12415 static bool
12416 aarch64_validate_mbranch_protection (const char *const_str)
12417 {
12418 char *str = (char *) xmalloc (strlen (const_str) + 1);
12419 enum aarch64_parse_opt_result res =
12420 aarch64_parse_branch_protection (const_str, &str);
12421 if (res == AARCH64_PARSE_INVALID_ARG)
12422 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
12423 else if (res == AARCH64_PARSE_MISSING_ARG)
12424 error ("missing argument for %<-mbranch-protection=%>");
12425 free (str);
12426 return res == AARCH64_PARSE_OK;
12427 }
12428
12429 /* Validate a command-line -march option. Parse the arch and extensions
12430 (if any) specified in STR and throw errors if appropriate. Put the
12431 results, if they are valid, in RES and ISA_FLAGS. Return whether the
12432 option is valid. */
12433
12434 static bool
12435 aarch64_validate_march (const char *str, const struct processor **res,
12436 uint64_t *isa_flags)
12437 {
12438 std::string invalid_extension;
12439 enum aarch64_parse_opt_result parse_res
12440 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
12441
12442 if (parse_res == AARCH64_PARSE_OK)
12443 return true;
12444
12445 switch (parse_res)
12446 {
12447 case AARCH64_PARSE_MISSING_ARG:
12448 error ("missing arch name in %<-march=%s%>", str);
12449 break;
12450 case AARCH64_PARSE_INVALID_ARG:
12451 error ("unknown value %qs for %<-march%>", str);
12452 aarch64_print_hint_for_arch (str);
12453 break;
12454 case AARCH64_PARSE_INVALID_FEATURE:
12455 error ("invalid feature modifier %qs in %<-march=%s%>",
12456 invalid_extension.c_str (), str);
12457 aarch64_print_hint_for_extensions (invalid_extension);
12458 break;
12459 default:
12460 gcc_unreachable ();
12461 }
12462
12463 return false;
12464 }
12465
12466 /* Validate a command-line -mtune option. Parse the cpu
12467 specified in STR and throw errors if appropriate. Put the
12468 result, if it is valid, in RES. Return whether the option is
12469 valid. */
12470
12471 static bool
12472 aarch64_validate_mtune (const char *str, const struct processor **res)
12473 {
12474 enum aarch64_parse_opt_result parse_res
12475 = aarch64_parse_tune (str, res);
12476
12477 if (parse_res == AARCH64_PARSE_OK)
12478 return true;
12479
12480 switch (parse_res)
12481 {
12482 case AARCH64_PARSE_MISSING_ARG:
12483 error ("missing cpu name in %<-mtune=%s%>", str);
12484 break;
12485 case AARCH64_PARSE_INVALID_ARG:
12486 error ("unknown value %qs for %<-mtune%>", str);
12487 aarch64_print_hint_for_core (str);
12488 break;
12489 default:
12490 gcc_unreachable ();
12491 }
12492 return false;
12493 }
12494
12495 /* Return the CPU corresponding to the enum CPU.
12496 If it doesn't specify a cpu, return the default. */
12497
12498 static const struct processor *
12499 aarch64_get_tune_cpu (enum aarch64_processor cpu)
12500 {
12501 if (cpu != aarch64_none)
12502 return &all_cores[cpu];
12503
12504 /* The & 0x3f is to extract the bottom 6 bits that encode the
12505 default cpu as selected by the --with-cpu GCC configure option
12506 in config.gcc.
12507 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
12508 flags mechanism should be reworked to make it more sane. */
12509 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12510 }
12511
12512 /* Return the architecture corresponding to the enum ARCH.
12513 If it doesn't specify a valid architecture, return the default. */
12514
12515 static const struct processor *
12516 aarch64_get_arch (enum aarch64_arch arch)
12517 {
12518 if (arch != aarch64_no_arch)
12519 return &all_architectures[arch];
12520
12521 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
12522
12523 return &all_architectures[cpu->arch];
12524 }
12525
12526 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
12527
12528 static poly_uint16
12529 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
12530 {
12531 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
12532 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
12533 deciding which .md file patterns to use and when deciding whether
12534 something is a legitimate address or constant. */
12535 if (value == SVE_SCALABLE || value == SVE_128)
12536 return poly_uint16 (2, 2);
12537 else
12538 return (int) value / 64;
12539 }
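
/* Editorial note (hedged, not part of this file): for the fixed-length
   settings the mapping above is simply bits / 64, e.g. 256 -> VG 4,
   512 -> VG 8, 2048 -> VG 32, while SVE_SCALABLE (and, per the comment
   above, SVE_128) keeps the length symbolic as poly_uint16 (2, 2).
   A scalar sketch of the fixed-length case; the function name is
   illustrative only.  */

static inline unsigned int
sve_vg_for_fixed_vector_bits (unsigned int bits)
{
  /* Assumes BITS is one of the fixed -msve-vector-bits values above 128.  */
  return bits / 64;
}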
12540
12541 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
12542 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
12543 tuning structs. In particular it must set selected_tune and
12544 aarch64_isa_flags that define the available ISA features and tuning
12545 decisions. It must also set selected_arch as this will be used to
12546 output the .arch asm tags for each function. */
12547
12548 static void
12549 aarch64_override_options (void)
12550 {
12551 uint64_t cpu_isa = 0;
12552 uint64_t arch_isa = 0;
12553 aarch64_isa_flags = 0;
12554
12555 bool valid_cpu = true;
12556 bool valid_tune = true;
12557 bool valid_arch = true;
12558
12559 selected_cpu = NULL;
12560 selected_arch = NULL;
12561 selected_tune = NULL;
12562
12563 if (aarch64_branch_protection_string)
12564 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
12565
12566 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
12567 If either of -march or -mtune is given, they override their
12568 respective component of -mcpu. */
12569 if (aarch64_cpu_string)
12570 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
12571 &cpu_isa);
12572
12573 if (aarch64_arch_string)
12574 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
12575 &arch_isa);
12576
12577 if (aarch64_tune_string)
12578 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
12579
12580 #ifdef SUBTARGET_OVERRIDE_OPTIONS
12581 SUBTARGET_OVERRIDE_OPTIONS;
12582 #endif
12583
12584 /* If the user did not specify a processor, choose the default
12585 one for them. This will be the CPU set during configuration using
12586 --with-cpu, otherwise it is "generic". */
12587 if (!selected_cpu)
12588 {
12589 if (selected_arch)
12590 {
12591 selected_cpu = &all_cores[selected_arch->ident];
12592 aarch64_isa_flags = arch_isa;
12593 explicit_arch = selected_arch->arch;
12594 }
12595 else
12596 {
12597 /* Get default configure-time CPU. */
12598 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12599 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12600 }
12601
12602 if (selected_tune)
12603 explicit_tune_core = selected_tune->ident;
12604 }
12605 /* If both -mcpu and -march are specified check that they are architecturally
12606 compatible, warn if they're not and prefer the -march ISA flags. */
12607 else if (selected_arch)
12608 {
12609 if (selected_arch->arch != selected_cpu->arch)
12610 {
12611 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12612 all_architectures[selected_cpu->arch].name,
12613 selected_arch->name);
12614 }
12615 aarch64_isa_flags = arch_isa;
12616 explicit_arch = selected_arch->arch;
12617 explicit_tune_core = selected_tune ? selected_tune->ident
12618 : selected_cpu->ident;
12619 }
12620 else
12621 {
12622 /* -mcpu but no -march. */
12623 aarch64_isa_flags = cpu_isa;
12624 explicit_tune_core = selected_tune ? selected_tune->ident
12625 : selected_cpu->ident;
12626 gcc_assert (selected_cpu);
12627 selected_arch = &all_architectures[selected_cpu->arch];
12628 explicit_arch = selected_arch->arch;
12629 }
12630
12631 /* Set the arch as well, as we will need it when outputting
12632 the .arch directive in assembly. */
12633 if (!selected_arch)
12634 {
12635 gcc_assert (selected_cpu);
12636 selected_arch = &all_architectures[selected_cpu->arch];
12637 }
12638
12639 if (!selected_tune)
12640 selected_tune = selected_cpu;
12641
12642 if (aarch64_enable_bti == 2)
12643 {
12644 #ifdef TARGET_ENABLE_BTI
12645 aarch64_enable_bti = 1;
12646 #else
12647 aarch64_enable_bti = 0;
12648 #endif
12649 }
12650
12651 /* Return address signing is currently not supported for ILP32 targets. For
12652 LP64 targets use the configured option in the absence of a command-line
12653 option for -mbranch-protection. */
12654 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12655 {
12656 #ifdef TARGET_ENABLE_PAC_RET
12657 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12658 #else
12659 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12660 #endif
12661 }
12662
12663 #ifndef HAVE_AS_MABI_OPTION
12664 /* The compiler may have been configured with 2.23.* binutils, which does
12665 not have support for ILP32. */
12666 if (TARGET_ILP32)
12667 error ("assembler does not support %<-mabi=ilp32%>");
12668 #endif
12669
12670 /* Convert -msve-vector-bits to a VG count. */
12671 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12672
12673 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12674 sorry ("return address signing is only supported for %<-mabi=lp64%>");
12675
12676 /* Make sure we properly set up the explicit options. */
12677 if ((aarch64_cpu_string && valid_cpu)
12678 || (aarch64_tune_string && valid_tune))
12679 gcc_assert (explicit_tune_core != aarch64_none);
12680
12681 if ((aarch64_cpu_string && valid_cpu)
12682 || (aarch64_arch_string && valid_arch))
12683 gcc_assert (explicit_arch != aarch64_no_arch);
12684
12685 /* The pass to insert speculation tracking runs before
12686 shrink-wrapping and the latter does not know how to update the
12687 tracking status. So disable it in this case. */
12688 if (aarch64_track_speculation)
12689 flag_shrink_wrap = 0;
12690
12691 aarch64_override_options_internal (&global_options);
12692
12693 /* Save these options as the default ones in case we push and pop them later
12694 while processing functions with potential target attributes. */
12695 target_option_default_node = target_option_current_node
12696 = build_target_option_node (&global_options);
12697 }
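
/* Editorial worked example (hedged, not part of this file), restating the
   precedence implemented above:

     -mcpu=cortex-a57                     arch, tune and ISA taken from the CPU
     -march=armv8.1-a                     arch and ISA from -march; the CPU and
                                          tuning default to the core associated
                                          with that architecture entry
     -mcpu=cortex-a57 -mtune=cortex-a72   arch and ISA from the CPU, tuning
                                          overridden by -mtune
     -mcpu=cortex-a57 -march=armv8.1-a    arch and ISA from -march, with a
                                          warning if the CPU's architecture
                                          differs.  */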
12698
12699 /* Implement targetm.override_options_after_change. */
12700
12701 static void
12702 aarch64_override_options_after_change (void)
12703 {
12704 aarch64_override_options_after_change_1 (&global_options);
12705 }
12706
12707 static struct machine_function *
12708 aarch64_init_machine_status (void)
12709 {
12710 struct machine_function *machine;
12711 machine = ggc_cleared_alloc<machine_function> ();
12712 return machine;
12713 }
12714
12715 void
12716 aarch64_init_expanders (void)
12717 {
12718 init_machine_status = aarch64_init_machine_status;
12719 }
12720
12721 /* Select the code model to use given the command-line options, adjusting for -fpic/-fPIC and diagnosing unsupported combinations. */
12722 static void
12723 initialize_aarch64_code_model (struct gcc_options *opts)
12724 {
12725 if (opts->x_flag_pic)
12726 {
12727 switch (opts->x_aarch64_cmodel_var)
12728 {
12729 case AARCH64_CMODEL_TINY:
12730 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12731 break;
12732 case AARCH64_CMODEL_SMALL:
12733 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12734 aarch64_cmodel = (flag_pic == 2
12735 ? AARCH64_CMODEL_SMALL_PIC
12736 : AARCH64_CMODEL_SMALL_SPIC);
12737 #else
12738 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12739 #endif
12740 break;
12741 case AARCH64_CMODEL_LARGE:
12742 sorry ("code model %qs with %<-f%s%>", "large",
12743 opts->x_flag_pic > 1 ? "PIC" : "pic");
12744 break;
12745 default:
12746 gcc_unreachable ();
12747 }
12748 }
12749 else
12750 aarch64_cmodel = opts->x_aarch64_cmodel_var;
12751 }
12752
12753 /* Implement TARGET_OPTION_SAVE. */
12754
12755 static void
12756 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12757 {
12758 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12759 ptr->x_aarch64_branch_protection_string
12760 = opts->x_aarch64_branch_protection_string;
12761 }
12762
12763 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
12764 using the information saved in PTR. */
12765
12766 static void
12767 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12768 {
12769 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12770 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12771 opts->x_explicit_arch = ptr->x_explicit_arch;
12772 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12773 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12774 opts->x_aarch64_branch_protection_string
12775 = ptr->x_aarch64_branch_protection_string;
12776 if (opts->x_aarch64_branch_protection_string)
12777 {
12778 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12779 NULL);
12780 }
12781
12782 aarch64_override_options_internal (opts);
12783 }
12784
12785 /* Implement TARGET_OPTION_PRINT. */
12786
12787 static void
12788 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12789 {
12790 const struct processor *cpu
12791 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12792 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
12793 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12794 std::string extension
12795 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12796
12797 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12798 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12799 arch->name, extension.c_str ());
12800 }
12801
12802 static GTY(()) tree aarch64_previous_fndecl;
12803
12804 void
12805 aarch64_reset_previous_fndecl (void)
12806 {
12807 aarch64_previous_fndecl = NULL;
12808 }
12809
12810 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12811 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12812 make sure optab availability predicates are recomputed when necessary. */
12813
12814 void
12815 aarch64_save_restore_target_globals (tree new_tree)
12816 {
12817 if (TREE_TARGET_GLOBALS (new_tree))
12818 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12819 else if (new_tree == target_option_default_node)
12820 restore_target_globals (&default_target_globals);
12821 else
12822 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12823 }
12824
12825 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
12826 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12827 of the function, if such exists. This function may be called multiple
12828 times on a single function so use aarch64_previous_fndecl to avoid
12829 setting up identical state. */
12830
12831 static void
12832 aarch64_set_current_function (tree fndecl)
12833 {
12834 if (!fndecl || fndecl == aarch64_previous_fndecl)
12835 return;
12836
12837 tree old_tree = (aarch64_previous_fndecl
12838 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12839 : NULL_TREE);
12840
12841 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12842
12843 /* If current function has no attributes but the previous one did,
12844 use the default node. */
12845 if (!new_tree && old_tree)
12846 new_tree = target_option_default_node;
12847
12848 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
12849 the default have been handled by aarch64_save_restore_target_globals from
12850 aarch64_pragma_target_parse. */
12851 if (old_tree == new_tree)
12852 return;
12853
12854 aarch64_previous_fndecl = fndecl;
12855
12856 /* First set the target options. */
12857 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12858
12859 aarch64_save_restore_target_globals (new_tree);
12860 }
12861
12862 /* Enum describing the various ways we can handle attributes.
12863 In many cases we can reuse the generic option handling machinery. */
12864
12865 enum aarch64_attr_opt_type
12866 {
12867 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
12868 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
12869 aarch64_attr_enum, /* Attribute sets an enum variable. */
12870 aarch64_attr_custom /* Attribute requires a custom handling function. */
12871 };
12872
12873 /* All the information needed to handle a target attribute.
12874 NAME is the name of the attribute.
12875 ATTR_TYPE specifies the type of behavior of the attribute as described
12876 in the definition of enum aarch64_attr_opt_type.
12877 ALLOW_NEG is true if the attribute supports a "no-" form.
12878 HANDLER is the function that takes the attribute string as an argument
12879 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12880 OPT_NUM is the enum specifying the option that the attribute modifies.
12881 This is needed for attributes that mirror the behavior of a command-line
12882 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12883 aarch64_attr_enum. */
12884
12885 struct aarch64_attribute_info
12886 {
12887 const char *name;
12888 enum aarch64_attr_opt_type attr_type;
12889 bool allow_neg;
12890 bool (*handler) (const char *);
12891 enum opt_code opt_num;
12892 };
12893
12894 /* Handle the argument STR to the arch= target attribute. */
12895
12896 static bool
12897 aarch64_handle_attr_arch (const char *str)
12898 {
12899 const struct processor *tmp_arch = NULL;
12900 std::string invalid_extension;
12901 enum aarch64_parse_opt_result parse_res
12902 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12903
12904 if (parse_res == AARCH64_PARSE_OK)
12905 {
12906 gcc_assert (tmp_arch);
12907 selected_arch = tmp_arch;
12908 explicit_arch = selected_arch->arch;
12909 return true;
12910 }
12911
12912 switch (parse_res)
12913 {
12914 case AARCH64_PARSE_MISSING_ARG:
12915 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12916 break;
12917 case AARCH64_PARSE_INVALID_ARG:
12918 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12919 aarch64_print_hint_for_arch (str);
12920 break;
12921 case AARCH64_PARSE_INVALID_FEATURE:
12922 error ("invalid feature modifier %s of value (\"%s\") in "
12923 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12924 aarch64_print_hint_for_extensions (invalid_extension);
12925 break;
12926 default:
12927 gcc_unreachable ();
12928 }
12929
12930 return false;
12931 }
12932
12933 /* Handle the argument STR to the cpu= target attribute. */
12934
12935 static bool
12936 aarch64_handle_attr_cpu (const char *str)
12937 {
12938 const struct processor *tmp_cpu = NULL;
12939 std::string invalid_extension;
12940 enum aarch64_parse_opt_result parse_res
12941 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12942
12943 if (parse_res == AARCH64_PARSE_OK)
12944 {
12945 gcc_assert (tmp_cpu);
12946 selected_tune = tmp_cpu;
12947 explicit_tune_core = selected_tune->ident;
12948
12949 selected_arch = &all_architectures[tmp_cpu->arch];
12950 explicit_arch = selected_arch->arch;
12951 return true;
12952 }
12953
12954 switch (parse_res)
12955 {
12956 case AARCH64_PARSE_MISSING_ARG:
12957 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12958 break;
12959 case AARCH64_PARSE_INVALID_ARG:
12960 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12961 aarch64_print_hint_for_core (str);
12962 break;
12963 case AARCH64_PARSE_INVALID_FEATURE:
12964 error ("invalid feature modifier %s of value (\"%s\") in "
12965 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12966 aarch64_print_hint_for_extensions (invalid_extension);
12967 break;
12968 default:
12969 gcc_unreachable ();
12970 }
12971
12972 return false;
12973 }
12974
12975 /* Handle the argument STR to the branch-protection= attribute. */
12976
12977 static bool
12978 aarch64_handle_attr_branch_protection (const char* str)
12979 {
12980 char *err_str = (char *) xmalloc (strlen (str) + 1);
12981 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12982 &err_str);
12983 bool success = false;
12984 switch (res)
12985 {
12986 case AARCH64_PARSE_MISSING_ARG:
12987 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12988 " attribute");
12989 break;
12990 case AARCH64_PARSE_INVALID_ARG:
12991 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12992 "=\")%> pragma or attribute", err_str);
12993 break;
12994 case AARCH64_PARSE_OK:
12995 success = true;
12996 /* Fall through. */
12997 case AARCH64_PARSE_INVALID_FEATURE:
12998 break;
12999 default:
13000 gcc_unreachable ();
13001 }
13002 free (err_str);
13003 return success;
13004 }
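
/* Editorial example (hedged, not part of this file): the same strings
   accepted by -mbranch-protection= can be applied per function via the
   "branch-protection" entry in the attribute table below; the function
   name is illustrative only.  */

__attribute__ ((target ("branch-protection=standard")))
void hardened_entry_point (void);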
13005
13006 /* Handle the argument STR to the tune= target attribute. */
13007
13008 static bool
13009 aarch64_handle_attr_tune (const char *str)
13010 {
13011 const struct processor *tmp_tune = NULL;
13012 enum aarch64_parse_opt_result parse_res
13013 = aarch64_parse_tune (str, &tmp_tune);
13014
13015 if (parse_res == AARCH64_PARSE_OK)
13016 {
13017 gcc_assert (tmp_tune);
13018 selected_tune = tmp_tune;
13019 explicit_tune_core = selected_tune->ident;
13020 return true;
13021 }
13022
13023 switch (parse_res)
13024 {
13025 case AARCH64_PARSE_INVALID_ARG:
13026 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
13027 aarch64_print_hint_for_core (str);
13028 break;
13029 default:
13030 gcc_unreachable ();
13031 }
13032
13033 return false;
13034 }
13035
13036 /* Parse an architecture extensions target attribute string specified in STR.
13037 For example "+fp+nosimd". Show any errors if needed. Return TRUE
13038 if successful. Update aarch64_isa_flags to reflect the ISA features
13039 modified. */
13040
13041 static bool
13042 aarch64_handle_attr_isa_flags (char *str)
13043 {
13044 enum aarch64_parse_opt_result parse_res;
13045 uint64_t isa_flags = aarch64_isa_flags;
13046
13047 /* We allow "+nothing" in the beginning to clear out all architectural
13048 features if the user wants to handpick specific features. */
13049 if (strncmp ("+nothing", str, 8) == 0)
13050 {
13051 isa_flags = 0;
13052 str += 8;
13053 }
13054
13055 std::string invalid_extension;
13056 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
13057
13058 if (parse_res == AARCH64_PARSE_OK)
13059 {
13060 aarch64_isa_flags = isa_flags;
13061 return true;
13062 }
13063
13064 switch (parse_res)
13065 {
13066 case AARCH64_PARSE_MISSING_ARG:
13067 error ("missing value in %<target()%> pragma or attribute");
13068 break;
13069
13070 case AARCH64_PARSE_INVALID_FEATURE:
13071 error ("invalid feature modifier %s of value (\"%s\") in "
13072 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
13073 break;
13074
13075 default:
13076 gcc_unreachable ();
13077 }
13078
13079 return false;
13080 }
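
/* Editorial example (hedged, not part of this file): an attribute string
   beginning with '+' is the bare-extension form handled above;
   "+nothing+fp" first clears the extension bits and then enables just the
   FP feature set.  The function name is illustrative only.  */

__attribute__ ((target ("+nothing+fp")))
void fp_only_helper (void);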
13081
13082 /* The target attributes that we support. On top of these we also support just
13083 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
13084 handled explicitly in aarch64_process_one_target_attr. */
13085
13086 static const struct aarch64_attribute_info aarch64_attributes[] =
13087 {
13088 { "general-regs-only", aarch64_attr_mask, false, NULL,
13089 OPT_mgeneral_regs_only },
13090 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
13091 OPT_mfix_cortex_a53_835769 },
13092 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
13093 OPT_mfix_cortex_a53_843419 },
13094 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
13095 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
13096 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
13097 OPT_momit_leaf_frame_pointer },
13098 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
13099 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
13100 OPT_march_ },
13101 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
13102 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
13103 OPT_mtune_ },
13104 { "branch-protection", aarch64_attr_custom, false,
13105 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
13106 { "sign-return-address", aarch64_attr_enum, false, NULL,
13107 OPT_msign_return_address_ },
13108 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
13109 };
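
/* Editorial example (hedged, not part of this file): typical uses of the
   entries above, with illustrative function names.  */

__attribute__ ((target ("arch=armv8.1-a")))
void armv8_1_only_path (void);

__attribute__ ((target ("no-omit-leaf-frame-pointer")))
void keep_leaf_frame_pointer (void);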
13110
13111 /* Parse ARG_STR which contains the definition of one target attribute.
13112 Show appropriate errors if any or return true if the attribute is valid. */
13113
13114 static bool
13115 aarch64_process_one_target_attr (char *arg_str)
13116 {
13117 bool invert = false;
13118
13119 size_t len = strlen (arg_str);
13120
13121 if (len == 0)
13122 {
13123 error ("malformed %<target()%> pragma or attribute");
13124 return false;
13125 }
13126
13127 char *str_to_check = (char *) alloca (len + 1);
13128 strcpy (str_to_check, arg_str);
13129
13130 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
13131 It is easier to detect and handle it explicitly here rather than going
13132 through the machinery for the rest of the target attributes in this
13133 function. */
13134 if (*str_to_check == '+')
13135 return aarch64_handle_attr_isa_flags (str_to_check);
13136
13137 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
13138 {
13139 invert = true;
13140 str_to_check += 3;
13141 }
13142 char *arg = strchr (str_to_check, '=');
13143
13144 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
13145 and point ARG to "foo". */
13146 if (arg)
13147 {
13148 *arg = '\0';
13149 arg++;
13150 }
13151 const struct aarch64_attribute_info *p_attr;
13152 bool found = false;
13153 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
13154 {
13155 /* If the names don't match up, or the user has given an argument
13156 to an attribute that doesn't accept one, or didn't give an argument
13157 to an attribute that expects one, fail to match. */
13158 if (strcmp (str_to_check, p_attr->name) != 0)
13159 continue;
13160
13161 found = true;
13162 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
13163 || p_attr->attr_type == aarch64_attr_enum;
13164
13165 if (attr_need_arg_p ^ (arg != NULL))
13166 {
13167 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
13168 return false;
13169 }
13170
13171 /* If the name matches but the attribute does not allow "no-" versions
13172 then we can't match. */
13173 if (invert && !p_attr->allow_neg)
13174 {
13175 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
13176 return false;
13177 }
13178
13179 switch (p_attr->attr_type)
13180 {
13181 /* Has a custom handler registered.
13182 For example, cpu=, arch=, tune=. */
13183 case aarch64_attr_custom:
13184 gcc_assert (p_attr->handler);
13185 if (!p_attr->handler (arg))
13186 return false;
13187 break;
13188
13189 /* Either set or unset a boolean option. */
13190 case aarch64_attr_bool:
13191 {
13192 struct cl_decoded_option decoded;
13193
13194 generate_option (p_attr->opt_num, NULL, !invert,
13195 CL_TARGET, &decoded);
13196 aarch64_handle_option (&global_options, &global_options_set,
13197 &decoded, input_location);
13198 break;
13199 }
13200 /* Set or unset a bit in the target_flags. aarch64_handle_option
13201 should know what mask to apply given the option number. */
13202 case aarch64_attr_mask:
13203 {
13204 struct cl_decoded_option decoded;
13205 /* We only need to specify the option number.
13206 aarch64_handle_option will know which mask to apply. */
13207 decoded.opt_index = p_attr->opt_num;
13208 decoded.value = !invert;
13209 aarch64_handle_option (&global_options, &global_options_set,
13210 &decoded, input_location);
13211 break;
13212 }
13213 /* Use the option setting machinery to set an option to an enum. */
13214 case aarch64_attr_enum:
13215 {
13216 gcc_assert (arg);
13217 bool valid;
13218 int value;
13219 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
13220 &value, CL_TARGET);
13221 if (valid)
13222 {
13223 set_option (&global_options, NULL, p_attr->opt_num, value,
13224 NULL, DK_UNSPECIFIED, input_location,
13225 global_dc);
13226 }
13227 else
13228 {
13229 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
13230 }
13231 break;
13232 }
13233 default:
13234 gcc_unreachable ();
13235 }
13236 }
13237
13238 /* If we reached here we either have found an attribute and validated
13239 it or didn't match any. If we matched an attribute but its arguments
13240 were malformed we will have returned false already. */
13241 return found;
13242 }
13243
13244 /* Count how many times the character C appears in
13245 NULL-terminated string STR. */
13246
13247 static unsigned int
13248 num_occurences_in_str (char c, char *str)
13249 {
13250 unsigned int res = 0;
13251 while (*str != '\0')
13252 {
13253 if (*str == c)
13254 res++;
13255
13256 str++;
13257 }
13258
13259 return res;
13260 }
13261
13262 /* Parse the tree in ARGS that contains the target attribute information
13263 and update the global target options space. */
13264
13265 bool
13266 aarch64_process_target_attr (tree args)
13267 {
13268 if (TREE_CODE (args) == TREE_LIST)
13269 {
13270 do
13271 {
13272 tree head = TREE_VALUE (args);
13273 if (head)
13274 {
13275 if (!aarch64_process_target_attr (head))
13276 return false;
13277 }
13278 args = TREE_CHAIN (args);
13279 } while (args);
13280
13281 return true;
13282 }
13283
13284 if (TREE_CODE (args) != STRING_CST)
13285 {
13286 error ("attribute %<target%> argument not a string");
13287 return false;
13288 }
13289
13290 size_t len = strlen (TREE_STRING_POINTER (args));
13291 char *str_to_check = (char *) alloca (len + 1);
13292 strcpy (str_to_check, TREE_STRING_POINTER (args));
13293
13294 if (len == 0)
13295 {
13296 error ("malformed %<target()%> pragma or attribute");
13297 return false;
13298 }
13299
13300 /* Used to catch empty tokens between commas, e.g.
13301 attribute ((target ("attr1,,attr2"))). */
13302 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
13303
13304 /* Handle multiple target attributes separated by ','. */
13305 char *token = strtok_r (str_to_check, ",", &str_to_check);
13306
13307 unsigned int num_attrs = 0;
13308 while (token)
13309 {
13310 num_attrs++;
13311 if (!aarch64_process_one_target_attr (token))
13312 {
13313 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
13314 return false;
13315 }
13316
13317 token = strtok_r (NULL, ",", &str_to_check);
13318 }
13319
13320 if (num_attrs != num_commas + 1)
13321 {
13322 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
13323 return false;
13324 }
13325
13326 return true;
13327 }
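
/* Editorial example (hedged, not part of this file): several attributes may
   be combined in one comma-separated string; an empty token, as in
   "strict-align,,tune=cortex-a53", is caught by the comma/attribute count
   check above.  The function name is illustrative only.  */

__attribute__ ((target ("strict-align,tune=cortex-a53")))
void strict_align_routine (void);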
13328
13329 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
13330 process attribute ((target ("..."))). */
13331
13332 static bool
13333 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
13334 {
13335 struct cl_target_option cur_target;
13336 bool ret;
13337 tree old_optimize;
13338 tree new_target, new_optimize;
13339 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13340
13341 /* If what we're processing is the current pragma string then the
13342 target option node is already stored in target_option_current_node
13343 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
13344 having to re-parse the string. This is especially useful to keep
13345 arm_neon.h compile times down since that header contains a lot
13346 of intrinsics enclosed in pragmas. */
13347 if (!existing_target && args == current_target_pragma)
13348 {
13349 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
13350 return true;
13351 }
13352 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13353
13354 old_optimize = build_optimization_node (&global_options);
13355 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
13356
13357 /* If the function changed the optimization levels as well as setting
13358 target options, start with the optimizations specified. */
13359 if (func_optimize && func_optimize != old_optimize)
13360 cl_optimization_restore (&global_options,
13361 TREE_OPTIMIZATION (func_optimize));
13362
13363 /* Save the current target options to restore at the end. */
13364 cl_target_option_save (&cur_target, &global_options);
13365
13366 /* If fndecl already has some target attributes applied to it, unpack
13367 them so that we add this attribute on top of them, rather than
13368 overwriting them. */
13369 if (existing_target)
13370 {
13371 struct cl_target_option *existing_options
13372 = TREE_TARGET_OPTION (existing_target);
13373
13374 if (existing_options)
13375 cl_target_option_restore (&global_options, existing_options);
13376 }
13377 else
13378 cl_target_option_restore (&global_options,
13379 TREE_TARGET_OPTION (target_option_current_node));
13380
13381 ret = aarch64_process_target_attr (args);
13382
13383 /* Set up any additional state. */
13384 if (ret)
13385 {
13386 aarch64_override_options_internal (&global_options);
13387 /* Initialize SIMD builtins if we haven't already.
13388 Set current_target_pragma to NULL for the duration so that
13389 the builtin initialization code doesn't try to tag the functions
13390 being built with the attributes specified by any current pragma, thus
13391 going into an infinite recursion. */
13392 if (TARGET_SIMD)
13393 {
13394 tree saved_current_target_pragma = current_target_pragma;
13395 current_target_pragma = NULL;
13396 aarch64_init_simd_builtins ();
13397 current_target_pragma = saved_current_target_pragma;
13398 }
13399 new_target = build_target_option_node (&global_options);
13400 }
13401 else
13402 new_target = NULL;
13403
13404 new_optimize = build_optimization_node (&global_options);
13405
13406 if (fndecl && ret)
13407 {
13408 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
13409
13410 if (old_optimize != new_optimize)
13411 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
13412 }
13413
13414 cl_target_option_restore (&global_options, &cur_target);
13415
13416 if (old_optimize != new_optimize)
13417 cl_optimization_restore (&global_options,
13418 TREE_OPTIMIZATION (old_optimize));
13419 return ret;
13420 }
13421
13422 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
13423 tri-bool options (yes, no, don't care) and the default value is
13424 DEF, determine whether to reject inlining. */
13425
13426 static bool
13427 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
13428 int dont_care, int def)
13429 {
13430 /* If the callee doesn't care, always allow inlining. */
13431 if (callee == dont_care)
13432 return true;
13433
13434 /* If the caller doesn't care, always allow inlining. */
13435 if (caller == dont_care)
13436 return true;
13437
13438 /* Otherwise, allow inlining if either the callee and caller values
13439 agree, or if the callee is using the default value. */
13440 return (callee == caller || callee == def);
13441 }
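
/* Editorial worked example (hedged, not part of this file): with
   DONT_CARE == 2 (the value passed by the callers below) and DEF == 0:

     caller 1, callee 2  ->  true   (callee does not care)
     caller 2, callee 1  ->  true   (caller does not care)
     caller 1, callee 1  ->  true   (values agree)
     caller 1, callee 0  ->  true   (callee uses the default)
     caller 0, callee 1  ->  false  (explicit, non-default mismatch)  */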
13442
13443 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
13444 to inline CALLEE into CALLER based on target-specific info.
13445 Make sure that the caller and callee have compatible architectural
13446 features. Then go through the other possible target attributes
13447 and see if they can block inlining. Try not to reject always_inline
13448 callees unless they are incompatible architecturally. */
13449
13450 static bool
13451 aarch64_can_inline_p (tree caller, tree callee)
13452 {
13453 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
13454 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
13455
13456 struct cl_target_option *caller_opts
13457 = TREE_TARGET_OPTION (caller_tree ? caller_tree
13458 : target_option_default_node);
13459
13460 struct cl_target_option *callee_opts
13461 = TREE_TARGET_OPTION (callee_tree ? callee_tree
13462 : target_option_default_node);
13463
13464 /* Callee's ISA flags should be a subset of the caller's. */
13465 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
13466 != callee_opts->x_aarch64_isa_flags)
13467 return false;
13468
13469 /* Allow non-strict aligned functions inlining into strict
13470 aligned ones. */
13471 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
13472 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
13473 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
13474 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
13475 return false;
13476
13477 bool always_inline = lookup_attribute ("always_inline",
13478 DECL_ATTRIBUTES (callee));
13479
13480 /* If the architectural features match up and the callee is always_inline
13481 then the other attributes don't matter. */
13482 if (always_inline)
13483 return true;
13484
13485 if (caller_opts->x_aarch64_cmodel_var
13486 != callee_opts->x_aarch64_cmodel_var)
13487 return false;
13488
13489 if (caller_opts->x_aarch64_tls_dialect
13490 != callee_opts->x_aarch64_tls_dialect)
13491 return false;
13492
13493 /* Honour explicit requests to work around errata. */
13494 if (!aarch64_tribools_ok_for_inlining_p (
13495 caller_opts->x_aarch64_fix_a53_err835769,
13496 callee_opts->x_aarch64_fix_a53_err835769,
13497 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
13498 return false;
13499
13500 if (!aarch64_tribools_ok_for_inlining_p (
13501 caller_opts->x_aarch64_fix_a53_err843419,
13502 callee_opts->x_aarch64_fix_a53_err843419,
13503 2, TARGET_FIX_ERR_A53_843419))
13504 return false;
13505
13506 /* If the user explicitly specified -momit-leaf-frame-pointer for the
13507 caller and callee and they don't match up, reject inlining. */
13508 if (!aarch64_tribools_ok_for_inlining_p (
13509 caller_opts->x_flag_omit_leaf_frame_pointer,
13510 callee_opts->x_flag_omit_leaf_frame_pointer,
13511 2, 1))
13512 return false;
13513
13514 /* If the callee has specific tuning overrides, respect them. */
13515 if (callee_opts->x_aarch64_override_tune_string != NULL
13516 && caller_opts->x_aarch64_override_tune_string == NULL)
13517 return false;
13518
13519 /* If the user specified tuning override strings for the
13520 caller and callee and they don't match up, reject inlining.
13521 We just do a string compare here, we don't analyze the meaning
13522 of the string, as it would be too costly for little gain. */
13523 if (callee_opts->x_aarch64_override_tune_string
13524 && caller_opts->x_aarch64_override_tune_string
13525 && (strcmp (callee_opts->x_aarch64_override_tune_string,
13526 caller_opts->x_aarch64_override_tune_string) != 0))
13527 return false;
13528
13529 return true;
13530 }
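
/* Editorial example (hedged, not part of this file): under the ISA-subset
   rule above, CALLEE_SVE cannot be inlined into PLAIN_CALLER because the
   callee enables SVE that the caller does not; a plain callee inlined into
   an SVE-enabled caller would pass that particular check.  Names are
   illustrative only.  */

__attribute__ ((target ("+sve")))
static int callee_sve (int x) { return x + 1; }

static int plain_caller (int x) { return callee_sve (x); }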
13531
13532 /* Return true if SYMBOL_REF X binds locally. */
13533
13534 static bool
13535 aarch64_symbol_binds_local_p (const_rtx x)
13536 {
13537 return (SYMBOL_REF_DECL (x)
13538 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
13539 : SYMBOL_REF_LOCAL_P (x));
13540 }
13541
13542 /* Return true if SYMBOL_REF X is thread local */
13543 static bool
13544 aarch64_tls_symbol_p (rtx x)
13545 {
13546 if (! TARGET_HAVE_TLS)
13547 return false;
13548
13549 if (GET_CODE (x) != SYMBOL_REF)
13550 return false;
13551
13552 return SYMBOL_REF_TLS_MODEL (x) != 0;
13553 }
13554
13555 /* Classify a TLS symbol into one of the TLS kinds. */
13556 enum aarch64_symbol_type
13557 aarch64_classify_tls_symbol (rtx x)
13558 {
13559 enum tls_model tls_kind = tls_symbolic_operand_type (x);
13560
13561 switch (tls_kind)
13562 {
13563 case TLS_MODEL_GLOBAL_DYNAMIC:
13564 case TLS_MODEL_LOCAL_DYNAMIC:
13565 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
13566
13567 case TLS_MODEL_INITIAL_EXEC:
13568 switch (aarch64_cmodel)
13569 {
13570 case AARCH64_CMODEL_TINY:
13571 case AARCH64_CMODEL_TINY_PIC:
13572 return SYMBOL_TINY_TLSIE;
13573 default:
13574 return SYMBOL_SMALL_TLSIE;
13575 }
13576
13577 case TLS_MODEL_LOCAL_EXEC:
13578 if (aarch64_tls_size == 12)
13579 return SYMBOL_TLSLE12;
13580 else if (aarch64_tls_size == 24)
13581 return SYMBOL_TLSLE24;
13582 else if (aarch64_tls_size == 32)
13583 return SYMBOL_TLSLE32;
13584 else if (aarch64_tls_size == 48)
13585 return SYMBOL_TLSLE48;
13586 else
13587 gcc_unreachable ();
13588
13589 case TLS_MODEL_EMULATED:
13590 case TLS_MODEL_NONE:
13591 return SYMBOL_FORCE_TO_MEM;
13592
13593 default:
13594 gcc_unreachable ();
13595 }
13596 }
13597
13598 /* Return the correct method for accessing X + OFFSET, where X is either
13599 a SYMBOL_REF or LABEL_REF. */
13600
13601 enum aarch64_symbol_type
13602 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13603 {
13604 if (GET_CODE (x) == LABEL_REF)
13605 {
13606 switch (aarch64_cmodel)
13607 {
13608 case AARCH64_CMODEL_LARGE:
13609 return SYMBOL_FORCE_TO_MEM;
13610
13611 case AARCH64_CMODEL_TINY_PIC:
13612 case AARCH64_CMODEL_TINY:
13613 return SYMBOL_TINY_ABSOLUTE;
13614
13615 case AARCH64_CMODEL_SMALL_SPIC:
13616 case AARCH64_CMODEL_SMALL_PIC:
13617 case AARCH64_CMODEL_SMALL:
13618 return SYMBOL_SMALL_ABSOLUTE;
13619
13620 default:
13621 gcc_unreachable ();
13622 }
13623 }
13624
13625 if (GET_CODE (x) == SYMBOL_REF)
13626 {
13627 if (aarch64_tls_symbol_p (x))
13628 return aarch64_classify_tls_symbol (x);
13629
13630 switch (aarch64_cmodel)
13631 {
13632 case AARCH64_CMODEL_TINY:
13633 /* When we retrieve symbol + offset address, we have to make sure
13634 the offset does not cause overflow of the final address. But
13635 we have no way of knowing the address of symbol at compile time
13636 so we can't accurately say if the distance between the PC and
13637 symbol + offset is outside the addressable range of +/-1M in the
13638 TINY code model. So we rely on images not being greater than
13639 1M, cap the offset at 1M, and require anything beyond 1M to
13640 be loaded using an alternative mechanism. Furthermore if the
13641 symbol is a weak reference to something that isn't known to
13642 resolve to a symbol in this module, then force to memory. */
13643 if ((SYMBOL_REF_WEAK (x)
13644 && !aarch64_symbol_binds_local_p (x))
13645 || !IN_RANGE (offset, -1048575, 1048575))
13646 return SYMBOL_FORCE_TO_MEM;
13647 return SYMBOL_TINY_ABSOLUTE;
13648
13649 case AARCH64_CMODEL_SMALL:
13650 /* Same reasoning as the tiny code model, but the offset cap here is
13651 4G. */
13652 if ((SYMBOL_REF_WEAK (x)
13653 && !aarch64_symbol_binds_local_p (x))
13654 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13655 HOST_WIDE_INT_C (4294967264)))
13656 return SYMBOL_FORCE_TO_MEM;
13657 return SYMBOL_SMALL_ABSOLUTE;
13658
13659 case AARCH64_CMODEL_TINY_PIC:
13660 if (!aarch64_symbol_binds_local_p (x))
13661 return SYMBOL_TINY_GOT;
13662 return SYMBOL_TINY_ABSOLUTE;
13663
13664 case AARCH64_CMODEL_SMALL_SPIC:
13665 case AARCH64_CMODEL_SMALL_PIC:
13666 if (!aarch64_symbol_binds_local_p (x))
13667 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13668 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13669 return SYMBOL_SMALL_ABSOLUTE;
13670
13671 case AARCH64_CMODEL_LARGE:
13672 /* This is alright even in PIC code as the constant
13673 pool reference is always PC relative and within
13674 the same translation unit. */
13675 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13676 return SYMBOL_SMALL_ABSOLUTE;
13677 else
13678 return SYMBOL_FORCE_TO_MEM;
13679
13680 default:
13681 gcc_unreachable ();
13682 }
13683 }
13684
13685 /* By default push everything into the constant pool. */
13686 return SYMBOL_FORCE_TO_MEM;
13687 }
13688
13689 bool
13690 aarch64_constant_address_p (rtx x)
13691 {
13692 return (CONSTANT_P (x) && memory_address_p (DImode, x));
13693 }
13694
13695 bool
13696 aarch64_legitimate_pic_operand_p (rtx x)
13697 {
13698 if (GET_CODE (x) == SYMBOL_REF
13699 || (GET_CODE (x) == CONST
13700 && GET_CODE (XEXP (x, 0)) == PLUS
13701 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13702 return false;
13703
13704 return true;
13705 }
13706
13707 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
13708 that should be rematerialized rather than spilled. */
13709
13710 static bool
13711 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13712 {
13713 /* Support CSE and rematerialization of common constants. */
13714 if (CONST_INT_P (x)
13715 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13716 || GET_CODE (x) == CONST_VECTOR)
13717 return true;
13718
13719 /* Do not allow vector struct mode constants for Advanced SIMD.
13720 We could support 0 and -1 easily, but they need support in
13721 aarch64-simd.md. */
13722 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13723 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13724 return false;
13725
13726 /* Only accept variable-length vector constants if they can be
13727 handled directly.
13728
13729 ??? It would be possible to handle rematerialization of other
13730 constants via secondary reloads. */
13731 if (vec_flags & VEC_ANY_SVE)
13732 return aarch64_simd_valid_immediate (x, NULL);
13733
13734 if (GET_CODE (x) == HIGH)
13735 x = XEXP (x, 0);
13736
13737 /* Accept polynomial constants that can be calculated by using the
13738 destination of a move as the sole temporary. Constants that
13739 require a second temporary cannot be rematerialized (they can't be
13740 forced to memory and also aren't legitimate constants). */
13741 poly_int64 offset;
13742 if (poly_int_rtx_p (x, &offset))
13743 return aarch64_offset_temporaries (false, offset) <= 1;
13744
13745 /* If an offset is being added to something else, we need to allow the
13746 base to be moved into the destination register, meaning that there
13747 are no free temporaries for the offset. */
13748 x = strip_offset (x, &offset);
13749 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13750 return false;
13751
13752 /* Do not allow const (plus (anchor_symbol, const_int)). */
13753 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13754 return false;
13755
13756 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
13757 so spilling them is better than rematerialization. */
13758 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13759 return true;
13760
13761 /* Label references are always constant. */
13762 if (GET_CODE (x) == LABEL_REF)
13763 return true;
13764
13765 return false;
13766 }
13767
13768 rtx
13769 aarch64_load_tp (rtx target)
13770 {
13771 if (!target
13772 || GET_MODE (target) != Pmode
13773 || !register_operand (target, Pmode))
13774 target = gen_reg_rtx (Pmode);
13775
13776 /* Can return in any reg. */
13777 emit_insn (gen_aarch64_load_tp_hard (target));
13778 return target;
13779 }
13780
13781 /* On AAPCS systems, this is the "struct __va_list". */
13782 static GTY(()) tree va_list_type;
13783
13784 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13785 Return the type to use as __builtin_va_list.
13786
13787 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13788
13789 struct __va_list
13790 {
13791 void *__stack;
13792 void *__gr_top;
13793 void *__vr_top;
13794 int __gr_offs;
13795 int __vr_offs;
13796 }; */
13797
13798 static tree
13799 aarch64_build_builtin_va_list (void)
13800 {
13801 tree va_list_name;
13802 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13803
13804 /* Create the type. */
13805 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13806 /* Give it the required name. */
13807 va_list_name = build_decl (BUILTINS_LOCATION,
13808 TYPE_DECL,
13809 get_identifier ("__va_list"),
13810 va_list_type);
13811 DECL_ARTIFICIAL (va_list_name) = 1;
13812 TYPE_NAME (va_list_type) = va_list_name;
13813 TYPE_STUB_DECL (va_list_type) = va_list_name;
13814
13815 /* Create the fields. */
13816 f_stack = build_decl (BUILTINS_LOCATION,
13817 FIELD_DECL, get_identifier ("__stack"),
13818 ptr_type_node);
13819 f_grtop = build_decl (BUILTINS_LOCATION,
13820 FIELD_DECL, get_identifier ("__gr_top"),
13821 ptr_type_node);
13822 f_vrtop = build_decl (BUILTINS_LOCATION,
13823 FIELD_DECL, get_identifier ("__vr_top"),
13824 ptr_type_node);
13825 f_groff = build_decl (BUILTINS_LOCATION,
13826 FIELD_DECL, get_identifier ("__gr_offs"),
13827 integer_type_node);
13828 f_vroff = build_decl (BUILTINS_LOCATION,
13829 FIELD_DECL, get_identifier ("__vr_offs"),
13830 integer_type_node);
13831
13832 /* Tell tree-stdarg pass about our internal offset fields.
13833 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
13834 purposes, to identify whether the code is updating the va_list internal
13835 offset fields in an irregular way. */
13836 va_list_gpr_counter_field = f_groff;
13837 va_list_fpr_counter_field = f_vroff;
13838
13839 DECL_ARTIFICIAL (f_stack) = 1;
13840 DECL_ARTIFICIAL (f_grtop) = 1;
13841 DECL_ARTIFICIAL (f_vrtop) = 1;
13842 DECL_ARTIFICIAL (f_groff) = 1;
13843 DECL_ARTIFICIAL (f_vroff) = 1;
13844
13845 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13846 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13847 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13848 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13849 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13850
13851 TYPE_FIELDS (va_list_type) = f_stack;
13852 DECL_CHAIN (f_stack) = f_grtop;
13853 DECL_CHAIN (f_grtop) = f_vrtop;
13854 DECL_CHAIN (f_vrtop) = f_groff;
13855 DECL_CHAIN (f_groff) = f_vroff;
13856
13857 /* Compute its layout. */
13858 layout_type (va_list_type);
13859
13860 return va_list_type;
13861 }
13862
13863 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
13864 static void
13865 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13866 {
13867 const CUMULATIVE_ARGS *cum;
13868 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13869 tree stack, grtop, vrtop, groff, vroff;
13870 tree t;
13871 int gr_save_area_size = cfun->va_list_gpr_size;
13872 int vr_save_area_size = cfun->va_list_fpr_size;
13873 int vr_offset;
13874
13875 cum = &crtl->args.info;
13876 if (cfun->va_list_gpr_size)
13877 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13878 cfun->va_list_gpr_size);
13879 if (cfun->va_list_fpr_size)
13880 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13881 * UNITS_PER_VREG, cfun->va_list_fpr_size);
13882
13883 if (!TARGET_FLOAT)
13884 {
13885 gcc_assert (cum->aapcs_nvrn == 0);
13886 vr_save_area_size = 0;
13887 }
13888
13889 f_stack = TYPE_FIELDS (va_list_type_node);
13890 f_grtop = DECL_CHAIN (f_stack);
13891 f_vrtop = DECL_CHAIN (f_grtop);
13892 f_groff = DECL_CHAIN (f_vrtop);
13893 f_vroff = DECL_CHAIN (f_groff);
13894
13895 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13896 NULL_TREE);
13897 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13898 NULL_TREE);
13899 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13900 NULL_TREE);
13901 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13902 NULL_TREE);
13903 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13904 NULL_TREE);
13905
13906 /* Emit code to initialize STACK, which points to the next varargs stack
13907 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
13908 by named arguments. STACK is 8-byte aligned. */
13909 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13910 if (cum->aapcs_stack_size > 0)
13911 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13912 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13913 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13914
13915 /* Emit code to initialize GRTOP, the top of the GR save area.
13916 virtual_incoming_args_rtx should have been 16 byte aligned. */
13917 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13918 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13919 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13920
13921 /* Emit code to initialize VRTOP, the top of the VR save area.
13922 This address is gr_save_area_bytes below GRTOP, rounded
13923 down to the next 16-byte boundary. */
13924 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13925 vr_offset = ROUND_UP (gr_save_area_size,
13926 STACK_BOUNDARY / BITS_PER_UNIT);
13927
13928 if (vr_offset)
13929 t = fold_build_pointer_plus_hwi (t, -vr_offset);
13930 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13931 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13932
13933 /* Emit code to initialize GROFF, the offset from GRTOP of the
13934 next GPR argument. */
13935 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13936 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13937 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13938
13939 /* Likewise emit code to initialize VROFF, the offset from VRTOP
13940 of the next VR argument. */
13941 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13942 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13943 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13944 }
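
/* Editorial example (hedged, not part of this file): a variadic function
   like the one below is the kind of consumer served by the expansion above;
   va_start seeds __stack, __gr_top, __vr_top, __gr_offs and __vr_offs, and
   each va_arg then draws from the matching save area or the stack.  The
   function name is illustrative only.  */

#include <stdarg.h>

static double
sum_doubles (int count, ...)
{
  va_list ap;
  double total = 0.0;
  va_start (ap, count);
  for (int i = 0; i < count; i++)
    total += va_arg (ap, double);
  va_end (ap);
  return total;
}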
13945
13946 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
13947
13948 static tree
13949 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13950 gimple_seq *post_p ATTRIBUTE_UNUSED)
13951 {
13952 tree addr;
13953 bool indirect_p;
13954 bool is_ha; /* is HFA or HVA. */
13955 bool dw_align; /* double-word align. */
13956 machine_mode ag_mode = VOIDmode;
13957 int nregs;
13958 machine_mode mode;
13959
13960 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13961 tree stack, f_top, f_off, off, arg, roundup, on_stack;
13962 HOST_WIDE_INT size, rsize, adjust, align;
13963 tree t, u, cond1, cond2;
13964
13965 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13966 if (indirect_p)
13967 type = build_pointer_type (type);
13968
13969 mode = TYPE_MODE (type);
13970
13971 f_stack = TYPE_FIELDS (va_list_type_node);
13972 f_grtop = DECL_CHAIN (f_stack);
13973 f_vrtop = DECL_CHAIN (f_grtop);
13974 f_groff = DECL_CHAIN (f_vrtop);
13975 f_vroff = DECL_CHAIN (f_groff);
13976
13977 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13978 f_stack, NULL_TREE);
13979 size = int_size_in_bytes (type);
13980
13981 bool abi_break;
13982 align
13983 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13984
13985 dw_align = false;
13986 adjust = 0;
13987 if (aarch64_vfp_is_call_or_return_candidate (mode,
13988 type,
13989 &ag_mode,
13990 &nregs,
13991 &is_ha))
13992 {
13993 /* No frontends can create types with variable-sized modes, so we
13994 shouldn't be asked to pass or return them. */
13995 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13996
13997 /* TYPE passed in fp/simd registers. */
13998 if (!TARGET_FLOAT)
13999 aarch64_err_no_fpadvsimd (mode);
14000
14001 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
14002 unshare_expr (valist), f_vrtop, NULL_TREE);
14003 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
14004 unshare_expr (valist), f_vroff, NULL_TREE);
14005
14006 rsize = nregs * UNITS_PER_VREG;
14007
14008 if (is_ha)
14009 {
14010 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
14011 adjust = UNITS_PER_VREG - ag_size;
14012 }
14013 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14014 && size < UNITS_PER_VREG)
14015 {
14016 adjust = UNITS_PER_VREG - size;
14017 }
14018 }
14019 else
14020 {
14021 /* TYPE passed in general registers. */
14022 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
14023 unshare_expr (valist), f_grtop, NULL_TREE);
14024 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
14025 unshare_expr (valist), f_groff, NULL_TREE);
14026 rsize = ROUND_UP (size, UNITS_PER_WORD);
14027 nregs = rsize / UNITS_PER_WORD;
14028
14029 if (align > 8)
14030 {
14031 if (abi_break && warn_psabi)
14032 inform (input_location, "parameter passing for argument of type "
14033 "%qT changed in GCC 9.1", type);
14034 dw_align = true;
14035 }
14036
14037 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14038 && size < UNITS_PER_WORD)
14039 {
14040 adjust = UNITS_PER_WORD - size;
14041 }
14042 }
14043
14044 /* Get a local temporary for the field value. */
14045 off = get_initialized_tmp_var (f_off, pre_p, NULL);
14046
14047 /* Emit code to branch if off >= 0. */
14048 t = build2 (GE_EXPR, boolean_type_node, off,
14049 build_int_cst (TREE_TYPE (off), 0));
14050 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
14051
14052 if (dw_align)
14053 {
14054 /* Emit: offs = (offs + 15) & -16. */
14055 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14056 build_int_cst (TREE_TYPE (off), 15));
14057 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
14058 build_int_cst (TREE_TYPE (off), -16));
14059 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
14060 }
14061 else
14062 roundup = NULL;
14063
14064 /* Update ap.__[g|v]r_offs */
14065 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
14066 build_int_cst (TREE_TYPE (off), rsize));
14067 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
14068
14069 /* String up. */
14070 if (roundup)
14071 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14072
14073 /* [cond2] if (ap.__[g|v]r_offs > 0) */
14074 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
14075 build_int_cst (TREE_TYPE (f_off), 0));
14076 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
14077
14078 /* String up: make sure the assignment happens before the use. */
14079 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
14080 COND_EXPR_ELSE (cond1) = t;
14081
14082 /* Prepare the trees handling the argument that is passed on the stack;
14083 the top-level node will be stored in ON_STACK. */
14084 arg = get_initialized_tmp_var (stack, pre_p, NULL);
14085 if (align > 8)
14086 {
14087 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
14088 t = fold_build_pointer_plus_hwi (arg, 15);
14089 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14090 build_int_cst (TREE_TYPE (t), -16));
14091 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
14092 }
14093 else
14094 roundup = NULL;
14095 /* Advance ap.__stack */
14096 t = fold_build_pointer_plus_hwi (arg, size + 7);
14097 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
14098 build_int_cst (TREE_TYPE (t), -8));
14099 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
14100 /* String up roundup and advance. */
14101 if (roundup)
14102 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
14103 /* String up with arg */
14104 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
14105 /* Big-endianness related address adjustment. */
14106 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
14107 && size < UNITS_PER_WORD)
14108 {
14109 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
14110 size_int (UNITS_PER_WORD - size));
14111 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
14112 }
14113
14114 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
14115 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
14116
14117 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
14118 t = off;
14119 if (adjust)
14120 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
14121 build_int_cst (TREE_TYPE (off), adjust));
14122
14123 t = fold_convert (sizetype, t);
14124 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
14125
14126 if (is_ha)
14127 {
14128 /* type ha; // treat as "struct {ftype field[n];}"
14129 ... [computing offs]
14130 for (i = 0; i <nregs; ++i, offs += 16)
14131 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
14132 return ha; */
14133 int i;
14134 tree tmp_ha, field_t, field_ptr_t;
14135
14136 /* Declare a local variable. */
14137 tmp_ha = create_tmp_var_raw (type, "ha");
14138 gimple_add_tmp_var (tmp_ha);
14139
14140 /* Establish the base type. */
14141 switch (ag_mode)
14142 {
14143 case E_SFmode:
14144 field_t = float_type_node;
14145 field_ptr_t = float_ptr_type_node;
14146 break;
14147 case E_DFmode:
14148 field_t = double_type_node;
14149 field_ptr_t = double_ptr_type_node;
14150 break;
14151 case E_TFmode:
14152 field_t = long_double_type_node;
14153 field_ptr_t = long_double_ptr_type_node;
14154 break;
14155 case E_HFmode:
14156 field_t = aarch64_fp16_type_node;
14157 field_ptr_t = aarch64_fp16_ptr_type_node;
14158 break;
14159 case E_V2SImode:
14160 case E_V4SImode:
14161 {
14162 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
14163 field_t = build_vector_type_for_mode (innertype, ag_mode);
14164 field_ptr_t = build_pointer_type (field_t);
14165 }
14166 break;
14167 default:
14168 gcc_assert (0);
14169 }
14170
14171 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
14172 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
14173 addr = t;
14174 t = fold_convert (field_ptr_t, addr);
14175 t = build2 (MODIFY_EXPR, field_t,
14176 build1 (INDIRECT_REF, field_t, tmp_ha),
14177 build1 (INDIRECT_REF, field_t, t));
14178
14179 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
14180 for (i = 1; i < nregs; ++i)
14181 {
14182 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
14183 u = fold_convert (field_ptr_t, addr);
14184 u = build2 (MODIFY_EXPR, field_t,
14185 build2 (MEM_REF, field_t, tmp_ha,
14186 build_int_cst (field_ptr_t,
14187 (i *
14188 int_size_in_bytes (field_t)))),
14189 build1 (INDIRECT_REF, field_t, u));
14190 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
14191 }
14192
14193 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
14194 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
14195 }
14196
14197 COND_EXPR_ELSE (cond2) = t;
14198 addr = fold_convert (build_pointer_type (type), cond1);
14199 addr = build_va_arg_indirect_ref (addr);
14200
14201 if (indirect_p)
14202 addr = build_va_arg_indirect_ref (addr);
14203
14204 return addr;
14205 }
14206
14207 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
14208
14209 static void
14210 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
14211 tree type, int *pretend_size ATTRIBUTE_UNUSED,
14212 int no_rtl)
14213 {
14214 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
14215 CUMULATIVE_ARGS local_cum;
14216 int gr_saved = cfun->va_list_gpr_size;
14217 int vr_saved = cfun->va_list_fpr_size;
14218
14219 /* The caller has advanced CUM up to, but not beyond, the last named
14220 argument. Advance a local copy of CUM past the last "real" named
14221 argument, to find out how many registers are left over. */
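/* As an illustrative example (values assumed, not taken from the sources):
   for a function declared as `int f (int x, ...)', the named X consumes only
   X0, so the advanced LOCAL_CUM records aapcs_ncrn == 1 and aapcs_nvrn == 0,
   leaving X1-X7 and Q0-Q7 as the registers that may need to be dumped to the
   register save area below. */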
14222 local_cum = *cum;
14223 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
14224
14225 /* Find out how many registers we need to save.
14226 Honor tree-stdarg analysis results. */
14227 if (cfun->va_list_gpr_size)
14228 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
14229 cfun->va_list_gpr_size / UNITS_PER_WORD);
14230 if (cfun->va_list_fpr_size)
14231 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
14232 cfun->va_list_fpr_size / UNITS_PER_VREG);
14233
14234 if (!TARGET_FLOAT)
14235 {
14236 gcc_assert (local_cum.aapcs_nvrn == 0);
14237 vr_saved = 0;
14238 }
14239
14240 if (!no_rtl)
14241 {
14242 if (gr_saved > 0)
14243 {
14244 rtx ptr, mem;
14245
14246 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
14247 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
14248 - gr_saved * UNITS_PER_WORD);
14249 mem = gen_frame_mem (BLKmode, ptr);
14250 set_mem_alias_set (mem, get_varargs_alias_set ());
14251
14252 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
14253 mem, gr_saved);
14254 }
14255 if (vr_saved > 0)
14256 {
14257 /* We can't use move_block_from_reg, because it will use
14258 the wrong mode, storing D regs only. */
14259 machine_mode mode = TImode;
14260 int off, i, vr_start;
14261
14262 /* Set OFF to the offset from virtual_incoming_args_rtx of
14263 the first vector register. The VR save area lies below
14264 the GR one, and is aligned to 16 bytes. */
14265 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
14266 STACK_BOUNDARY / BITS_PER_UNIT);
14267 off -= vr_saved * UNITS_PER_VREG;
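/* Worked example (illustrative values): with gr_saved == 3 and vr_saved == 2,
   and the usual 8-byte GPRs and 16-byte vector registers, OFF becomes
       -ROUND_UP (3 * 8, 16) - 2 * 16 = -32 - 32 = -64,
   i.e. the first saved vector register sits 64 bytes below
   virtual_incoming_args_rtx. */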
14268
14269 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
14270 for (i = 0; i < vr_saved; ++i)
14271 {
14272 rtx ptr, mem;
14273
14274 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
14275 mem = gen_frame_mem (mode, ptr);
14276 set_mem_alias_set (mem, get_varargs_alias_set ());
14277 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
14278 off += UNITS_PER_VREG;
14279 }
14280 }
14281 }
14282
14283 /* We don't save the size into *PRETEND_SIZE because we want to avoid
14284 any complication of having crtl->args.pretend_args_size changed. */
14285 cfun->machine->frame.saved_varargs_size
14286 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
14287 STACK_BOUNDARY / BITS_PER_UNIT)
14288 + vr_saved * UNITS_PER_VREG);
14289 }
14290
14291 static void
14292 aarch64_conditional_register_usage (void)
14293 {
14294 int i;
14295 if (!TARGET_FLOAT)
14296 {
14297 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
14298 {
14299 fixed_regs[i] = 1;
14300 call_used_regs[i] = 1;
14301 }
14302 }
14303 if (!TARGET_SVE)
14304 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
14305 {
14306 fixed_regs[i] = 1;
14307 call_used_regs[i] = 1;
14308 }
14309
14310 /* When tracking speculation, we need a couple of call-clobbered registers
14311 to track the speculation state. It would be nice to just use
14312 IP0 and IP1, but currently there are numerous places that just
14313 assume these registers are free for other uses (eg pointer
14314 authentication). */
14315 if (aarch64_track_speculation)
14316 {
14317 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
14318 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
14319 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14320 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
14321 }
14322 }
14323
14324 /* Walk down the type tree of TYPE counting consecutive base elements.
14325 If *MODEP is VOIDmode, then set it to the first valid floating point
14326 type. If a non-floating point type is found, or if a floating point
14327 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
14328 otherwise return the count in the sub-tree. */
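/* Illustrative examples (assumed, not part of the original comment): for
   struct { float x, y, z; } the three SFmode fields give a count of 3 with
   *MODEP == SFmode, so the struct is a candidate homogeneous aggregate; for
   struct { float f; double d; } the DFmode field fails to match the SFmode
   already recorded in *MODEP, so the function returns -1. */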
14329 static int
14330 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
14331 {
14332 machine_mode mode;
14333 HOST_WIDE_INT size;
14334
14335 switch (TREE_CODE (type))
14336 {
14337 case REAL_TYPE:
14338 mode = TYPE_MODE (type);
14339 if (mode != DFmode && mode != SFmode
14340 && mode != TFmode && mode != HFmode)
14341 return -1;
14342
14343 if (*modep == VOIDmode)
14344 *modep = mode;
14345
14346 if (*modep == mode)
14347 return 1;
14348
14349 break;
14350
14351 case COMPLEX_TYPE:
14352 mode = TYPE_MODE (TREE_TYPE (type));
14353 if (mode != DFmode && mode != SFmode
14354 && mode != TFmode && mode != HFmode)
14355 return -1;
14356
14357 if (*modep == VOIDmode)
14358 *modep = mode;
14359
14360 if (*modep == mode)
14361 return 2;
14362
14363 break;
14364
14365 case VECTOR_TYPE:
14366 /* Use V2SImode and V4SImode as representatives of all 64-bit
14367 and 128-bit vector types. */
14368 size = int_size_in_bytes (type);
14369 switch (size)
14370 {
14371 case 8:
14372 mode = V2SImode;
14373 break;
14374 case 16:
14375 mode = V4SImode;
14376 break;
14377 default:
14378 return -1;
14379 }
14380
14381 if (*modep == VOIDmode)
14382 *modep = mode;
14383
14384 /* Vector modes are considered to be opaque: two vectors are
14385 equivalent for the purposes of being homogeneous aggregates
14386 if they are the same size. */
14387 if (*modep == mode)
14388 return 1;
14389
14390 break;
14391
14392 case ARRAY_TYPE:
14393 {
14394 int count;
14395 tree index = TYPE_DOMAIN (type);
14396
14397 /* Can't handle incomplete types nor sizes that are not
14398 fixed. */
14399 if (!COMPLETE_TYPE_P (type)
14400 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14401 return -1;
14402
14403 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
14404 if (count == -1
14405 || !index
14406 || !TYPE_MAX_VALUE (index)
14407 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
14408 || !TYPE_MIN_VALUE (index)
14409 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
14410 || count < 0)
14411 return -1;
14412
14413 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
14414 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
14415
14416 /* There must be no padding. */
14417 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14418 count * GET_MODE_BITSIZE (*modep)))
14419 return -1;
14420
14421 return count;
14422 }
14423
14424 case RECORD_TYPE:
14425 {
14426 int count = 0;
14427 int sub_count;
14428 tree field;
14429
14430 /* Can't handle incomplete types nor sizes that are not
14431 fixed. */
14432 if (!COMPLETE_TYPE_P (type)
14433 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14434 return -1;
14435
14436 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14437 {
14438 if (TREE_CODE (field) != FIELD_DECL)
14439 continue;
14440
14441 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14442 if (sub_count < 0)
14443 return -1;
14444 count += sub_count;
14445 }
14446
14447 /* There must be no padding. */
14448 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14449 count * GET_MODE_BITSIZE (*modep)))
14450 return -1;
14451
14452 return count;
14453 }
14454
14455 case UNION_TYPE:
14456 case QUAL_UNION_TYPE:
14457 {
14458 /* These aren't very interesting except in a degenerate case. */
14459 int count = 0;
14460 int sub_count;
14461 tree field;
14462
14463 /* Can't handle incomplete types nor sizes that are not
14464 fixed. */
14465 if (!COMPLETE_TYPE_P (type)
14466 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14467 return -1;
14468
14469 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
14470 {
14471 if (TREE_CODE (field) != FIELD_DECL)
14472 continue;
14473
14474 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
14475 if (sub_count < 0)
14476 return -1;
14477 count = count > sub_count ? count : sub_count;
14478 }
14479
14480 /* There must be no padding. */
14481 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
14482 count * GET_MODE_BITSIZE (*modep)))
14483 return -1;
14484
14485 return count;
14486 }
14487
14488 default:
14489 break;
14490 }
14491
14492 return -1;
14493 }
14494
14495 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
14496 type as described in AAPCS64 \S 4.1.2.
14497
14498 See the comment above aarch64_composite_type_p for the notes on MODE. */
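/* For example (a sketch of typical cases): the Advanced SIMD types
   int32x2_t (8 bytes) and float32x4_t (16 bytes) are short vectors in this
   sense, whereas a 32-byte generic GCC vector is not. */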
14499
14500 static bool
14501 aarch64_short_vector_p (const_tree type,
14502 machine_mode mode)
14503 {
14504 poly_int64 size = -1;
14505
14506 if (type && TREE_CODE (type) == VECTOR_TYPE)
14507 size = int_size_in_bytes (type);
14508 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
14509 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
14510 size = GET_MODE_SIZE (mode);
14511
14512 return known_eq (size, 8) || known_eq (size, 16);
14513 }
14514
14515 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
14516 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
14517 array types. The C99 floating-point complex types are also considered
14518 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
14519 types, which are GCC extensions and out of the scope of AAPCS64, are
14520 treated as composite types here as well.
14521
14522 Note that MODE itself is not sufficient in determining whether a type
14523 is such a composite type or not. This is because
14524 stor-layout.c:compute_record_mode may have already changed the MODE
14525 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
14526 structure with only one field may have its MODE set to the mode of the
14527 field. Also an integer mode whose size matches the size of the
14528 RECORD_TYPE type may be used to substitute the original mode
14529 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
14530 solely relied on. */
14531
14532 static bool
14533 aarch64_composite_type_p (const_tree type,
14534 machine_mode mode)
14535 {
14536 if (aarch64_short_vector_p (type, mode))
14537 return false;
14538
14539 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
14540 return true;
14541
14542 if (mode == BLKmode
14543 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
14544 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
14545 return true;
14546
14547 return false;
14548 }
14549
14550 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
14551 shall be passed or returned in simd/fp register(s) (providing these
14552 parameter passing registers are available).
14553
14554 Upon successful return, *COUNT returns the number of needed registers,
14555 *BASE_MODE returns the mode of the individual register and when IS_HA
14556 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
14557 floating-point aggregate or a homogeneous short-vector aggregate. */
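/* Illustrative examples (assumed, not exhaustive): for
   struct { double x, y; } the function returns true with *COUNT == 2,
   *BASE_MODE == DFmode and *IS_HA set; for _Complex float it returns true
   with *COUNT == 2 and *BASE_MODE == SFmode via the MODE_COMPLEX_FLOAT case
   below. */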
14558
14559 static bool
14560 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
14561 const_tree type,
14562 machine_mode *base_mode,
14563 int *count,
14564 bool *is_ha)
14565 {
14566 machine_mode new_mode = VOIDmode;
14567 bool composite_p = aarch64_composite_type_p (type, mode);
14568
14569 if (is_ha != NULL) *is_ha = false;
14570
14571 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
14572 || aarch64_short_vector_p (type, mode))
14573 {
14574 *count = 1;
14575 new_mode = mode;
14576 }
14577 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
14578 {
14579 if (is_ha != NULL) *is_ha = true;
14580 *count = 2;
14581 new_mode = GET_MODE_INNER (mode);
14582 }
14583 else if (type && composite_p)
14584 {
14585 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14586
14587 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14588 {
14589 if (is_ha != NULL) *is_ha = true;
14590 *count = ag_count;
14591 }
14592 else
14593 return false;
14594 }
14595 else
14596 return false;
14597
14598 *base_mode = new_mode;
14599 return true;
14600 }
14601
14602 /* Implement TARGET_STRUCT_VALUE_RTX. */
14603
14604 static rtx
14605 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14606 int incoming ATTRIBUTE_UNUSED)
14607 {
14608 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14609 }
14610
14611 /* Implements target hook vector_mode_supported_p. */
14612 static bool
14613 aarch64_vector_mode_supported_p (machine_mode mode)
14614 {
14615 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14616 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14617 }
14618
14619 /* Return the full-width SVE vector mode for element mode MODE, if one
14620 exists. */
14621 opt_machine_mode
14622 aarch64_full_sve_mode (scalar_mode mode)
14623 {
14624 switch (mode)
14625 {
14626 case E_DFmode:
14627 return VNx2DFmode;
14628 case E_SFmode:
14629 return VNx4SFmode;
14630 case E_HFmode:
14631 return VNx8HFmode;
14632 case E_DImode:
14633 return VNx2DImode;
14634 case E_SImode:
14635 return VNx4SImode;
14636 case E_HImode:
14637 return VNx8HImode;
14638 case E_QImode:
14639 return VNx16QImode;
14640 default:
14641 return opt_machine_mode ();
14642 }
14643 }
14644
14645 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
14646 if it exists. */
14647 opt_machine_mode
14648 aarch64_vq_mode (scalar_mode mode)
14649 {
14650 switch (mode)
14651 {
14652 case E_DFmode:
14653 return V2DFmode;
14654 case E_SFmode:
14655 return V4SFmode;
14656 case E_HFmode:
14657 return V8HFmode;
14658 case E_SImode:
14659 return V4SImode;
14660 case E_HImode:
14661 return V8HImode;
14662 case E_QImode:
14663 return V16QImode;
14664 case E_DImode:
14665 return V2DImode;
14666 default:
14667 return opt_machine_mode ();
14668 }
14669 }
14670
14671 /* Return appropriate SIMD container
14672 for MODE within a vector of WIDTH bits. */
14673 static machine_mode
14674 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14675 {
14676 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14677 return aarch64_full_sve_mode (mode).else_mode (word_mode);
14678
14679 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14680 if (TARGET_SIMD)
14681 {
14682 if (known_eq (width, 128))
14683 return aarch64_vq_mode (mode).else_mode (word_mode);
14684 else
14685 switch (mode)
14686 {
14687 case E_SFmode:
14688 return V2SFmode;
14689 case E_HFmode:
14690 return V4HFmode;
14691 case E_SImode:
14692 return V2SImode;
14693 case E_HImode:
14694 return V4HImode;
14695 case E_QImode:
14696 return V8QImode;
14697 default:
14698 break;
14699 }
14700 }
14701 return word_mode;
14702 }
14703
14704 /* Return 128-bit container as the preferred SIMD mode for MODE. */
14705 static machine_mode
14706 aarch64_preferred_simd_mode (scalar_mode mode)
14707 {
14708 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14709 return aarch64_simd_container_mode (mode, bits);
14710 }
14711
14712 /* Return a list of possible vector sizes for the vectorizer
14713 to iterate over. */
14714 static void
14715 aarch64_autovectorize_vector_sizes (vector_sizes *sizes, bool)
14716 {
14717 if (TARGET_SVE)
14718 sizes->safe_push (BYTES_PER_SVE_VECTOR);
14719 sizes->safe_push (16);
14720 sizes->safe_push (8);
14721 }
14722
14723 /* Implement TARGET_MANGLE_TYPE. */
14724
14725 static const char *
14726 aarch64_mangle_type (const_tree type)
14727 {
14728 /* The AArch64 ABI documents say that "__va_list" has to be
14729 mangled as if it is in the "std" namespace. */
14730 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14731 return "St9__va_list";
14732
14733 /* Half-precision float. */
14734 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14735 return "Dh";
14736
14737 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
14738 builtin types. */
14739 if (TYPE_NAME (type) != NULL)
14740 return aarch64_mangle_builtin_type (type);
14741
14742 /* Use the default mangling. */
14743 return NULL;
14744 }
14745
14746 /* Find the first rtx_insn before insn that will generate an assembly
14747 instruction. */
14748
14749 static rtx_insn *
14750 aarch64_prev_real_insn (rtx_insn *insn)
14751 {
14752 if (!insn)
14753 return NULL;
14754
14755 do
14756 {
14757 insn = prev_real_insn (insn);
14758 }
14759 while (insn && recog_memoized (insn) < 0);
14760
14761 return insn;
14762 }
14763
14764 static bool
14765 is_madd_op (enum attr_type t1)
14766 {
14767 unsigned int i;
14768 /* A number of these may be AArch32 only. */
14769 enum attr_type mlatypes[] = {
14770 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14771 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14772 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14773 };
14774
14775 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14776 {
14777 if (t1 == mlatypes[i])
14778 return true;
14779 }
14780
14781 return false;
14782 }
14783
14784 /* Check if there is a register dependency between a load and the insn
14785 for which we hold recog_data. */
14786
14787 static bool
14788 dep_between_memop_and_curr (rtx memop)
14789 {
14790 rtx load_reg;
14791 int opno;
14792
14793 gcc_assert (GET_CODE (memop) == SET);
14794
14795 if (!REG_P (SET_DEST (memop)))
14796 return false;
14797
14798 load_reg = SET_DEST (memop);
14799 for (opno = 1; opno < recog_data.n_operands; opno++)
14800 {
14801 rtx operand = recog_data.operand[opno];
14802 if (REG_P (operand)
14803 && reg_overlap_mentioned_p (load_reg, operand))
14804 return true;
14805
14806 }
14807 return false;
14808 }
14809
14810
14811 /* When working around the Cortex-A53 erratum 835769,
14812 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14813 instruction and has a preceding memory instruction such that a NOP
14814 should be inserted between them. */
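/* For example (an illustrative sketch of the affected sequence), code such as

       ldr  x0, [x2]
       madd x3, x4, x5, x6

   would get a NOP emitted between the load and the multiply-accumulate when
   TARGET_FIX_ERR_A53_835769 is enabled. */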
14815
14816 bool
14817 aarch64_madd_needs_nop (rtx_insn* insn)
14818 {
14819 enum attr_type attr_type;
14820 rtx_insn *prev;
14821 rtx body;
14822
14823 if (!TARGET_FIX_ERR_A53_835769)
14824 return false;
14825
14826 if (!INSN_P (insn) || recog_memoized (insn) < 0)
14827 return false;
14828
14829 attr_type = get_attr_type (insn);
14830 if (!is_madd_op (attr_type))
14831 return false;
14832
14833 prev = aarch64_prev_real_insn (insn);
14834 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14835 Restore recog state to INSN to avoid state corruption. */
14836 extract_constrain_insn_cached (insn);
14837
14838 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14839 return false;
14840
14841 body = single_set (prev);
14842
14843 /* If the previous insn is a memory op and there is no dependency between
14844 it and the DImode madd, emit a NOP between them. If body is NULL then we
14845 have a complex memory operation, probably a load/store pair.
14846 Be conservative for now and emit a NOP. */
14847 if (GET_MODE (recog_data.operand[0]) == DImode
14848 && (!body || !dep_between_memop_and_curr (body)))
14849 return true;
14850
14851 return false;
14852
14853 }
14854
14855
14856 /* Implement FINAL_PRESCAN_INSN. */
14857
14858 void
14859 aarch64_final_prescan_insn (rtx_insn *insn)
14860 {
14861 if (aarch64_madd_needs_nop (insn))
14862 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14863 }
14864
14865
14866 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14867 instruction. */
14868
14869 bool
14870 aarch64_sve_index_immediate_p (rtx base_or_step)
14871 {
14872 return (CONST_INT_P (base_or_step)
14873 && IN_RANGE (INTVAL (base_or_step), -16, 15));
14874 }
14875
14876 /* Return true if X is a valid immediate for the SVE ADD and SUB
14877 instructions. Negate X first if NEGATE_P is true. */
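/* The encoding accepts an unsigned 8-bit value, optionally shifted left by
   eight bits.  For example (illustrative values): 7 and 0x2300 are valid
   immediates, while 0x101 is not, because its low byte is nonzero and the
   value does not fit in eight bits. */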
14878
14879 bool
14880 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14881 {
14882 rtx elt;
14883
14884 if (!const_vec_duplicate_p (x, &elt)
14885 || !CONST_INT_P (elt))
14886 return false;
14887
14888 HOST_WIDE_INT val = INTVAL (elt);
14889 if (negate_p)
14890 val = -val;
14891 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14892
14893 if (val & 0xff)
14894 return IN_RANGE (val, 0, 0xff);
14895 return IN_RANGE (val, 0, 0xff00);
14896 }
14897
14898 /* Return true if X is a valid immediate operand for an SVE logical
14899 instruction such as AND. */
14900
14901 bool
14902 aarch64_sve_bitmask_immediate_p (rtx x)
14903 {
14904 rtx elt;
14905
14906 return (const_vec_duplicate_p (x, &elt)
14907 && CONST_INT_P (elt)
14908 && aarch64_bitmask_imm (INTVAL (elt),
14909 GET_MODE_INNER (GET_MODE (x))));
14910 }
14911
14912 /* Return true if X is a valid immediate for the SVE DUP and CPY
14913 instructions. */
14914
14915 bool
14916 aarch64_sve_dup_immediate_p (rtx x)
14917 {
14918 rtx elt;
14919
14920 if (!const_vec_duplicate_p (x, &elt)
14921 || !CONST_INT_P (elt))
14922 return false;
14923
14924 HOST_WIDE_INT val = INTVAL (elt);
14925 if (val & 0xff)
14926 return IN_RANGE (val, -0x80, 0x7f);
14927 return IN_RANGE (val, -0x8000, 0x7f00);
14928 }
14929
14930 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14931 SIGNED_P says whether the operand is signed rather than unsigned. */
14932
14933 bool
14934 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14935 {
14936 rtx elt;
14937
14938 return (const_vec_duplicate_p (x, &elt)
14939 && CONST_INT_P (elt)
14940 && (signed_p
14941 ? IN_RANGE (INTVAL (elt), -16, 15)
14942 : IN_RANGE (INTVAL (elt), 0, 127)));
14943 }
14944
14945 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14946 instruction. Negate X first if NEGATE_P is true. */
14947
14948 bool
14949 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14950 {
14951 rtx elt;
14952 REAL_VALUE_TYPE r;
14953
14954 if (!const_vec_duplicate_p (x, &elt)
14955 || GET_CODE (elt) != CONST_DOUBLE)
14956 return false;
14957
14958 r = *CONST_DOUBLE_REAL_VALUE (elt);
14959
14960 if (negate_p)
14961 r = real_value_negate (&r);
14962
14963 if (real_equal (&r, &dconst1))
14964 return true;
14965 if (real_equal (&r, &dconsthalf))
14966 return true;
14967 return false;
14968 }
14969
14970 /* Return true if X is a valid immediate operand for an SVE FMUL
14971 instruction. */
14972
14973 bool
14974 aarch64_sve_float_mul_immediate_p (rtx x)
14975 {
14976 rtx elt;
14977
14978 /* GCC will never generate a multiply with an immediate of 2, so there is no
14979 point testing for it (even though it is a valid constant). */
14980 return (const_vec_duplicate_p (x, &elt)
14981 && GET_CODE (elt) == CONST_DOUBLE
14982 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14983 }
14984
14985 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14986 for the Advanced SIMD operation described by WHICH and INSN. If INFO
14987 is nonnull, use it to describe valid immediates. */
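/* Worked examples (illustrative values): VAL32 == 0x00ab0000 is matched by
   the 4-byte case below as the byte 0xab with LSL #16, while
   VAL32 == 0x0000abff is matched by the MSL case as the byte 0xab with
   MSL #8 (the bits below the shift are all ones). */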
14988 static bool
14989 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14990 simd_immediate_info *info,
14991 enum simd_immediate_check which,
14992 simd_immediate_info::insn_type insn)
14993 {
14994 /* Try a 4-byte immediate with LSL. */
14995 for (unsigned int shift = 0; shift < 32; shift += 8)
14996 if ((val32 & (0xff << shift)) == val32)
14997 {
14998 if (info)
14999 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15000 simd_immediate_info::LSL, shift);
15001 return true;
15002 }
15003
15004 /* Try a 2-byte immediate with LSL. */
15005 unsigned int imm16 = val32 & 0xffff;
15006 if (imm16 == (val32 >> 16))
15007 for (unsigned int shift = 0; shift < 16; shift += 8)
15008 if ((imm16 & (0xff << shift)) == imm16)
15009 {
15010 if (info)
15011 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
15012 simd_immediate_info::LSL, shift);
15013 return true;
15014 }
15015
15016 /* Try a 4-byte immediate with MSL, except for cases that MVN
15017 can handle. */
15018 if (which == AARCH64_CHECK_MOV)
15019 for (unsigned int shift = 8; shift < 24; shift += 8)
15020 {
15021 unsigned int low = (1 << shift) - 1;
15022 if (((val32 & (0xff << shift)) | low) == val32)
15023 {
15024 if (info)
15025 *info = simd_immediate_info (SImode, val32 >> shift, insn,
15026 simd_immediate_info::MSL, shift);
15027 return true;
15028 }
15029 }
15030
15031 return false;
15032 }
15033
15034 /* Return true if replicating VAL64 is a valid immediate for the
15035 Advanced SIMD operation described by WHICH. If INFO is nonnull,
15036 use it to describe valid immediates. */
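/* As an illustrative example of the "bit-to-bytemask" case handled below:
   VAL64 == 0xff0000ff00ffff00 has every byte equal to 0x00 or 0xff, so it
   can be built with the 64-bit byte-mask immediate encoding even though it
   is not a replicated 32-bit value. */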
15037 static bool
15038 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
15039 simd_immediate_info *info,
15040 enum simd_immediate_check which)
15041 {
15042 unsigned int val32 = val64 & 0xffffffff;
15043 unsigned int val16 = val64 & 0xffff;
15044 unsigned int val8 = val64 & 0xff;
15045
15046 if (val32 == (val64 >> 32))
15047 {
15048 if ((which & AARCH64_CHECK_ORR) != 0
15049 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
15050 simd_immediate_info::MOV))
15051 return true;
15052
15053 if ((which & AARCH64_CHECK_BIC) != 0
15054 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
15055 simd_immediate_info::MVN))
15056 return true;
15057
15058 /* Try using a replicated byte. */
15059 if (which == AARCH64_CHECK_MOV
15060 && val16 == (val32 >> 16)
15061 && val8 == (val16 >> 8))
15062 {
15063 if (info)
15064 *info = simd_immediate_info (QImode, val8);
15065 return true;
15066 }
15067 }
15068
15069 /* Try using a bit-to-bytemask. */
15070 if (which == AARCH64_CHECK_MOV)
15071 {
15072 unsigned int i;
15073 for (i = 0; i < 64; i += 8)
15074 {
15075 unsigned char byte = (val64 >> i) & 0xff;
15076 if (byte != 0 && byte != 0xff)
15077 break;
15078 }
15079 if (i == 64)
15080 {
15081 if (info)
15082 *info = simd_immediate_info (DImode, val64);
15083 return true;
15084 }
15085 }
15086 return false;
15087 }
15088
15089 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
15090 instruction. If INFO is nonnull, use it to describe valid immediates. */
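/* Illustrative examples (assumed values): 0x0101010101010101 repeats at byte
   granularity and is loaded as a byte DUP of #1; 0x00ff00ff00ff00ff repeats
   at halfword granularity, is out of range for DUP, but should be matched by
   the DUPM (bitmask) case below. */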
15091
15092 static bool
15093 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
15094 simd_immediate_info *info)
15095 {
15096 scalar_int_mode mode = DImode;
15097 unsigned int val32 = val64 & 0xffffffff;
15098 if (val32 == (val64 >> 32))
15099 {
15100 mode = SImode;
15101 unsigned int val16 = val32 & 0xffff;
15102 if (val16 == (val32 >> 16))
15103 {
15104 mode = HImode;
15105 unsigned int val8 = val16 & 0xff;
15106 if (val8 == (val16 >> 8))
15107 mode = QImode;
15108 }
15109 }
15110 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
15111 if (IN_RANGE (val, -0x80, 0x7f))
15112 {
15113 /* DUP with no shift. */
15114 if (info)
15115 *info = simd_immediate_info (mode, val);
15116 return true;
15117 }
15118 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
15119 {
15120 /* DUP with LSL #8. */
15121 if (info)
15122 *info = simd_immediate_info (mode, val);
15123 return true;
15124 }
15125 if (aarch64_bitmask_imm (val64, mode))
15126 {
15127 /* DUPM. */
15128 if (info)
15129 *info = simd_immediate_info (mode, val);
15130 return true;
15131 }
15132 return false;
15133 }
15134
15135 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
15136 it to describe valid immediates. */
15137
15138 static bool
15139 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
15140 {
15141 if (x == CONST0_RTX (GET_MODE (x)))
15142 {
15143 if (info)
15144 *info = simd_immediate_info (DImode, 0);
15145 return true;
15146 }
15147
15148 /* Analyze the value as a VNx16BImode. This should be relatively
15149 efficient, since rtx_vector_builder has enough built-in capacity
15150 to store all VLA predicate constants without needing the heap. */
15151 rtx_vector_builder builder;
15152 if (!aarch64_get_sve_pred_bits (builder, x))
15153 return false;
15154
15155 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
15156 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
15157 {
15158 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
15159 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
15160 if (pattern != AARCH64_NUM_SVPATTERNS)
15161 {
15162 if (info)
15163 {
15164 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
15165 *info = simd_immediate_info (int_mode, pattern);
15166 }
15167 return true;
15168 }
15169 }
15170 return false;
15171 }
15172
15173 /* Return true if OP is a valid SIMD immediate for the operation
15174 described by WHICH. If INFO is nonnull, use it to describe valid
15175 immediates. */
15176 bool
15177 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
15178 enum simd_immediate_check which)
15179 {
15180 machine_mode mode = GET_MODE (op);
15181 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15182 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15183 return false;
15184
15185 if (vec_flags & VEC_SVE_PRED)
15186 return aarch64_sve_pred_valid_immediate (op, info);
15187
15188 scalar_mode elt_mode = GET_MODE_INNER (mode);
15189 rtx base, step;
15190 unsigned int n_elts;
15191 if (GET_CODE (op) == CONST_VECTOR
15192 && CONST_VECTOR_DUPLICATE_P (op))
15193 n_elts = CONST_VECTOR_NPATTERNS (op);
15194 else if ((vec_flags & VEC_SVE_DATA)
15195 && const_vec_series_p (op, &base, &step))
15196 {
15197 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
15198 if (!aarch64_sve_index_immediate_p (base)
15199 || !aarch64_sve_index_immediate_p (step))
15200 return false;
15201
15202 if (info)
15203 *info = simd_immediate_info (elt_mode, base, step);
15204 return true;
15205 }
15206 else if (GET_CODE (op) == CONST_VECTOR
15207 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
15208 /* N_ELTS set above. */;
15209 else
15210 return false;
15211
15212 scalar_float_mode elt_float_mode;
15213 if (n_elts == 1
15214 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
15215 {
15216 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
15217 if (aarch64_float_const_zero_rtx_p (elt)
15218 || aarch64_float_const_representable_p (elt))
15219 {
15220 if (info)
15221 *info = simd_immediate_info (elt_float_mode, elt);
15222 return true;
15223 }
15224 }
15225
15226 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
15227 if (elt_size > 8)
15228 return false;
15229
15230 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
15231
15232 /* Expand the vector constant out into a byte vector, with the least
15233 significant byte of the register first. */
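/* For example (little-endian, values assumed): the V4HImode constant
   { 1, 2, 3, 4 } expands to the byte vector
   { 01, 00, 02, 00, 03, 00, 04, 00 }, which repeats within eight bytes and
   packs into VAL64 == 0x0004000300020001 below. */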
15234 auto_vec<unsigned char, 16> bytes;
15235 bytes.reserve (n_elts * elt_size);
15236 for (unsigned int i = 0; i < n_elts; i++)
15237 {
15238 /* The vector is provided in gcc endian-neutral fashion.
15239 For aarch64_be Advanced SIMD, it must be laid out in the vector
15240 register in reverse order. */
15241 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
15242 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
15243
15244 if (elt_mode != elt_int_mode)
15245 elt = gen_lowpart (elt_int_mode, elt);
15246
15247 if (!CONST_INT_P (elt))
15248 return false;
15249
15250 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
15251 for (unsigned int byte = 0; byte < elt_size; byte++)
15252 {
15253 bytes.quick_push (elt_val & 0xff);
15254 elt_val >>= BITS_PER_UNIT;
15255 }
15256 }
15257
15258 /* The immediate must repeat every eight bytes. */
15259 unsigned int nbytes = bytes.length ();
15260 for (unsigned i = 8; i < nbytes; ++i)
15261 if (bytes[i] != bytes[i - 8])
15262 return false;
15263
15264 /* Get the repeating 8-byte value as an integer. No endian correction
15265 is needed here because bytes is already in lsb-first order. */
15266 unsigned HOST_WIDE_INT val64 = 0;
15267 for (unsigned int i = 0; i < 8; i++)
15268 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
15269 << (i * BITS_PER_UNIT));
15270
15271 if (vec_flags & VEC_SVE_DATA)
15272 return aarch64_sve_valid_immediate (val64, info);
15273 else
15274 return aarch64_advsimd_valid_immediate (val64, info, which);
15275 }
15276
15277 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
15278 has a step in the range of INDEX. Return the index expression if so,
15279 otherwise return null. */
15280 rtx
15281 aarch64_check_zero_based_sve_index_immediate (rtx x)
15282 {
15283 rtx base, step;
15284 if (const_vec_series_p (x, &base, &step)
15285 && base == const0_rtx
15286 && aarch64_sve_index_immediate_p (step))
15287 return step;
15288 return NULL_RTX;
15289 }
15290
15291 /* Check if immediate shift constants are within range. */
15292 bool
15293 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
15294 {
15295 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
15296 if (left)
15297 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
15298 else
15299 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
15300 }
15301
15302 /* Return the bitmask CONST_INT to select the bits required by a zero extract
15303 operation of width WIDTH at bit position POS. */
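/* Worked example (illustrative values): for WIDTH == 8 and POS == 16 the
   result is ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. a mask selecting bits
   16 to 23. */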
15304
15305 rtx
15306 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
15307 {
15308 gcc_assert (CONST_INT_P (width));
15309 gcc_assert (CONST_INT_P (pos));
15310
15311 unsigned HOST_WIDE_INT mask
15312 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
15313 return GEN_INT (mask << UINTVAL (pos));
15314 }
15315
15316 bool
15317 aarch64_mov_operand_p (rtx x, machine_mode mode)
15318 {
15319 if (GET_CODE (x) == HIGH
15320 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
15321 return true;
15322
15323 if (CONST_INT_P (x))
15324 return true;
15325
15326 if (VECTOR_MODE_P (GET_MODE (x)))
15327 {
15328 /* Require predicate constants to be VNx16BI before RA, so that we
15329 force everything to have a canonical form. */
15330 if (!lra_in_progress
15331 && !reload_completed
15332 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
15333 && GET_MODE (x) != VNx16BImode)
15334 return false;
15335
15336 return aarch64_simd_valid_immediate (x, NULL);
15337 }
15338
15339 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
15340 return true;
15341
15342 if (aarch64_sve_cnt_immediate_p (x))
15343 return true;
15344
15345 return aarch64_classify_symbolic_expression (x)
15346 == SYMBOL_TINY_ABSOLUTE;
15347 }
15348
15349 /* Return a const_int vector of VAL. */
15350 rtx
15351 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
15352 {
15353 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
15354 return gen_const_vec_duplicate (mode, c);
15355 }
15356
15357 /* Check OP is a legal scalar immediate for the MOVI instruction. */
15358
15359 bool
15360 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
15361 {
15362 machine_mode vmode;
15363
15364 vmode = aarch64_simd_container_mode (mode, 64);
15365 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
15366 return aarch64_simd_valid_immediate (op_v, NULL);
15367 }
15368
15369 /* Construct and return a PARALLEL RTX vector with elements numbering the
15370 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
15371 the vector - from the perspective of the architecture. This does not
15372 line up with GCC's perspective on lane numbers, so we end up with
15373 different masks depending on our target endian-ness. The diagram
15374 below may help. We must draw the distinction when building masks
15375 which select one half of the vector. An instruction selecting
15376 architectural low-lanes for a big-endian target, must be described using
15377 a mask selecting GCC high-lanes.
15378
15379 Big-Endian Little-Endian
15380
15381 GCC 0 1 2 3 3 2 1 0
15382 | x | x | x | x | | x | x | x | x |
15383 Architecture 3 2 1 0 3 2 1 0
15384
15385 Low Mask: { 2, 3 } { 0, 1 }
15386 High Mask: { 0, 1 } { 2, 3 }
15387
15388 MODE is the mode of the vector and NUNITS is the number of units in it. */
15389
15390 rtx
15391 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
15392 {
15393 rtvec v = rtvec_alloc (nunits / 2);
15394 int high_base = nunits / 2;
15395 int low_base = 0;
15396 int base;
15397 rtx t1;
15398 int i;
15399
15400 if (BYTES_BIG_ENDIAN)
15401 base = high ? low_base : high_base;
15402 else
15403 base = high ? high_base : low_base;
15404
15405 for (i = 0; i < nunits / 2; i++)
15406 RTVEC_ELT (v, i) = GEN_INT (base + i);
15407
15408 t1 = gen_rtx_PARALLEL (mode, v);
15409 return t1;
15410 }
15411
15412 /* Check OP for validity as a PARALLEL RTX vector with elements
15413 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
15414 from the perspective of the architecture. See the diagram above
15415 aarch64_simd_vect_par_cnst_half for more details. */
15416
15417 bool
15418 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
15419 bool high)
15420 {
15421 int nelts;
15422 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
15423 return false;
15424
15425 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
15426 HOST_WIDE_INT count_op = XVECLEN (op, 0);
15427 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
15428 int i = 0;
15429
15430 if (count_op != count_ideal)
15431 return false;
15432
15433 for (i = 0; i < count_ideal; i++)
15434 {
15435 rtx elt_op = XVECEXP (op, 0, i);
15436 rtx elt_ideal = XVECEXP (ideal, 0, i);
15437
15438 if (!CONST_INT_P (elt_op)
15439 || INTVAL (elt_ideal) != INTVAL (elt_op))
15440 return false;
15441 }
15442 return true;
15443 }
15444
15445 /* Return a PARALLEL containing NELTS elements, with element I equal
15446 to BASE + I * STEP. */
15447
15448 rtx
15449 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
15450 {
15451 rtvec vec = rtvec_alloc (nelts);
15452 for (unsigned int i = 0; i < nelts; ++i)
15453 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
15454 return gen_rtx_PARALLEL (VOIDmode, vec);
15455 }
15456
15457 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
15458 series with step STEP. */
15459
15460 bool
15461 aarch64_stepped_int_parallel_p (rtx op, int step)
15462 {
15463 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
15464 return false;
15465
15466 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
15467 for (int i = 1; i < XVECLEN (op, 0); ++i)
15468 if (!CONST_INT_P (XVECEXP (op, 0, i))
15469 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
15470 return false;
15471
15472 return true;
15473 }
15474
15475 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
15476 HIGH (exclusive). */
15477 void
15478 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
15479 const_tree exp)
15480 {
15481 HOST_WIDE_INT lane;
15482 gcc_assert (CONST_INT_P (operand));
15483 lane = INTVAL (operand);
15484
15485 if (lane < low || lane >= high)
15486 {
15487 if (exp)
15488 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
15489 else
15490 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
15491 }
15492 }
15493
15494 /* Perform endian correction on lane number N, which indexes a vector
15495 of mode MODE, and return the result as an SImode rtx. */
15496
15497 rtx
15498 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
15499 {
15500 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
15501 }
15502
15503 /* Return TRUE if OP is a valid vector addressing mode. */
15504
15505 bool
15506 aarch64_simd_mem_operand_p (rtx op)
15507 {
15508 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
15509 || REG_P (XEXP (op, 0)));
15510 }
15511
15512 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
15513
15514 bool
15515 aarch64_sve_ld1r_operand_p (rtx op)
15516 {
15517 struct aarch64_address_info addr;
15518 scalar_mode mode;
15519
15520 return (MEM_P (op)
15521 && is_a <scalar_mode> (GET_MODE (op), &mode)
15522 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
15523 && addr.type == ADDRESS_REG_IMM
15524 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
15525 }
15526
15527 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
15528 bool
15529 aarch64_sve_ld1rq_operand_p (rtx op)
15530 {
15531 struct aarch64_address_info addr;
15532 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
15533 if (!MEM_P (op)
15534 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
15535 return false;
15536
15537 if (addr.type == ADDRESS_REG_IMM)
15538 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
15539
15540 if (addr.type == ADDRESS_REG_REG)
15541 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
15542
15543 return false;
15544 }
15545
15546 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
15547 The conditions for STR are the same. */
15548 bool
15549 aarch64_sve_ldr_operand_p (rtx op)
15550 {
15551 struct aarch64_address_info addr;
15552
15553 return (MEM_P (op)
15554 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
15555 false, ADDR_QUERY_ANY)
15556 && addr.type == ADDRESS_REG_IMM);
15557 }
15558
15559 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
15560 We need to be able to access the individual pieces, so the range
15561 is different from LD[234] and ST[234]. */
15562 bool
15563 aarch64_sve_struct_memory_operand_p (rtx op)
15564 {
15565 if (!MEM_P (op))
15566 return false;
15567
15568 machine_mode mode = GET_MODE (op);
15569 struct aarch64_address_info addr;
15570 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
15571 ADDR_QUERY_ANY)
15572 || addr.type != ADDRESS_REG_IMM)
15573 return false;
15574
15575 poly_int64 first = addr.const_offset;
15576 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
15577 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
15578 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
15579 }
15580
15581 /* Emit a register copy from operand to operand, taking care not to
15582 early-clobber source registers in the process.
15583
15584 COUNT is the number of components into which the copy needs to be
15585 decomposed. */
15586 void
15587 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
15588 unsigned int count)
15589 {
15590 unsigned int i;
15591 int rdest = REGNO (operands[0]);
15592 int rsrc = REGNO (operands[1]);
15593
15594 if (!reg_overlap_mentioned_p (operands[0], operands[1])
15595 || rdest < rsrc)
15596 for (i = 0; i < count; i++)
15597 emit_move_insn (gen_rtx_REG (mode, rdest + i),
15598 gen_rtx_REG (mode, rsrc + i));
15599 else
15600 for (i = 0; i < count; i++)
15601 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
15602 gen_rtx_REG (mode, rsrc + count - i - 1));
15603 }
15604
15605 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
15606 one of VSTRUCT modes: OI, CI, or XI. */
15607 int
15608 aarch64_simd_attr_length_rglist (machine_mode mode)
15609 {
15610 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
15611 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
15612 }
15613
15614 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
15615 alignment of a vector to 128 bits. SVE predicates have an alignment of
15616 16 bits. */
15617 static HOST_WIDE_INT
15618 aarch64_simd_vector_alignment (const_tree type)
15619 {
15620 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15621 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
15622 be set for non-predicate vectors of booleans. Modes are the most
15623 direct way we have of identifying real SVE predicate types. */
15624 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
15625 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
15626 }
15627
15628 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
15629 static poly_uint64
15630 aarch64_vectorize_preferred_vector_alignment (const_tree type)
15631 {
15632 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
15633 {
15634 /* If the length of the vector is fixed, try to align to that length,
15635 otherwise don't try to align at all. */
15636 HOST_WIDE_INT result;
15637 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
15638 result = TYPE_ALIGN (TREE_TYPE (type));
15639 return result;
15640 }
15641 return TYPE_ALIGN (type);
15642 }
15643
15644 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
15645 static bool
15646 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
15647 {
15648 if (is_packed)
15649 return false;
15650
15651 /* For fixed-length vectors, check that the vectorizer will aim for
15652 full-vector alignment. This isn't true for generic GCC vectors
15653 that are wider than the ABI maximum of 128 bits. */
15654 poly_uint64 preferred_alignment =
15655 aarch64_vectorize_preferred_vector_alignment (type);
15656 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15657 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
15658 preferred_alignment))
15659 return false;
15660
15661 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
15662 return true;
15663 }
15664
15665 /* Return true if the vector misalignment factor is supported by the
15666 target. */
15667 static bool
15668 aarch64_builtin_support_vector_misalignment (machine_mode mode,
15669 const_tree type, int misalignment,
15670 bool is_packed)
15671 {
15672 if (TARGET_SIMD && STRICT_ALIGNMENT)
15673 {
15674 /* Return false if the movmisalign pattern is not supported for this mode. */
15675 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
15676 return false;
15677
15678 /* Misalignment factor is unknown at compile time. */
15679 if (misalignment == -1)
15680 return false;
15681 }
15682 return default_builtin_support_vector_misalignment (mode, type, misalignment,
15683 is_packed);
15684 }
15685
15686 /* If VALS is a vector constant that can be loaded into a register
15687 using DUP, generate instructions to do so and return an RTX to
15688 assign to the register. Otherwise return NULL_RTX. */
15689 static rtx
15690 aarch64_simd_dup_constant (rtx vals)
15691 {
15692 machine_mode mode = GET_MODE (vals);
15693 machine_mode inner_mode = GET_MODE_INNER (mode);
15694 rtx x;
15695
15696 if (!const_vec_duplicate_p (vals, &x))
15697 return NULL_RTX;
15698
15699 /* We can load this constant by using DUP and a constant in a
15700 single ARM register. This will be cheaper than a vector
15701 load. */
15702 x = copy_to_mode_reg (inner_mode, x);
15703 return gen_vec_duplicate (mode, x);
15704 }
15705
15706
15707 /* Generate code to load VALS, which is a PARALLEL containing only
15708 constants (for vec_init) or CONST_VECTOR, efficiently into a
15709 register. Returns an RTX to copy into the register, or NULL_RTX
15710 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
15711 static rtx
15712 aarch64_simd_make_constant (rtx vals)
15713 {
15714 machine_mode mode = GET_MODE (vals);
15715 rtx const_dup;
15716 rtx const_vec = NULL_RTX;
15717 int n_const = 0;
15718 int i;
15719
15720 if (GET_CODE (vals) == CONST_VECTOR)
15721 const_vec = vals;
15722 else if (GET_CODE (vals) == PARALLEL)
15723 {
15724 /* A CONST_VECTOR must contain only CONST_INTs and
15725 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15726 Only store valid constants in a CONST_VECTOR. */
15727 int n_elts = XVECLEN (vals, 0);
15728 for (i = 0; i < n_elts; ++i)
15729 {
15730 rtx x = XVECEXP (vals, 0, i);
15731 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15732 n_const++;
15733 }
15734 if (n_const == n_elts)
15735 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15736 }
15737 else
15738 gcc_unreachable ();
15739
15740 if (const_vec != NULL_RTX
15741 && aarch64_simd_valid_immediate (const_vec, NULL))
15742 /* Load using MOVI/MVNI. */
15743 return const_vec;
15744 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15745 /* Loaded using DUP. */
15746 return const_dup;
15747 else if (const_vec != NULL_RTX)
15748 /* Load from constant pool. We cannot take advantage of single-cycle
15749 LD1 because we need a PC-relative addressing mode. */
15750 return const_vec;
15751 else
15752 /* A PARALLEL containing something not valid inside CONST_VECTOR.
15753 We cannot construct an initializer. */
15754 return NULL_RTX;
15755 }
15756
15757 /* Expand a vector initialisation sequence, such that TARGET is
15758 initialised to contain VALS. */
15759
15760 void
15761 aarch64_expand_vector_init (rtx target, rtx vals)
15762 {
15763 machine_mode mode = GET_MODE (target);
15764 scalar_mode inner_mode = GET_MODE_INNER (mode);
15765 /* The number of vector elements. */
15766 int n_elts = XVECLEN (vals, 0);
15767 /* The number of vector elements which are not constant. */
15768 int n_var = 0;
15769 rtx any_const = NULL_RTX;
15770 /* The first element of vals. */
15771 rtx v0 = XVECEXP (vals, 0, 0);
15772 bool all_same = true;
15773
15774 /* This is a special vec_init<M><N> where N is not an element mode but a
15775 vector mode with half the elements of M. We expect to find two entries
15776 of mode N in VALS and we must put their concatenation into TARGET. */
15777 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
15778 {
15779 gcc_assert (known_eq (GET_MODE_SIZE (mode),
15780 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
15781 rtx lo = XVECEXP (vals, 0, 0);
15782 rtx hi = XVECEXP (vals, 0, 1);
15783 machine_mode narrow_mode = GET_MODE (lo);
15784 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
15785 gcc_assert (narrow_mode == GET_MODE (hi));
15786
15787 /* When we want to concatenate a half-width vector with zeroes we can
15788 use the aarch64_combinez[_be] patterns. Just make sure that the
15789 zeroes are in the right half. */
15790 if (BYTES_BIG_ENDIAN
15791 && aarch64_simd_imm_zero (lo, narrow_mode)
15792 && general_operand (hi, narrow_mode))
15793 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
15794 else if (!BYTES_BIG_ENDIAN
15795 && aarch64_simd_imm_zero (hi, narrow_mode)
15796 && general_operand (lo, narrow_mode))
15797 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
15798 else
15799 {
15800 /* Else create the two half-width registers and combine them. */
15801 if (!REG_P (lo))
15802 lo = force_reg (GET_MODE (lo), lo);
15803 if (!REG_P (hi))
15804 hi = force_reg (GET_MODE (hi), hi);
15805
15806 if (BYTES_BIG_ENDIAN)
15807 std::swap (lo, hi);
15808 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
15809 }
15810 return;
15811 }
15812
15813 /* Count the number of variable elements to initialise. */
15814 for (int i = 0; i < n_elts; ++i)
15815 {
15816 rtx x = XVECEXP (vals, 0, i);
15817 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15818 ++n_var;
15819 else
15820 any_const = x;
15821
15822 all_same &= rtx_equal_p (x, v0);
15823 }
15824
15825 /* No variable elements, hand off to aarch64_simd_make_constant which knows
15826 how best to handle this. */
15827 if (n_var == 0)
15828 {
15829 rtx constant = aarch64_simd_make_constant (vals);
15830 if (constant != NULL_RTX)
15831 {
15832 emit_move_insn (target, constant);
15833 return;
15834 }
15835 }
15836
15837 /* Splat a single non-constant element if we can. */
15838 if (all_same)
15839 {
15840 rtx x = copy_to_mode_reg (inner_mode, v0);
15841 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15842 return;
15843 }
15844
15845 enum insn_code icode = optab_handler (vec_set_optab, mode);
15846 gcc_assert (icode != CODE_FOR_nothing);
15847
15848 /* If there are only variable elements, try to optimize
15849 the insertion using dup for the most common element
15850 followed by insertions. */
15851
15852 /* The algorithm will fill matches[*][0] with the earliest matching element,
15853 and matches[X][1] with the count of duplicate elements (if X is the
15854 earliest element which has duplicates). */
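/* Worked example (element names assumed): for VALS == { a, b, a, a } the loop
   below produces matches[0] == { 0, 3 }, matches[1] == { 1, 1 } and
   matches[2] == matches[3] == { 0, 0 }, so element 0 is duplicated across the
   vector first and only lane 1 needs a separate insertion. */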
15855
15856 if (n_var == n_elts && n_elts <= 16)
15857 {
15858 int matches[16][2] = {0};
15859 for (int i = 0; i < n_elts; i++)
15860 {
15861 for (int j = 0; j <= i; j++)
15862 {
15863 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15864 {
15865 matches[i][0] = j;
15866 matches[j][1]++;
15867 break;
15868 }
15869 }
15870 }
15871 int maxelement = 0;
15872 int maxv = 0;
15873 for (int i = 0; i < n_elts; i++)
15874 if (matches[i][1] > maxv)
15875 {
15876 maxelement = i;
15877 maxv = matches[i][1];
15878 }
15879
15880 /* Create a duplicate of the most common element, unless all elements
15881 are equally useless to us, in which case just immediately set the
15882 vector register using the first element. */
15883
15884 if (maxv == 1)
15885 {
15886 /* For vectors of two 64-bit elements, we can do even better. */
15887 if (n_elts == 2
15888 && (inner_mode == E_DImode
15889 || inner_mode == E_DFmode))
15890
15891 {
15892 rtx x0 = XVECEXP (vals, 0, 0);
15893 rtx x1 = XVECEXP (vals, 0, 1);
15894 /* Combine can pick up this case, but handling it directly
15895 here leaves clearer RTL.
15896
15897 This is load_pair_lanes<mode>, and also gives us a clean-up
15898 for store_pair_lanes<mode>. */
15899 if (memory_operand (x0, inner_mode)
15900 && memory_operand (x1, inner_mode)
15901 && !STRICT_ALIGNMENT
15902 && rtx_equal_p (XEXP (x1, 0),
15903 plus_constant (Pmode,
15904 XEXP (x0, 0),
15905 GET_MODE_SIZE (inner_mode))))
15906 {
15907 rtx t;
15908 if (inner_mode == DFmode)
15909 t = gen_load_pair_lanesdf (target, x0, x1);
15910 else
15911 t = gen_load_pair_lanesdi (target, x0, x1);
15912 emit_insn (t);
15913 return;
15914 }
15915 }
15916 /* The subreg-move sequence below will move into lane zero of the
15917 vector register. For big-endian we want that position to hold
15918 the last element of VALS. */
15919 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15920 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15921 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15922 }
15923 else
15924 {
15925 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15926 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15927 }
15928
15929 /* Insert the rest. */
15930 for (int i = 0; i < n_elts; i++)
15931 {
15932 rtx x = XVECEXP (vals, 0, i);
15933 if (matches[i][0] == maxelement)
15934 continue;
15935 x = copy_to_mode_reg (inner_mode, x);
15936 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15937 }
15938 return;
15939 }
15940
15941 /* Initialise a vector which is part-variable. We want to first try
15942 to build those lanes which are constant in the most efficient way we
15943 can. */
15944 if (n_var != n_elts)
15945 {
15946 rtx copy = copy_rtx (vals);
15947
15948 /* Load constant part of vector. We really don't care what goes into the
15949 parts we will overwrite, but we're more likely to be able to load the
15950 constant efficiently if it has fewer, larger, repeating parts
15951 (see aarch64_simd_valid_immediate). */
15952 for (int i = 0; i < n_elts; i++)
15953 {
15954 rtx x = XVECEXP (vals, 0, i);
15955 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15956 continue;
15957 rtx subst = any_const;
15958 for (int bit = n_elts / 2; bit > 0; bit /= 2)
15959 {
15960 /* Look in the copied vector, as more elements are const. */
15961 rtx test = XVECEXP (copy, 0, i ^ bit);
15962 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15963 {
15964 subst = test;
15965 break;
15966 }
15967 }
15968 XVECEXP (copy, 0, i) = subst;
15969 }
15970 aarch64_expand_vector_init (target, copy);
15971 }
15972
15973 /* Insert the variable lanes directly. */
15974 for (int i = 0; i < n_elts; i++)
15975 {
15976 rtx x = XVECEXP (vals, 0, i);
15977 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15978 continue;
15979 x = copy_to_mode_reg (inner_mode, x);
15980 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15981 }
15982 }
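/* Editor's illustrative sketch (not part of the original sources): a
   self-contained model of the duplicate-detection pass in
   aarch64_expand_vector_init above, using plain ints in place of the rtx
   lane values.  For the lanes {7, 3, 7, 7} it returns lane 0 with a count
   of 3, so lane 0's value is the one worth broadcasting with DUP before
   the remaining lanes are inserted individually.  */
#if 0
static int
model_most_common_lane (const int *elts, int n_elts, int *count_out)
{
  int matches[16][2] = { { 0, 0 } };
  for (int i = 0; i < n_elts; i++)
    for (int j = 0; j <= i; j++)
      if (elts[i] == elts[j])
        {
          matches[i][0] = j;    /* Earliest lane equal to lane I.  */
          matches[j][1]++;      /* Duplicate count charged to lane J.  */
          break;
        }
  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n_elts; i++)
    if (matches[i][1] > maxv)
      {
        maxelement = i;
        maxv = matches[i][1];
      }
  *count_out = maxv;
  return maxelement;
}
#endif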
15983
15984 /* Emit RTL corresponding to:
15985 insr TARGET, ELEM. */
15986
15987 static void
15988 emit_insr (rtx target, rtx elem)
15989 {
15990 machine_mode mode = GET_MODE (target);
15991 scalar_mode elem_mode = GET_MODE_INNER (mode);
15992 elem = force_reg (elem_mode, elem);
15993
15994 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
15995 gcc_assert (icode != CODE_FOR_nothing);
15996 emit_insn (GEN_FCN (icode) (target, target, elem));
15997 }
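/* Editor's illustrative sketch (not part of the original sources): SVE
   INSR shifts every lane of the destination up by one element and writes
   the scalar into lane 0, which is why the routines below can build a
   vector back to front with repeated emit_insr calls.  A plain-C model
   over int lanes:  */
#if 0
static void
model_insr (int *lanes, int n_lanes, int elem)
{
  for (int i = n_lanes - 1; i > 0; i--)
    lanes[i] = lanes[i - 1];    /* Shift the existing lanes up.  */
  lanes[0] = elem;              /* The new element lands in lane 0.  */
}
#endif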
15998
15999 /* Subroutine of aarch64_sve_expand_vector_init for handling
16000 trailing constants.
16001 This function works as follows:
16002 (a) Create a new vector consisting of trailing constants.
16003 (b) Initialize TARGET with the constant vector using emit_move_insn.
16004 (c) Insert remaining elements in TARGET using insr.
16005 NELTS is the total number of elements in the original vector, while
16006 NELTS_REQD is the number of elements that are actually
16007 significant.
16008
16009 ??? The heuristic used is to do the above only if the number of constants
16010 is at least half the total number of elements. May need fine-tuning. */
16011
16012 static bool
16013 aarch64_sve_expand_vector_init_handle_trailing_constants
16014 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
16015 {
16016 machine_mode mode = GET_MODE (target);
16017 scalar_mode elem_mode = GET_MODE_INNER (mode);
16018 int n_trailing_constants = 0;
16019
16020 for (int i = nelts_reqd - 1;
16021 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
16022 i--)
16023 n_trailing_constants++;
16024
16025 if (n_trailing_constants >= nelts_reqd / 2)
16026 {
16027 rtx_vector_builder v (mode, 1, nelts);
16028 for (int i = 0; i < nelts; i++)
16029 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
16030 rtx const_vec = v.build ();
16031 emit_move_insn (target, const_vec);
16032
16033 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
16034 emit_insr (target, builder.elt (i));
16035
16036 return true;
16037 }
16038
16039 return false;
16040 }
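/* Editor's worked example (illustrative, not part of the original sources):
   with BUILDER = {x, y, 1, 2, 3, 4, 5, 6} and NELTS = NELTS_REQD = 8, the
   loop above finds N_TRAILING_CONSTANTS = 6, which satisfies 6 >= 8/2.
   TARGET is first loaded with the shifted constant vector
   {1, 2, 3, 4, 5, 6, ...} (the last two lanes are don't-cares) and then
   "insr y" followed by "insr x" shift it into the required
   {x, y, 1, 2, 3, 4, 5, 6}.  */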
16041
16042 /* Subroutine of aarch64_sve_expand_vector_init.
16043 Works as follows:
16044 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
16045 (b) Skip trailing elements from BUILDER, which are the same as
16046 element NELTS_REQD - 1.
16047 (c) Insert earlier elements in reverse order in TARGET using insr. */
16048
16049 static void
16050 aarch64_sve_expand_vector_init_insert_elems (rtx target,
16051 const rtx_vector_builder &builder,
16052 int nelts_reqd)
16053 {
16054 machine_mode mode = GET_MODE (target);
16055 scalar_mode elem_mode = GET_MODE_INNER (mode);
16056
16057 struct expand_operand ops[2];
16058 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
16059 gcc_assert (icode != CODE_FOR_nothing);
16060
16061 create_output_operand (&ops[0], target, mode);
16062 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
16063 expand_insn (icode, 2, ops);
16064
16065 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16066 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
16067 emit_insr (target, builder.elt (i));
16068 }
16069
16070 /* Subroutine of aarch64_sve_expand_vector_init to handle case
16071 when all trailing elements of builder are same.
16072 This works as follows:
16073 (a) Use expand_insn interface to broadcast last vector element in TARGET.
16074 (b) Insert remaining elements in TARGET using insr.
16075
16076 ??? The heuristic used is to do the above if the number of identical
16077 trailing elements is at least 3/4 of the total number of elements,
16078 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
16079
16080 static bool
16081 aarch64_sve_expand_vector_init_handle_trailing_same_elem
16082 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
16083 {
16084 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
16085 if (ndups >= (3 * nelts_reqd) / 4)
16086 {
16087 aarch64_sve_expand_vector_init_insert_elems (target, builder,
16088 nelts_reqd - ndups + 1);
16089 return true;
16090 }
16091
16092 return false;
16093 }
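/* Editor's worked example (illustrative, not part of the original sources):
   for BUILDER = {a, b, c, c, c, c, c, c} with NELTS_REQD = 8, count_dups
   finds 6 trailing copies of c, and 6 >= (3 * 8) / 4, so the helper above
   calls aarch64_sve_expand_vector_init_insert_elems with
   NELTS_REQD - NDUPS + 1 = 3.  That emits a "dup" of c followed by
   "insr b" and "insr a", giving {a, b, c, c, c, c, c, c}.  */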
16094
16095 /* Initialize register TARGET from BUILDER. NELTS is the constant number
16096 of elements in BUILDER.
16097
16098 The function tries to initialize TARGET from BUILDER if it fits one
16099 of the special cases outlined below.
16100
16101 Failing that, the function divides BUILDER into two sub-vectors:
16102 v_even = even elements of BUILDER;
16103 v_odd = odd elements of BUILDER;
16104
16105 and recursively calls itself with v_even and v_odd.
16106
16107 if (recursive call succeeded for v_even or v_odd)
16108 TARGET = zip (v_even, v_odd)
16109
16110 The function returns true if it managed to build TARGET from BUILDER
16111 with one of the special cases, false otherwise.
16112
16113 Example: {a, 1, b, 2, c, 3, d, 4}
16114
16115 The vector gets divided into:
16116 v_even = {a, b, c, d}
16117 v_odd = {1, 2, 3, 4}
16118
16119 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
16120 initializes tmp2 from the constant vector v_odd using emit_move_insn.
16121
16122 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
16123 4 variable elements, so we construct tmp1 from v_even using insr:
16124 tmp1 = dup(d)
16125 insr tmp1, c
16126 insr tmp1, b
16127 insr tmp1, a
16128
16129 And finally:
16130 TARGET = zip (tmp1, tmp2)
16131 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
16132
16133 static bool
16134 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
16135 int nelts, int nelts_reqd)
16136 {
16137 machine_mode mode = GET_MODE (target);
16138
16139 /* Case 1: Vector contains trailing constants. */
16140
16141 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16142 (target, builder, nelts, nelts_reqd))
16143 return true;
16144
16145 /* Case 2: Vector contains leading constants. */
16146
16147 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
16148 for (int i = 0; i < nelts_reqd; i++)
16149 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
16150 rev_builder.finalize ();
16151
16152 if (aarch64_sve_expand_vector_init_handle_trailing_constants
16153 (target, rev_builder, nelts, nelts_reqd))
16154 {
16155 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16156 return true;
16157 }
16158
16159 /* Case 3: Vector contains trailing same element. */
16160
16161 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
16162 (target, builder, nelts_reqd))
16163 return true;
16164
16165 /* Case 4: Vector contains leading same element. */
16166
16167 if (nelts_reqd == nelts
16168 && aarch64_sve_expand_vector_init_handle_trailing_same_elem (target, rev_builder, nelts_reqd))
16169 {
16170 emit_insn (gen_aarch64_sve_rev (mode, target, target));
16171 return true;
16172 }
16173
16174 /* Avoid recursing below 4-elements.
16175 ??? The threshold 4 may need fine-tuning. */
16176
16177 if (nelts_reqd <= 4)
16178 return false;
16179
16180 rtx_vector_builder v_even (mode, 1, nelts);
16181 rtx_vector_builder v_odd (mode, 1, nelts);
16182
16183 for (int i = 0; i < nelts * 2; i += 2)
16184 {
16185 v_even.quick_push (builder.elt (i));
16186 v_odd.quick_push (builder.elt (i + 1));
16187 }
16188
16189 v_even.finalize ();
16190 v_odd.finalize ();
16191
16192 rtx tmp1 = gen_reg_rtx (mode);
16193 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
16194 nelts, nelts_reqd / 2);
16195
16196 rtx tmp2 = gen_reg_rtx (mode);
16197 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
16198 nelts, nelts_reqd / 2);
16199
16200 if (!did_even_p && !did_odd_p)
16201 return false;
16202
16203 /* Initialize v_even and v_odd using INSR if it didn't match any of the
16204 special cases and zip v_even, v_odd. */
16205
16206 if (!did_even_p)
16207 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
16208
16209 if (!did_odd_p)
16210 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
16211
16212 rtvec v = gen_rtvec (2, tmp1, tmp2);
16213 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
16214 return true;
16215 }
16216
16217 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
16218
16219 void
16220 aarch64_sve_expand_vector_init (rtx target, rtx vals)
16221 {
16222 machine_mode mode = GET_MODE (target);
16223 int nelts = XVECLEN (vals, 0);
16224
16225 rtx_vector_builder v (mode, 1, nelts);
16226 for (int i = 0; i < nelts; i++)
16227 v.quick_push (XVECEXP (vals, 0, i));
16228 v.finalize ();
16229
16230 /* If neither sub-vector of v could be initialized specially,
16231 then use INSR to insert all elements from v into TARGET.
16232 ??? This might not be optimal for vectors with large
16233 initializers like 16-element or above.
16234 For nelts < 4, it probably isn't useful to handle specially. */
16235
16236 if (nelts < 4
16237 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
16238 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
16239 }
16240
16241 static unsigned HOST_WIDE_INT
16242 aarch64_shift_truncation_mask (machine_mode mode)
16243 {
16244 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
16245 return 0;
16246 return GET_MODE_UNIT_BITSIZE (mode) - 1;
16247 }
16248
16249 /* Select a format to encode pointers in exception handling data. */
16250 int
16251 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
16252 {
16253 int type;
16254 switch (aarch64_cmodel)
16255 {
16256 case AARCH64_CMODEL_TINY:
16257 case AARCH64_CMODEL_TINY_PIC:
16258 case AARCH64_CMODEL_SMALL:
16259 case AARCH64_CMODEL_SMALL_PIC:
16260 case AARCH64_CMODEL_SMALL_SPIC:
16261 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
16262 for everything. */
16263 type = DW_EH_PE_sdata4;
16264 break;
16265 default:
16266 /* No assumptions here. 8-byte relocs required. */
16267 type = DW_EH_PE_sdata8;
16268 break;
16269 }
16270 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
16271 }
16272
16273 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
16274
16275 static void
16276 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
16277 {
16278 if (aarch64_simd_decl_p (decl))
16279 {
16280 fprintf (stream, "\t.variant_pcs\t");
16281 assemble_name (stream, name);
16282 fprintf (stream, "\n");
16283 }
16284 }
16285
16286 /* The last .arch and .tune assembly strings that we printed. */
16287 static std::string aarch64_last_printed_arch_string;
16288 static std::string aarch64_last_printed_tune_string;
16289
16290 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
16291 by the function fndecl. */
16292
16293 void
16294 aarch64_declare_function_name (FILE *stream, const char* name,
16295 tree fndecl)
16296 {
16297 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
16298
16299 struct cl_target_option *targ_options;
16300 if (target_parts)
16301 targ_options = TREE_TARGET_OPTION (target_parts);
16302 else
16303 targ_options = TREE_TARGET_OPTION (target_option_current_node);
16304 gcc_assert (targ_options);
16305
16306 const struct processor *this_arch
16307 = aarch64_get_arch (targ_options->x_explicit_arch);
16308
16309 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
16310 std::string extension
16311 = aarch64_get_extension_string_for_isa_flags (isa_flags,
16312 this_arch->flags);
16313 /* Only update the assembler .arch string if it is distinct from the last
16314 such string we printed. */
16315 std::string to_print = this_arch->name + extension;
16316 if (to_print != aarch64_last_printed_arch_string)
16317 {
16318 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
16319 aarch64_last_printed_arch_string = to_print;
16320 }
16321
16322 /* Print the cpu name we're tuning for in the comments; this might be
16323 useful to readers of the generated asm. Do it only when it changes
16324 from function to function and verbose assembly is requested. */
16325 const struct processor *this_tune
16326 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
16327
16328 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
16329 {
16330 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
16331 this_tune->name);
16332 aarch64_last_printed_tune_string = this_tune->name;
16333 }
16334
16335 aarch64_asm_output_variant_pcs (stream, fndecl, name);
16336
16337 /* Don't forget the type directive for ELF. */
16338 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
16339 ASM_OUTPUT_LABEL (stream, name);
16340 }
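/* Editor's example (illustrative, not part of the original sources): in a
   translation unit compiled for plain -march=armv8-a, a definition like
   the sketch below makes aarch64_declare_function_name print a directive
   along the lines of ".arch armv8-a+sve" before the function label; with
   -dA a "// .tune <cpu>" comment may also appear when the tuning target
   changes from function to function.  */
#if 0
__attribute__ ((target ("+sve")))
void uses_sve (void)
{
}
#endif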
16341
16342 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
16343
16344 void
16345 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
16346 {
16347 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
16348 const char *value = IDENTIFIER_POINTER (target);
16349 aarch64_asm_output_variant_pcs (stream, decl, name);
16350 ASM_OUTPUT_DEF (stream, name, value);
16351 }
16352
16353 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
16354 function symbol references. */
16355
16356 void
16357 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
16358 {
16359 default_elf_asm_output_external (stream, decl, name);
16360 aarch64_asm_output_variant_pcs (stream, decl, name);
16361 }
16362
16363 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
16364 Used to output the .cfi_b_key_frame directive when signing the current
16365 function with the B key. */
16366
16367 void
16368 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
16369 {
16370 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
16371 && aarch64_ra_sign_key == AARCH64_KEY_B)
16372 asm_fprintf (f, "\t.cfi_b_key_frame\n");
16373 }
16374
16375 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
16376
16377 static void
16378 aarch64_start_file (void)
16379 {
16380 struct cl_target_option *default_options
16381 = TREE_TARGET_OPTION (target_option_default_node);
16382
16383 const struct processor *default_arch
16384 = aarch64_get_arch (default_options->x_explicit_arch);
16385 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
16386 std::string extension
16387 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
16388 default_arch->flags);
16389
16390 aarch64_last_printed_arch_string = default_arch->name + extension;
16391 aarch64_last_printed_tune_string = "";
16392 asm_fprintf (asm_out_file, "\t.arch %s\n",
16393 aarch64_last_printed_arch_string.c_str ());
16394
16395 default_file_start ();
16396 }
16397
16398 /* Emit load exclusive. */
16399
16400 static void
16401 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
16402 rtx mem, rtx model_rtx)
16403 {
16404 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
16405 }
16406
16407 /* Emit store exclusive. */
16408
16409 static void
16410 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
16411 rtx mem, rtx rval, rtx model_rtx)
16412 {
16413 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
16414 }
16415
16416 /* Mark the previous jump instruction as unlikely. */
16417
16418 static void
16419 aarch64_emit_unlikely_jump (rtx insn)
16420 {
16421 rtx_insn *jump = emit_jump_insn (insn);
16422 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
16423 }
16424
16425 /* Expand a compare and swap pattern. */
16426
16427 void
16428 aarch64_expand_compare_and_swap (rtx operands[])
16429 {
16430 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
16431 machine_mode mode, r_mode;
16432
16433 bval = operands[0];
16434 rval = operands[1];
16435 mem = operands[2];
16436 oldval = operands[3];
16437 newval = operands[4];
16438 is_weak = operands[5];
16439 mod_s = operands[6];
16440 mod_f = operands[7];
16441 mode = GET_MODE (mem);
16442
16443 /* Normally the succ memory model must be stronger than fail, but in the
16444 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
16445 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
16446 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
16447 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
16448 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
16449
16450 r_mode = mode;
16451 if (mode == QImode || mode == HImode)
16452 {
16453 r_mode = SImode;
16454 rval = gen_reg_rtx (r_mode);
16455 }
16456
16457 if (TARGET_LSE)
16458 {
16459 /* The CAS insn requires oldval and rval overlap, but we need to
16460 have a copy of oldval saved across the operation to tell if
16461 the operation is successful. */
16462 if (reg_overlap_mentioned_p (rval, oldval))
16463 rval = copy_to_mode_reg (r_mode, oldval);
16464 else
16465 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
16466
16467 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
16468 newval, mod_s));
16469 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16470 }
16471 else
16472 {
16473 /* The oldval predicate varies by mode. Test it and force to reg. */
16474 insn_code code = code_for_aarch64_compare_and_swap (mode);
16475 if (!insn_data[code].operand[2].predicate (oldval, mode))
16476 oldval = force_reg (mode, oldval);
16477
16478 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
16479 is_weak, mod_s, mod_f));
16480 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
16481 }
16482
16483 if (r_mode != mode)
16484 rval = gen_lowpart (mode, rval);
16485 emit_move_insn (operands[1], rval);
16486
16487 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
16488 emit_insn (gen_rtx_SET (bval, x));
16489 }
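/* Editor's usage sketch (illustrative, not part of the original sources):
   this expander is what a GNU/C11 compare-exchange ultimately reaches.
   In outline, with -march=armv8.1-a (TARGET_LSE) the code below takes the
   single-instruction CAS path above, while base armv8-a goes through the
   load/store-exclusive loop split out by aarch64_split_compare_and_swap
   further down.  */
#if 0
#include <stdatomic.h>

static _Bool
model_cas (_Atomic int *p, int expected, int desired)
{
  return atomic_compare_exchange_strong (p, &expected, desired);
}
#endif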
16490
16491 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
16492 sequence implementing an atomic operation. */
16493
16494 static void
16495 aarch64_emit_post_barrier (enum memmodel model)
16496 {
16497 const enum memmodel base_model = memmodel_base (model);
16498
16499 if (is_mm_sync (model)
16500 && (base_model == MEMMODEL_ACQUIRE
16501 || base_model == MEMMODEL_ACQ_REL
16502 || base_model == MEMMODEL_SEQ_CST))
16503 {
16504 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
16505 }
16506 }
16507
16508 /* Split a compare and swap pattern. */
16509
16510 void
16511 aarch64_split_compare_and_swap (rtx operands[])
16512 {
16513 rtx rval, mem, oldval, newval, scratch;
16514 machine_mode mode;
16515 bool is_weak;
16516 rtx_code_label *label1, *label2;
16517 rtx x, cond;
16518 enum memmodel model;
16519 rtx model_rtx;
16520
16521 rval = operands[0];
16522 mem = operands[1];
16523 oldval = operands[2];
16524 newval = operands[3];
16525 is_weak = (operands[4] != const0_rtx);
16526 model_rtx = operands[5];
16527 scratch = operands[7];
16528 mode = GET_MODE (mem);
16529 model = memmodel_from_int (INTVAL (model_rtx));
16530
16531 /* When OLDVAL is zero and we want the strong version we can emit a tighter
16532 loop:
16533 .label1:
16534 LD[A]XR rval, [mem]
16535 CBNZ rval, .label2
16536 ST[L]XR scratch, newval, [mem]
16537 CBNZ scratch, .label1
16538 .label2:
16539 CMP rval, 0. */
16540 bool strong_zero_p = !is_weak && oldval == const0_rtx;
16541
16542 label1 = NULL;
16543 if (!is_weak)
16544 {
16545 label1 = gen_label_rtx ();
16546 emit_label (label1);
16547 }
16548 label2 = gen_label_rtx ();
16549
16550 /* The initial load can be relaxed for a __sync operation since a final
16551 barrier will be emitted to stop code hoisting. */
16552 if (is_mm_sync (model))
16553 aarch64_emit_load_exclusive (mode, rval, mem,
16554 GEN_INT (MEMMODEL_RELAXED));
16555 else
16556 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
16557
16558 if (strong_zero_p)
16559 {
16560 if (aarch64_track_speculation)
16561 {
16562 /* Emit an explicit compare instruction, so that we can correctly
16563 track the condition codes. */
16564 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
16565 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16566 }
16567 else
16568 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
16569
16570 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16571 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16572 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16573 }
16574 else
16575 {
16576 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
16577 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16578 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16579 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
16580 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16581 }
16582
16583 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
16584
16585 if (!is_weak)
16586 {
16587 if (aarch64_track_speculation)
16588 {
16589 /* Emit an explicit compare instruction, so that we can correctly
16590 track the condition codes. */
16591 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
16592 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16593 }
16594 else
16595 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
16596
16597 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16598 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
16599 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16600 }
16601 else
16602 {
16603 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16604 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
16605 emit_insn (gen_rtx_SET (cond, x));
16606 }
16607
16608 emit_label (label2);
16609 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
16610 to set the condition flags. If this is not used it will be removed by
16611 later passes. */
16612 if (strong_zero_p)
16613 {
16614 cond = gen_rtx_REG (CCmode, CC_REGNUM);
16615 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
16616 emit_insn (gen_rtx_SET (cond, x));
16617 }
16618 /* Emit any final barrier needed for a __sync operation. */
16619 if (is_mm_sync (model))
16620 aarch64_emit_post_barrier (model);
16621 }
16622
16623 /* Split an atomic operation. */
16624
16625 void
16626 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
16627 rtx value, rtx model_rtx, rtx cond)
16628 {
16629 machine_mode mode = GET_MODE (mem);
16630 machine_mode wmode = (mode == DImode ? DImode : SImode);
16631 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
16632 const bool is_sync = is_mm_sync (model);
16633 rtx_code_label *label;
16634 rtx x;
16635
16636 /* Split the atomic operation into a sequence. */
16637 label = gen_label_rtx ();
16638 emit_label (label);
16639
16640 if (new_out)
16641 new_out = gen_lowpart (wmode, new_out);
16642 if (old_out)
16643 old_out = gen_lowpart (wmode, old_out);
16644 else
16645 old_out = new_out;
16646 value = simplify_gen_subreg (wmode, value, mode, 0);
16647
16648 /* The initial load can be relaxed for a __sync operation since a final
16649 barrier will be emitted to stop code hoisting. */
16650 if (is_sync)
16651 aarch64_emit_load_exclusive (mode, old_out, mem,
16652 GEN_INT (MEMMODEL_RELAXED));
16653 else
16654 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
16655
16656 switch (code)
16657 {
16658 case SET:
16659 new_out = value;
16660 break;
16661
16662 case NOT:
16663 x = gen_rtx_AND (wmode, old_out, value);
16664 emit_insn (gen_rtx_SET (new_out, x));
16665 x = gen_rtx_NOT (wmode, new_out);
16666 emit_insn (gen_rtx_SET (new_out, x));
16667 break;
16668
16669 case MINUS:
16670 if (CONST_INT_P (value))
16671 {
16672 value = GEN_INT (-INTVAL (value));
16673 code = PLUS;
16674 }
16675 /* Fall through. */
16676
16677 default:
16678 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
16679 emit_insn (gen_rtx_SET (new_out, x));
16680 break;
16681 }
16682
16683 aarch64_emit_store_exclusive (mode, cond, mem,
16684 gen_lowpart (mode, new_out), model_rtx);
16685
16686 if (aarch64_track_speculation)
16687 {
16688 /* Emit an explicit compare instruction, so that we can correctly
16689 track the condition codes. */
16690 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
16691 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
16692 }
16693 else
16694 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
16695
16696 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16697 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
16698 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16699
16700 /* Emit any final barrier needed for a __sync operation. */
16701 if (is_sync)
16702 aarch64_emit_post_barrier (model);
16703 }
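/* Editor's usage sketch (illustrative, not part of the original sources):
   without LSE atomics, a read-modify-write such as the one below is
   eventually split by aarch64_split_atomic_op above into a
   load-exclusive / operate / store-exclusive loop, with the retry branch
   emitted through aarch64_emit_unlikely_jump.  */
#if 0
#include <stdatomic.h>

static int
model_fetch_add (_Atomic int *p, int v)
{
  return atomic_fetch_add (p, v);
}
#endif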
16704
16705 static void
16706 aarch64_init_libfuncs (void)
16707 {
16708 /* Half-precision float operations. The compiler handles all operations
16709 with NULL libfuncs by converting to SFmode. */
16710
16711 /* Conversions. */
16712 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
16713 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
16714
16715 /* Arithmetic. */
16716 set_optab_libfunc (add_optab, HFmode, NULL);
16717 set_optab_libfunc (sdiv_optab, HFmode, NULL);
16718 set_optab_libfunc (smul_optab, HFmode, NULL);
16719 set_optab_libfunc (neg_optab, HFmode, NULL);
16720 set_optab_libfunc (sub_optab, HFmode, NULL);
16721
16722 /* Comparisons. */
16723 set_optab_libfunc (eq_optab, HFmode, NULL);
16724 set_optab_libfunc (ne_optab, HFmode, NULL);
16725 set_optab_libfunc (lt_optab, HFmode, NULL);
16726 set_optab_libfunc (le_optab, HFmode, NULL);
16727 set_optab_libfunc (ge_optab, HFmode, NULL);
16728 set_optab_libfunc (gt_optab, HFmode, NULL);
16729 set_optab_libfunc (unord_optab, HFmode, NULL);
16730 }
16731
16732 /* Target hook for c_mode_for_suffix. */
16733 static machine_mode
16734 aarch64_c_mode_for_suffix (char suffix)
16735 {
16736 if (suffix == 'q')
16737 return TFmode;
16738
16739 return VOIDmode;
16740 }
16741
16742 /* We can only represent floating point constants which will fit in
16743 "quarter-precision" values. These values are characterised by
16744 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
16745 by:
16746
16747 (-1)^s * (n/16) * 2^r
16748
16749 Where:
16750 's' is the sign bit.
16751 'n' is an integer in the range 16 <= n <= 31.
16752 'r' is an integer in the range -3 <= r <= 4. */
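/* Editor's sketch (illustrative, not part of the original sources): a
   brute-force restatement of the formula above, enumerating all 256
   representable magnitudes.  Exact equality is safe here because every
   candidate value is exactly representable in double.  For example
   0.5 = (16/16) * 2^-1 is representable, while 0.1 is not.  */
#if 0
#include <math.h>
#include <stdbool.h>

static bool
model_quarter_precision_p (double x)
{
  if (x == 0.0 || !isfinite (x))
    return false;
  double mag = fabs (x);
  for (int n = 16; n <= 31; n++)
    for (int r = -3; r <= 4; r++)
      if (mag == (n / 16.0) * ldexp (1.0, r))
        return true;
  return false;
}
#endif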
16753
16754 /* Return true iff X can be represented as a quarter-precision
16755 floating point immediate. Note, we cannot represent 0.0. */
16756 bool
16757 aarch64_float_const_representable_p (rtx x)
16758 {
16759 /* This represents our current view of how many bits
16760 make up the mantissa. */
16761 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
16762 int exponent;
16763 unsigned HOST_WIDE_INT mantissa, mask;
16764 REAL_VALUE_TYPE r, m;
16765 bool fail;
16766
16767 if (!CONST_DOUBLE_P (x))
16768 return false;
16769
16770 if (GET_MODE (x) == VOIDmode
16771 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
16772 return false;
16773
16774 r = *CONST_DOUBLE_REAL_VALUE (x);
16775
16776 /* We cannot represent infinities, NaNs or +/-zero. We won't
16777 know if we have +zero until we analyse the mantissa, but we
16778 can reject the other invalid values. */
16779 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
16780 || REAL_VALUE_MINUS_ZERO (r))
16781 return false;
16782
16783 /* Extract exponent. */
16784 r = real_value_abs (&r);
16785 exponent = REAL_EXP (&r);
16786
16787 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
16788 highest (sign) bit, with a fixed binary point at bit point_pos.
16789 The wide_int W computed below holds the low part of the mantissa in w.ulow () and the high part in w.elt (1).
16790 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
16791 bits for the mantissa, this can fail (low bits will be lost). */
16792 real_ldexp (&m, &r, point_pos - exponent);
16793 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
16794
16795 /* If the low part of the mantissa has bits set we cannot represent
16796 the value. */
16797 if (w.ulow () != 0)
16798 return false;
16799 /* We have rejected the lower HOST_WIDE_INT, so update our
16800 understanding of how many bits lie in the mantissa and
16801 look only at the high HOST_WIDE_INT. */
16802 mantissa = w.elt (1);
16803 point_pos -= HOST_BITS_PER_WIDE_INT;
16804
16805 /* We can only represent values with a mantissa of the form 1.xxxx. */
16806 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
16807 if ((mantissa & mask) != 0)
16808 return false;
16809
16810 /* Having filtered unrepresentable values, we may now remove all
16811 but the highest 5 bits. */
16812 mantissa >>= point_pos - 5;
16813
16814 /* We cannot represent the value 0.0, so reject it. This is handled
16815 elsewhere. */
16816 if (mantissa == 0)
16817 return false;
16818
16819 /* Then, as bit 4 is always set, we can mask it off, leaving
16820 the mantissa in the range [0, 15]. */
16821 mantissa &= ~(1 << 4);
16822 gcc_assert (mantissa <= 15);
16823
16824 /* GCC internally does not use IEEE754-like encoding (where normalized
16825 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
16826 Our mantissa values are shifted 4 places to the left relative to
16827 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
16828 by 5 places to correct for GCC's representation. */
16829 exponent = 5 - exponent;
16830
16831 return (exponent >= 0 && exponent <= 7);
16832 }
16833
16834 /* Return the instruction string for the AdvSIMD MOVI, MVNI, ORR or BIC
16835 immediate that implements CONST_VECTOR, a vector immediate of WIDTH bits.
16836 WHICH selects whether to output a MOVI/MVNI, ORR or BIC immediate. */
16837 char*
16838 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
16839 enum simd_immediate_check which)
16840 {
16841 bool is_valid;
16842 static char templ[40];
16843 const char *mnemonic;
16844 const char *shift_op;
16845 unsigned int lane_count = 0;
16846 char element_char;
16847
16848 struct simd_immediate_info info;
16849
16850 /* This will return true to show const_vector is legal for use as either
16851 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
16852 It will also update INFO to show how the immediate should be generated.
16853 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
16854 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
16855 gcc_assert (is_valid);
16856
16857 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16858 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
16859
16860 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16861 {
16862 gcc_assert (info.insn == simd_immediate_info::MOV
16863 && info.u.mov.shift == 0);
16864 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
16865 move immediate path. */
16866 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16867 info.u.mov.value = GEN_INT (0);
16868 else
16869 {
16870 const unsigned int buf_size = 20;
16871 char float_buf[buf_size] = {'\0'};
16872 real_to_decimal_for_mode (float_buf,
16873 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
16874 buf_size, buf_size, 1, info.elt_mode);
16875
16876 if (lane_count == 1)
16877 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
16878 else
16879 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
16880 lane_count, element_char, float_buf);
16881 return templ;
16882 }
16883 }
16884
16885 gcc_assert (CONST_INT_P (info.u.mov.value));
16886
16887 if (which == AARCH64_CHECK_MOV)
16888 {
16889 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
16890 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
16891 ? "msl" : "lsl");
16892 if (lane_count == 1)
16893 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
16894 mnemonic, UINTVAL (info.u.mov.value));
16895 else if (info.u.mov.shift)
16896 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16897 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
16898 element_char, UINTVAL (info.u.mov.value), shift_op,
16899 info.u.mov.shift);
16900 else
16901 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
16902 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
16903 element_char, UINTVAL (info.u.mov.value));
16904 }
16905 else
16906 {
16907 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
16908 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
16909 if (info.u.mov.shift)
16910 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16911 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
16912 element_char, UINTVAL (info.u.mov.value), "lsl",
16913 info.u.mov.shift);
16914 else
16915 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
16916 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
16917 element_char, UINTVAL (info.u.mov.value));
16918 }
16919 return templ;
16920 }
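/* Editor's example (illustrative, not part of the original sources): a
   constant broadcast like the one below is typically printed through this
   routine, producing a single move-immediate such as
   "movi v0.4s, 0x1, lsl 8" for the value 0x100.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
splat_0x100 (void)
{
  return (v4si) { 0x100, 0x100, 0x100, 0x100 };
}
#endif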
16921
16922 char*
16923 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
16924 {
16925
16926 /* If a floating-point number was passed and we want to use it in an
16927 integer mode, convert it to an integer first. */
16928 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
16929 {
16930 unsigned HOST_WIDE_INT ival;
16931 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
16932 gcc_unreachable ();
16933 immediate = gen_int_mode (ival, mode);
16934 }
16935
16936 machine_mode vmode;
16937 /* Use a 64-bit container mode for everything except DI/DF mode, where we
16938 use a 128-bit vector mode. */
16939 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
16940
16941 vmode = aarch64_simd_container_mode (mode, width);
16942 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
16943 return aarch64_output_simd_mov_immediate (v_op, width);
16944 }
16945
16946 /* Return the output string to use for moving immediate CONST_VECTOR
16947 into an SVE register. */
16948
16949 char *
16950 aarch64_output_sve_mov_immediate (rtx const_vector)
16951 {
16952 static char templ[40];
16953 struct simd_immediate_info info;
16954 char element_char;
16955
16956 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
16957 gcc_assert (is_valid);
16958
16959 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
16960
16961 machine_mode vec_mode = GET_MODE (const_vector);
16962 if (aarch64_sve_pred_mode_p (vec_mode))
16963 {
16964 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16965 if (info.insn == simd_immediate_info::MOV)
16966 {
16967 gcc_assert (info.u.mov.value == const0_rtx);
16968 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
16969 }
16970 else
16971 {
16972 gcc_assert (info.insn == simd_immediate_info::PTRUE);
16973 unsigned int total_bytes;
16974 if (info.u.pattern == AARCH64_SV_ALL
16975 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
16976 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
16977 total_bytes / GET_MODE_SIZE (info.elt_mode));
16978 else
16979 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
16980 svpattern_token (info.u.pattern));
16981 }
16982 return buf;
16983 }
16984
16985 if (info.insn == simd_immediate_info::INDEX)
16986 {
16987 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
16988 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
16989 element_char, INTVAL (info.u.index.base),
16990 INTVAL (info.u.index.step));
16991 return templ;
16992 }
16993
16994 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
16995 {
16996 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
16997 info.u.mov.value = GEN_INT (0);
16998 else
16999 {
17000 const int buf_size = 20;
17001 char float_buf[buf_size] = {};
17002 real_to_decimal_for_mode (float_buf,
17003 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
17004 buf_size, buf_size, 1, info.elt_mode);
17005
17006 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
17007 element_char, float_buf);
17008 return templ;
17009 }
17010 }
17011
17012 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
17013 element_char, INTVAL (info.u.mov.value));
17014 return templ;
17015 }
17016
17017 /* Split a move of op[1] and op[2] into the two halves of op[0]. */
17018
17019 void
17020 aarch64_split_combinev16qi (rtx operands[3])
17021 {
17022 unsigned int dest = REGNO (operands[0]);
17023 unsigned int src1 = REGNO (operands[1]);
17024 unsigned int src2 = REGNO (operands[2]);
17025 machine_mode halfmode = GET_MODE (operands[1]);
17026 unsigned int halfregs = REG_NREGS (operands[1]);
17027 rtx destlo, desthi;
17028
17029 gcc_assert (halfmode == V16QImode);
17030
17031 if (src1 == dest && src2 == dest + halfregs)
17032 {
17033 /* No-op move. Can't split to nothing; emit something. */
17034 emit_note (NOTE_INSN_DELETED);
17035 return;
17036 }
17037
17038 /* Preserve register attributes for variable tracking. */
17039 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
17040 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
17041 GET_MODE_SIZE (halfmode));
17042
17043 /* Special case of reversed high/low parts. */
17044 if (reg_overlap_mentioned_p (operands[2], destlo)
17045 && reg_overlap_mentioned_p (operands[1], desthi))
17046 {
17047 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17048 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
17049 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
17050 }
17051 else if (!reg_overlap_mentioned_p (operands[2], destlo))
17052 {
17053 /* Try to avoid unnecessary moves if part of the result
17054 is in the right place already. */
17055 if (src1 != dest)
17056 emit_move_insn (destlo, operands[1]);
17057 if (src2 != dest + halfregs)
17058 emit_move_insn (desthi, operands[2]);
17059 }
17060 else
17061 {
17062 if (src2 != dest + halfregs)
17063 emit_move_insn (desthi, operands[2]);
17064 if (src1 != dest)
17065 emit_move_insn (destlo, operands[1]);
17066 }
17067 }
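/* Editor's sketch (illustrative, not part of the original sources): the
   three XORs above are the classic in-place swap, used because both
   halves of the destination overlap both sources and no scratch register
   is available.  Scalar model:  */
#if 0
static void
model_xor_swap (unsigned int *a, unsigned int *b)
{
  *a ^= *b;     /* a = a ^ b             */
  *b ^= *a;     /* b = b ^ (a ^ b) == a  */
  *a ^= *b;     /* a = (a ^ b) ^ a == b  */
}
#endif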
17068
17069 /* vec_perm support. */
17070
17071 struct expand_vec_perm_d
17072 {
17073 rtx target, op0, op1;
17074 vec_perm_indices perm;
17075 machine_mode vmode;
17076 unsigned int vec_flags;
17077 bool one_vector_p;
17078 bool testing_p;
17079 };
17080
17081 /* Generate a variable permutation. */
17082
17083 static void
17084 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
17085 {
17086 machine_mode vmode = GET_MODE (target);
17087 bool one_vector_p = rtx_equal_p (op0, op1);
17088
17089 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
17090 gcc_checking_assert (GET_MODE (op0) == vmode);
17091 gcc_checking_assert (GET_MODE (op1) == vmode);
17092 gcc_checking_assert (GET_MODE (sel) == vmode);
17093 gcc_checking_assert (TARGET_SIMD);
17094
17095 if (one_vector_p)
17096 {
17097 if (vmode == V8QImode)
17098 {
17099 /* Expand the argument to a V16QI mode by duplicating it. */
17100 rtx pair = gen_reg_rtx (V16QImode);
17101 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
17102 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17103 }
17104 else
17105 {
17106 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
17107 }
17108 }
17109 else
17110 {
17111 rtx pair;
17112
17113 if (vmode == V8QImode)
17114 {
17115 pair = gen_reg_rtx (V16QImode);
17116 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
17117 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
17118 }
17119 else
17120 {
17121 pair = gen_reg_rtx (OImode);
17122 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
17123 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
17124 }
17125 }
17126 }
17127
17128 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
17129 NELT is the number of elements in the vector. */
17130
17131 void
17132 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
17133 unsigned int nelt)
17134 {
17135 machine_mode vmode = GET_MODE (target);
17136 bool one_vector_p = rtx_equal_p (op0, op1);
17137 rtx mask;
17138
17139 /* The TBL instruction does not use a modulo index, so we must take care
17140 of that ourselves. */
17141 mask = aarch64_simd_gen_const_vector_dup (vmode,
17142 one_vector_p ? nelt - 1 : 2 * nelt - 1);
17143 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
17144
17145 /* For big-endian, we also need to reverse the index within the vector
17146 (but not which vector). */
17147 if (BYTES_BIG_ENDIAN)
17148 {
17149 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
17150 if (!one_vector_p)
17151 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
17152 sel = expand_simple_binop (vmode, XOR, sel, mask,
17153 NULL, 0, OPTAB_LIB_WIDEN);
17154 }
17155 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
17156 }
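/* Editor's sketch (illustrative, not part of the original sources): the
   AND/XOR massaging above, written out for a single index.  The AND
   applies the modulo that vec_perm requires but TBL does not provide; the
   XOR flips the lane within each input vector for big-endian.  */
#if 0
static unsigned int
model_tbl_index (unsigned int sel, unsigned int nelt,
                 int one_vector_p, int big_endian_p)
{
  unsigned int mask = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  sel &= mask;
  if (big_endian_p)
    sel ^= nelt - 1;
  return sel;
}
#endif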
17157
17158 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
17159
17160 static void
17161 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
17162 {
17163 emit_insn (gen_rtx_SET (target,
17164 gen_rtx_UNSPEC (GET_MODE (target),
17165 gen_rtvec (2, op0, op1), code)));
17166 }
17167
17168 /* Expand an SVE vec_perm with the given operands. */
17169
17170 void
17171 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
17172 {
17173 machine_mode data_mode = GET_MODE (target);
17174 machine_mode sel_mode = GET_MODE (sel);
17175 /* Enforced by the pattern condition. */
17176 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
17177
17178 /* Note: vec_perm indices are supposed to wrap when they go beyond the
17179 size of the two value vectors, i.e. the upper bits of the indices
17180 are effectively ignored. SVE TBL instead produces 0 for any
17181 out-of-range indices, so we need to modulo all the vec_perm indices
17182 to ensure they are all in range. */
17183 rtx sel_reg = force_reg (sel_mode, sel);
17184
17185 /* Check if the sel only references the first values vector. */
17186 if (GET_CODE (sel) == CONST_VECTOR
17187 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
17188 {
17189 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
17190 return;
17191 }
17192
17193 /* Check if the two values vectors are the same. */
17194 if (rtx_equal_p (op0, op1))
17195 {
17196 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
17197 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17198 NULL, 0, OPTAB_DIRECT);
17199 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
17200 return;
17201 }
17202
17203 /* Run TBL on each value vector and combine the results. */
17204
17205 rtx res0 = gen_reg_rtx (data_mode);
17206 rtx res1 = gen_reg_rtx (data_mode);
17207 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
17208 if (GET_CODE (sel) != CONST_VECTOR
17209 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
17210 {
17211 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
17212 2 * nunits - 1);
17213 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
17214 NULL, 0, OPTAB_DIRECT);
17215 }
17216 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
17217 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
17218 NULL, 0, OPTAB_DIRECT);
17219 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
17220 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
17221 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
17222 else
17223 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
17224 }
17225
17226 /* Recognize patterns suitable for the TRN instructions. */
17227 static bool
17228 aarch64_evpc_trn (struct expand_vec_perm_d *d)
17229 {
17230 HOST_WIDE_INT odd;
17231 poly_uint64 nelt = d->perm.length ();
17232 rtx out, in0, in1, x;
17233 machine_mode vmode = d->vmode;
17234
17235 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17236 return false;
17237
17238 /* Note that these are little-endian tests.
17239 We correct for big-endian later. */
17240 if (!d->perm[0].is_constant (&odd)
17241 || (odd != 0 && odd != 1)
17242 || !d->perm.series_p (0, 2, odd, 2)
17243 || !d->perm.series_p (1, 2, nelt + odd, 2))
17244 return false;
17245
17246 /* Success! */
17247 if (d->testing_p)
17248 return true;
17249
17250 in0 = d->op0;
17251 in1 = d->op1;
17252 /* We don't need a big-endian lane correction for SVE; see the comment
17253 at the head of aarch64-sve.md for details. */
17254 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17255 {
17256 x = in0, in0 = in1, in1 = x;
17257 odd = !odd;
17258 }
17259 out = d->target;
17260
17261 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17262 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
17263 return true;
17264 }
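/* Editor's example (illustrative, not part of the original sources): for
   a pair of 4-element vectors, TRN1 corresponds to the vec_perm selector
   {0, 4, 2, 6} and TRN2 to {1, 5, 3, 7}; those are exactly the series
   recognised above.  A generic-vector permute that maps onto TRN1:  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
trn1_example (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) { 0, 4, 2, 6 });
}
#endif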
17265
17266 /* Recognize patterns suitable for the UZP instructions. */
17267 static bool
17268 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
17269 {
17270 HOST_WIDE_INT odd;
17271 rtx out, in0, in1, x;
17272 machine_mode vmode = d->vmode;
17273
17274 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17275 return false;
17276
17277 /* Note that these are little-endian tests.
17278 We correct for big-endian later. */
17279 if (!d->perm[0].is_constant (&odd)
17280 || (odd != 0 && odd != 1)
17281 || !d->perm.series_p (0, 1, odd, 2))
17282 return false;
17283
17284 /* Success! */
17285 if (d->testing_p)
17286 return true;
17287
17288 in0 = d->op0;
17289 in1 = d->op1;
17290 /* We don't need a big-endian lane correction for SVE; see the comment
17291 at the head of aarch64-sve.md for details. */
17292 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17293 {
17294 x = in0, in0 = in1, in1 = x;
17295 odd = !odd;
17296 }
17297 out = d->target;
17298
17299 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17300 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
17301 return true;
17302 }
17303
17304 /* Recognize patterns suitable for the ZIP instructions. */
17305 static bool
17306 aarch64_evpc_zip (struct expand_vec_perm_d *d)
17307 {
17308 unsigned int high;
17309 poly_uint64 nelt = d->perm.length ();
17310 rtx out, in0, in1, x;
17311 machine_mode vmode = d->vmode;
17312
17313 if (GET_MODE_UNIT_SIZE (vmode) > 8)
17314 return false;
17315
17316 /* Note that these are little-endian tests.
17317 We correct for big-endian later. */
17318 poly_uint64 first = d->perm[0];
17319 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
17320 || !d->perm.series_p (0, 2, first, 1)
17321 || !d->perm.series_p (1, 2, first + nelt, 1))
17322 return false;
17323 high = maybe_ne (first, 0U);
17324
17325 /* Success! */
17326 if (d->testing_p)
17327 return true;
17328
17329 in0 = d->op0;
17330 in1 = d->op1;
17331 /* We don't need a big-endian lane correction for SVE; see the comment
17332 at the head of aarch64-sve.md for details. */
17333 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
17334 {
17335 x = in0, in0 = in1, in1 = x;
17336 high = !high;
17337 }
17338 out = d->target;
17339
17340 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
17341 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
17342 return true;
17343 }
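/* Editor's example (illustrative, not part of the original sources): for
   4-element vectors, ZIP1 interleaves the low halves (selector
   {0, 4, 1, 5}) and ZIP2 the high halves (selector {2, 6, 3, 7}).  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
zip1_example (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) { 0, 4, 1, 5 });
}
#endif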
17344
17345 /* Recognize patterns for the EXT insn. */
17346
17347 static bool
17348 aarch64_evpc_ext (struct expand_vec_perm_d *d)
17349 {
17350 HOST_WIDE_INT location;
17351 rtx offset;
17352
17353 /* The first element always refers to the first vector.
17354 Check if the extracted indices are increasing by one. */
17355 if (d->vec_flags == VEC_SVE_PRED
17356 || !d->perm[0].is_constant (&location)
17357 || !d->perm.series_p (0, 1, location, 1))
17358 return false;
17359
17360 /* Success! */
17361 if (d->testing_p)
17362 return true;
17363
17364 /* The case where (location == 0) is a no-op for both big- and little-endian,
17365 and is removed by the mid-end at optimization levels -O1 and higher.
17366
17367 We don't need a big-endian lane correction for SVE; see the comment
17368 at the head of aarch64-sve.md for details. */
17369 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
17370 {
17371 /* After setup, we want the high elements of the first vector (stored
17372 at the LSB end of the register), and the low elements of the second
17373 vector (stored at the MSB end of the register). So swap. */
17374 std::swap (d->op0, d->op1);
17375 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
17376 to_constant () is safe since this is restricted to Advanced SIMD
17377 vectors. */
17378 location = d->perm.length ().to_constant () - location;
17379 }
17380
17381 offset = GEN_INT (location);
17382 emit_set_insn (d->target,
17383 gen_rtx_UNSPEC (d->vmode,
17384 gen_rtvec (3, d->op0, d->op1, offset),
17385 UNSPEC_EXT));
17386 return true;
17387 }
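/* Editor's example (illustrative, not part of the original sources): EXT
   extracts a sliding window from the concatenation of the two inputs, so
   the selector {1, 2, 3, 4} on 4-element vectors is a single EXT with
   location 1.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

v4si
ext_example (v4si a, v4si b)
{
  return __builtin_shuffle (a, b, (v4si) { 1, 2, 3, 4 });
}
#endif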
17388
17389 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
17390 within each 64-bit, 32-bit or 16-bit granule. */
17391
17392 static bool
17393 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
17394 {
17395 HOST_WIDE_INT diff;
17396 unsigned int i, size, unspec;
17397 machine_mode pred_mode;
17398
17399 if (d->vec_flags == VEC_SVE_PRED
17400 || !d->one_vector_p
17401 || !d->perm[0].is_constant (&diff))
17402 return false;
17403
17404 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
17405 if (size == 8)
17406 {
17407 unspec = UNSPEC_REV64;
17408 pred_mode = VNx2BImode;
17409 }
17410 else if (size == 4)
17411 {
17412 unspec = UNSPEC_REV32;
17413 pred_mode = VNx4BImode;
17414 }
17415 else if (size == 2)
17416 {
17417 unspec = UNSPEC_REV16;
17418 pred_mode = VNx8BImode;
17419 }
17420 else
17421 return false;
17422
17423 unsigned int step = diff + 1;
17424 for (i = 0; i < step; ++i)
17425 if (!d->perm.series_p (i, step, diff - i, step))
17426 return false;
17427
17428 /* Success! */
17429 if (d->testing_p)
17430 return true;
17431
17432 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
17433 if (d->vec_flags == VEC_SVE_DATA)
17434 {
17435 rtx pred = aarch64_ptrue_reg (pred_mode);
17436 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
17437 UNSPEC_MERGE_PTRUE);
17438 }
17439 emit_set_insn (d->target, src);
17440 return true;
17441 }
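/* Editor's example (illustrative, not part of the original sources):
   swapping adjacent 16-bit lanes within each 32-bit granule, selector
   {1, 0, 3, 2, 5, 4, 7, 6} on an 8 x 16-bit vector, matches the REV32
   case above (diff == 1, so size == 4).  */
#if 0
typedef short v8hi __attribute__ ((vector_size (16)));

v8hi
rev32_example (v8hi a)
{
  return __builtin_shuffle (a, (v8hi) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
#endif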
17442
17443 /* Recognize patterns for the REV insn, which reverses elements within
17444 a full vector. */
17445
17446 static bool
17447 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
17448 {
17449 poly_uint64 nelt = d->perm.length ();
17450
17451 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
17452 return false;
17453
17454 if (!d->perm.series_p (0, 1, nelt - 1, -1))
17455 return false;
17456
17457 /* Success! */
17458 if (d->testing_p)
17459 return true;
17460
17461 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
17462 emit_set_insn (d->target, src);
17463 return true;
17464 }
17465
17466 static bool
17467 aarch64_evpc_dup (struct expand_vec_perm_d *d)
17468 {
17469 rtx out = d->target;
17470 rtx in0;
17471 HOST_WIDE_INT elt;
17472 machine_mode vmode = d->vmode;
17473 rtx lane;
17474
17475 if (d->vec_flags == VEC_SVE_PRED
17476 || d->perm.encoding ().encoded_nelts () != 1
17477 || !d->perm[0].is_constant (&elt))
17478 return false;
17479
17480 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
17481 return false;
17482
17483 /* Success! */
17484 if (d->testing_p)
17485 return true;
17486
17487 /* The generic preparation in aarch64_expand_vec_perm_const_1
17488 swaps the operand order and the permute indices if it finds
17489 d->perm[0] to be in the second operand. Thus, we can always
17490 use d->op0 and need not do any extra arithmetic to get the
17491 correct lane number. */
17492 in0 = d->op0;
17493 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
17494
17495 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
17496 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
17497 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
17498 return true;
17499 }
17500
17501 static bool
17502 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
17503 {
17504 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
17505 machine_mode vmode = d->vmode;
17506
17507 /* Make sure that the indices are constant. */
17508 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
17509 for (unsigned int i = 0; i < encoded_nelts; ++i)
17510 if (!d->perm[i].is_constant ())
17511 return false;
17512
17513 if (d->testing_p)
17514 return true;
17515
17516 /* Generic code will try constant permutation twice. Once with the
17517 original mode and again with the elements lowered to QImode.
17518 So wait and don't do the selector expansion ourselves. */
17519 if (vmode != V8QImode && vmode != V16QImode)
17520 return false;
17521
17522 /* to_constant is safe since this routine is specific to Advanced SIMD
17523 vectors. */
17524 unsigned int nelt = d->perm.length ().to_constant ();
17525 for (unsigned int i = 0; i < nelt; ++i)
17526 /* If big-endian and two vectors, we end up with a weird mixed-endian
17527 mode on NEON. Reverse the index within each word but not the word
17528 itself. to_constant is safe because we checked is_constant above. */
17529 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
17530 ? d->perm[i].to_constant () ^ (nelt - 1)
17531 : d->perm[i].to_constant ());
17532
17533 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
17534 sel = force_reg (vmode, sel);
17535
17536 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
17537 return true;
17538 }
17539
17540 /* Try to implement D using an SVE TBL instruction. */
17541
17542 static bool
17543 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
17544 {
17545 unsigned HOST_WIDE_INT nelt;
17546
17547 /* Permuting two variable-length vectors could overflow the
17548 index range. */
17549 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
17550 return false;
17551
17552 if (d->testing_p)
17553 return true;
17554
17555 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
17556 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
17557 if (d->one_vector_p)
17558 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
17559 else
17560 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
17561 return true;
17562 }
17563
17564 static bool
17565 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
17566 {
17567 /* The pattern matching functions above are written to look for a small
17568 number to begin the sequence (0, 1, N/2). If we begin with an index
17569 from the second operand, we can swap the operands. */
17570 poly_int64 nelt = d->perm.length ();
17571 if (known_ge (d->perm[0], nelt))
17572 {
17573 d->perm.rotate_inputs (1);
17574 std::swap (d->op0, d->op1);
17575 }
17576
17577 if ((d->vec_flags == VEC_ADVSIMD
17578 || d->vec_flags == VEC_SVE_DATA
17579 || d->vec_flags == VEC_SVE_PRED)
17580 && known_gt (nelt, 1))
17581 {
17582 if (aarch64_evpc_rev_local (d))
17583 return true;
17584 else if (aarch64_evpc_rev_global (d))
17585 return true;
17586 else if (aarch64_evpc_ext (d))
17587 return true;
17588 else if (aarch64_evpc_dup (d))
17589 return true;
17590 else if (aarch64_evpc_zip (d))
17591 return true;
17592 else if (aarch64_evpc_uzp (d))
17593 return true;
17594 else if (aarch64_evpc_trn (d))
17595 return true;
17596 if (d->vec_flags == VEC_SVE_DATA)
17597 return aarch64_evpc_sve_tbl (d);
17598 else if (d->vec_flags == VEC_ADVSIMD)
17599 return aarch64_evpc_tbl (d);
17600 }
17601 return false;
17602 }
17603
17604 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
17605
17606 static bool
17607 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
17608 rtx op1, const vec_perm_indices &sel)
17609 {
17610 struct expand_vec_perm_d d;
17611
17612 /* Check whether the mask can be applied to a single vector. */
17613 if (sel.ninputs () == 1
17614 || (op0 && rtx_equal_p (op0, op1)))
17615 d.one_vector_p = true;
17616 else if (sel.all_from_input_p (0))
17617 {
17618 d.one_vector_p = true;
17619 op1 = op0;
17620 }
17621 else if (sel.all_from_input_p (1))
17622 {
17623 d.one_vector_p = true;
17624 op0 = op1;
17625 }
17626 else
17627 d.one_vector_p = false;
17628
17629 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
17630 sel.nelts_per_input ());
17631 d.vmode = vmode;
17632 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
17633 d.target = target;
17634 d.op0 = op0;
17635 d.op1 = op1;
17636 d.testing_p = !target;
17637
17638 if (!d.testing_p)
17639 return aarch64_expand_vec_perm_const_1 (&d);
17640
17641 rtx_insn *last = get_last_insn ();
17642 bool ret = aarch64_expand_vec_perm_const_1 (&d);
17643 gcc_assert (last == get_last_insn ());
17644
17645 return ret;
17646 }
17647
17648 /* Generate a byte permute mask for a register of mode MODE,
17649 which has NUNITS units. */
17650
17651 rtx
17652 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
17653 {
17654 /* We have to reverse each vector because we don't have
17655 a permuted load that can reverse-load according to ABI rules. */
17656 rtx mask;
17657 rtvec v = rtvec_alloc (16);
17658 unsigned int i, j;
17659 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
17660
17661 gcc_assert (BYTES_BIG_ENDIAN);
17662 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
17663
17664 for (i = 0; i < nunits; i++)
17665 for (j = 0; j < usize; j++)
17666 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
17667 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
17668 return force_reg (V16QImode, mask);
17669 }
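
/* For illustration (not exhaustive): with V4SImode (nunits == 4,
   usize == 4) the loop above builds the byte vector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. a TBL mask that byte-reverses each 32-bit element within the
   128-bit register.  */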
17670
17671 /* Return true if X is a valid second operand for the SVE instruction
17672 that implements integer comparison OP_CODE. */
17673
17674 static bool
17675 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
17676 {
17677 if (register_operand (x, VOIDmode))
17678 return true;
17679
17680 switch (op_code)
17681 {
17682 case LTU:
17683 case LEU:
17684 case GEU:
17685 case GTU:
17686 return aarch64_sve_cmp_immediate_p (x, false);
17687 case LT:
17688 case LE:
17689 case GE:
17690 case GT:
17691 case NE:
17692 case EQ:
17693 return aarch64_sve_cmp_immediate_p (x, true);
17694 default:
17695 gcc_unreachable ();
17696 }
17697 }
17698
17699 /* Use predicated SVE instructions to implement the equivalent of:
17700
17701 (set TARGET OP)
17702
17703 given that PTRUE is an all-true predicate of the appropriate mode. */
17704
17705 static void
17706 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
17707 {
17708 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17709 gen_rtvec (2, ptrue, op),
17710 UNSPEC_MERGE_PTRUE);
17711 rtx_insn *insn = emit_set_insn (target, unspec);
17712 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17713 }
17714
17715 /* Likewise, but also clobber the condition codes. */
17716
17717 static void
17718 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
17719 {
17720 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
17721 gen_rtvec (2, ptrue, op),
17722 UNSPEC_MERGE_PTRUE);
17723 rtx_insn *insn = emit_insn (gen_set_clobber_cc_nzc (target, unspec));
17724 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
17725 }
17726
17727 /* Return the UNSPEC_COND_* code for comparison CODE. */
17728
17729 static unsigned int
17730 aarch64_unspec_cond_code (rtx_code code)
17731 {
17732 switch (code)
17733 {
17734 case NE:
17735 return UNSPEC_COND_FCMNE;
17736 case EQ:
17737 return UNSPEC_COND_FCMEQ;
17738 case LT:
17739 return UNSPEC_COND_FCMLT;
17740 case GT:
17741 return UNSPEC_COND_FCMGT;
17742 case LE:
17743 return UNSPEC_COND_FCMLE;
17744 case GE:
17745 return UNSPEC_COND_FCMGE;
17746 default:
17747 gcc_unreachable ();
17748 }
17749 }
17750
17751 /* Emit:
17752
17753 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
17754
17755 where <X> is the operation associated with comparison CODE. This form
17756 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
17757 semantics, such as when PRED might not be all-true and when comparing
17758 inactive lanes could have side effects. */
17759
17760 static void
17761 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
17762 rtx pred, rtx op0, rtx op1)
17763 {
17764 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
17765 gen_rtvec (3, pred, op0, op1),
17766 aarch64_unspec_cond_code (code));
17767 emit_set_insn (target, unspec);
17768 }
17769
17770 /* Expand an SVE integer comparison using the SVE equivalent of:
17771
17772 (set TARGET (CODE OP0 OP1)). */
17773
17774 void
17775 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
17776 {
17777 machine_mode pred_mode = GET_MODE (target);
17778 machine_mode data_mode = GET_MODE (op0);
17779
17780 if (!aarch64_sve_cmp_operand_p (code, op1))
17781 op1 = force_reg (data_mode, op1);
17782
17783 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17784 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17785 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
17786 }
17787
17788 /* Emit the SVE equivalent of:
17789
17790 (set TMP1 (CODE1 OP0 OP1))
17791 (set TMP2 (CODE2 OP0 OP1))
17792 (set TARGET (ior:PRED_MODE TMP1 TMP2))
17793
17794 PTRUE is an all-true predicate with the same mode as TARGET. */
17795
17796 static void
17797 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
17798 rtx ptrue, rtx op0, rtx op1)
17799 {
17800 machine_mode pred_mode = GET_MODE (ptrue);
17801 rtx tmp1 = gen_reg_rtx (pred_mode);
17802 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
17803 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
17804 rtx tmp2 = gen_reg_rtx (pred_mode);
17805 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
17806 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
17807 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
17808 }
17809
17810 /* Emit the SVE equivalent of:
17811
17812 (set TMP (CODE OP0 OP1))
17813 (set TARGET (not TMP))
17814
17815 PTRUE is an all-true predicate with the same mode as TARGET. */
17816
17817 static void
17818 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
17819 rtx op0, rtx op1)
17820 {
17821 machine_mode pred_mode = GET_MODE (ptrue);
17822 rtx tmp = gen_reg_rtx (pred_mode);
17823 aarch64_emit_sve_ptrue_op (tmp, ptrue,
17824 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
17825 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17826 }
17827
17828 /* Expand an SVE floating-point comparison using the SVE equivalent of:
17829
17830 (set TARGET (CODE OP0 OP1))
17831
17832 If CAN_INVERT_P is true, the caller can also handle inverted results;
17833 return true if the result is in fact inverted. */
17834
17835 bool
17836 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
17837 rtx op0, rtx op1, bool can_invert_p)
17838 {
17839 machine_mode pred_mode = GET_MODE (target);
17840 machine_mode data_mode = GET_MODE (op0);
17841
17842 rtx ptrue = aarch64_ptrue_reg (pred_mode);
17843 switch (code)
17844 {
17845 case UNORDERED:
17846 /* UNORDERED has no immediate form. */
17847 op1 = force_reg (data_mode, op1);
17848 /* fall through */
17849 case LT:
17850 case LE:
17851 case GT:
17852 case GE:
17853 case EQ:
17854 case NE:
17855 {
17856 /* There is native support for the comparison. */
17857 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17858 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17859 return false;
17860 }
17861
17862 case LTGT:
17863 /* This is a trapping operation (LT or GT). */
17864 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
17865 return false;
17866
17867 case UNEQ:
17868 if (!flag_trapping_math)
17869 {
17870 /* This would trap for signaling NaNs. */
17871 op1 = force_reg (data_mode, op1);
17872 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
17873 return false;
17874 }
17875 /* fall through */
17876 case UNLT:
17877 case UNLE:
17878 case UNGT:
17879 case UNGE:
17880 if (flag_trapping_math)
17881 {
17882 /* Work out which elements are ordered. */
17883 rtx ordered = gen_reg_rtx (pred_mode);
17884 op1 = force_reg (data_mode, op1);
17885 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
17886
17887 /* Test the opposite condition for the ordered elements,
17888 then invert the result. */
17889 if (code == UNEQ)
17890 code = NE;
17891 else
17892 code = reverse_condition_maybe_unordered (code);
17893 if (can_invert_p)
17894 {
17895 aarch64_emit_sve_predicated_cond (target, code,
17896 ordered, op0, op1);
17897 return true;
17898 }
17899 rtx tmp = gen_reg_rtx (pred_mode);
17900 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
17901 aarch64_emit_unop (target, one_cmpl_optab, tmp);
17902 return false;
17903 }
17904 break;
17905
17906 case ORDERED:
17907 /* ORDERED has no immediate form. */
17908 op1 = force_reg (data_mode, op1);
17909 break;
17910
17911 default:
17912 gcc_unreachable ();
17913 }
17914
17915 /* There is native support for the inverse comparison. */
17916 code = reverse_condition_maybe_unordered (code);
17917 if (can_invert_p)
17918 {
17919 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
17920 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
17921 return true;
17922 }
17923 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
17924 return false;
17925 }
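
/* Worked example for the code above (illustrative): for UNGE with
   -ftrapping-math we first compute ORDERED = ~(OP0 unordered OP1) under
   the all-true predicate, then evaluate the reverse condition LT only on
   the ordered lanes.  If the caller accepts inverted results, that
   predicate is returned as-is (and we return true); otherwise it is
   inverted, giving the lanes that are either unordered or not less than,
   which is exactly UNGE.  */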
17926
17927 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
17928 of the data being selected and CMP_MODE is the mode of the values being
17929 compared. */
17930
17931 void
17932 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
17933 rtx *ops)
17934 {
17935 machine_mode pred_mode
17936 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
17937 GET_MODE_SIZE (cmp_mode)).require ();
17938 rtx pred = gen_reg_rtx (pred_mode);
17939 if (FLOAT_MODE_P (cmp_mode))
17940 {
17941 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
17942 ops[4], ops[5], true))
17943 std::swap (ops[1], ops[2]);
17944 }
17945 else
17946 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
17947
17948 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
17949 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
17950 }
17951
17952 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
17953 true. However, due to issues with register allocation it is preferable
17954 to avoid tying integer scalar and FP scalar modes. Executing integer
17955 operations in general registers is better than treating them as scalar
17956 vector operations. This reduces latency and avoids redundant int<->FP
17957 moves. So tie modes if they are either the same class, or vector modes
17958 with other vector modes, vector structs or any scalar mode. */
17959
17960 static bool
17961 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
17962 {
17963 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
17964 return true;
17965
17966 /* We specifically want to allow elements of "structure" modes to
17967 be tieable to the structure. This more general condition allows
17968 other rarer situations too. The reason we don't extend this to
17969 predicate modes is that there are no predicate structure modes
17970 nor any specific instructions for extracting part of a predicate
17971 register. */
17972 if (aarch64_vector_data_mode_p (mode1)
17973 && aarch64_vector_data_mode_p (mode2))
17974 return true;
17975
17976 /* Also allow any scalar modes with vectors. */
17977 if (aarch64_vector_mode_supported_p (mode1)
17978 || aarch64_vector_mode_supported_p (mode2))
17979 return true;
17980
17981 return false;
17982 }
17983
17984 /* Return a new RTX holding the result of moving POINTER forward by
17985 AMOUNT bytes. */
17986
17987 static rtx
17988 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17989 {
17990 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17991
17992 return adjust_automodify_address (pointer, GET_MODE (pointer),
17993 next, amount);
17994 }
17995
17996 /* Return a new RTX holding the result of moving POINTER forward by the
17997 size of the mode it points to. */
17998
17999 static rtx
18000 aarch64_progress_pointer (rtx pointer)
18001 {
18002 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
18003 }
18004
18005 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
18006 MODE bytes. */
18007
18008 static void
18009 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
18010 machine_mode mode)
18011 {
18012 rtx reg = gen_reg_rtx (mode);
18013
18014 /* "Cast" the pointers to the correct mode. */
18015 *src = adjust_address (*src, mode, 0);
18016 *dst = adjust_address (*dst, mode, 0);
18017 /* Emit the memcpy. */
18018 emit_move_insn (reg, *src);
18019 emit_move_insn (*dst, reg);
18020 /* Move the pointers forward. */
18021 *src = aarch64_progress_pointer (*src);
18022 *dst = aarch64_progress_pointer (*dst);
18023 }
18024
18025 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
18026 we succeed, otherwise return false. */
18027
18028 bool
18029 aarch64_expand_cpymem (rtx *operands)
18030 {
18031 int n, mode_bits;
18032 rtx dst = operands[0];
18033 rtx src = operands[1];
18034 rtx base;
18035 machine_mode cur_mode = BLKmode, next_mode;
18036 bool speed_p = !optimize_function_for_size_p (cfun);
18037
18038 /* When optimizing for size, give a better estimate of the length of a
18039 memcpy call, but use the default otherwise. Moves larger than 8 bytes
18040 will always require an even number of instructions, and each
18041 operation requires both a load and a store, so divide the max number by 2. */
18042 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
18043
18044 /* We can't do anything smart if the amount to copy is not constant. */
18045 if (!CONST_INT_P (operands[2]))
18046 return false;
18047
18048 n = INTVAL (operands[2]);
18049
18050 /* Try to keep the number of instructions low. For all cases we will do at
18051 most two moves for the residual amount, since we'll always overlap the
18052 remainder. */
18053 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
18054 return false;
18055
18056 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
18057 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
18058
18059 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
18060 src = adjust_automodify_address (src, VOIDmode, base, 0);
18061
18062 /* Convert n to bits to make the rest of the code simpler. */
18063 n = n * BITS_PER_UNIT;
18064
18065 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
18066 larger than TImode, but we should not use them for loads/stores here. */
18067 const int copy_limit = GET_MODE_BITSIZE (TImode);
18068
18069 while (n > 0)
18070 {
18071 /* Find the largest mode in which to do the copy without over-reading
18072 or over-writing. */
18073 opt_scalar_int_mode mode_iter;
18074 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
18075 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
18076 cur_mode = mode_iter.require ();
18077
18078 gcc_assert (cur_mode != BLKmode);
18079
18080 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
18081 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
18082
18083 n -= mode_bits;
18084
18085 /* Do certain trailing copies as overlapping if it's going to be
18086 cheaper, i.e. if it takes fewer instructions. For instance, for a
18087 15-byte copy it's more efficient to do two overlapping 8-byte copies
18088 than 8 + 4 + 2 + 1. */
18089 if (n > 0 && n <= 8 * BITS_PER_UNIT)
18090 {
18091 next_mode = smallest_mode_for_size (n, MODE_INT);
18092 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
18093 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
18094 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
18095 n = n_bits;
18096 }
18097 }
18098
18099 return true;
18100 }
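
/* Worked example (illustrative): for a 15-byte copy, n starts at 120 bits
   and copy_limit is 128, so the first iteration picks DImode and copies
   8 bytes, leaving n == 56 bits.  smallest_mode_for_size then selects
   DImode again and the pointers are moved back by one byte, so the final
   8-byte copy overlaps the first: two load/store pairs in total instead
   of separate 8-, 4-, 2- and 1-byte copies.  */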
18101
18102 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
18103 SImode stores. Handle the case when the constant has identical
18104 bottom and top halves. This is beneficial when the two stores can be
18105 merged into an STP and we avoid synthesising potentially expensive
18106 immediates twice. Return true if such a split is possible. */
18107
18108 bool
18109 aarch64_split_dimode_const_store (rtx dst, rtx src)
18110 {
18111 rtx lo = gen_lowpart (SImode, src);
18112 rtx hi = gen_highpart_mode (SImode, DImode, src);
18113
18114 bool size_p = optimize_function_for_size_p (cfun);
18115
18116 if (!rtx_equal_p (lo, hi))
18117 return false;
18118
18119 unsigned int orig_cost
18120 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
18121 unsigned int lo_cost
18122 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
18123
18124 /* We want to transform:
18125 MOV x1, 49370
18126 MOVK x1, 0x140, lsl 16
18127 MOVK x1, 0xc0da, lsl 32
18128 MOVK x1, 0x140, lsl 48
18129 STR x1, [x0]
18130 into:
18131 MOV w1, 49370
18132 MOVK w1, 0x140, lsl 16
18133 STP w1, w1, [x0]
18134 So we want to perform this only when we save two instructions
18135 or more. When optimizing for size, however, accept any code size
18136 savings we can. */
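/* As a worked illustration of the checks below, using the immediates in
   the comment above: orig_cost is 4 (MOV plus three MOVKs) and lo_cost
   is 2 (MOV plus one MOVK), so when optimizing for speed the split is
   accepted because 4 > lo_cost + 1; an orig_cost of 3 would be
   rejected.  */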
18137 if (size_p && orig_cost <= lo_cost)
18138 return false;
18139
18140 if (!size_p
18141 && (orig_cost <= lo_cost + 1))
18142 return false;
18143
18144 rtx mem_lo = adjust_address (dst, SImode, 0);
18145 if (!aarch64_mem_pair_operand (mem_lo, SImode))
18146 return false;
18147
18148 rtx tmp_reg = gen_reg_rtx (SImode);
18149 aarch64_expand_mov_immediate (tmp_reg, lo);
18150 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
18151 /* Don't emit an explicit store pair as this may not always be profitable.
18152 Let the sched-fusion logic decide whether to merge them. */
18153 emit_move_insn (mem_lo, tmp_reg);
18154 emit_move_insn (mem_hi, tmp_reg);
18155
18156 return true;
18157 }
18158
18159 /* Generate RTL for a conditional branch with rtx comparison CODE in
18160 mode CC_MODE. The destination of the unlikely conditional branch
18161 is LABEL_REF. */
18162
18163 void
18164 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
18165 rtx label_ref)
18166 {
18167 rtx x;
18168 x = gen_rtx_fmt_ee (code, VOIDmode,
18169 gen_rtx_REG (cc_mode, CC_REGNUM),
18170 const0_rtx);
18171
18172 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18173 gen_rtx_LABEL_REF (VOIDmode, label_ref),
18174 pc_rtx);
18175 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18176 }
18177
18178 /* Generate DImode scratch registers for 128-bit (TImode) addition.
18179
18180 OP1 represents the TImode destination operand 1
18181 OP2 represents the TImode destination operand 2
18182 LOW_DEST represents the low half (DImode) of TImode operand 0
18183 LOW_IN1 represents the low half (DImode) of TImode operand 1
18184 LOW_IN2 represents the low half (DImode) of TImode operand 2
18185 HIGH_DEST represents the high half (DImode) of TImode operand 0
18186 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18187 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18188
18189 void
18190 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18191 rtx *low_in1, rtx *low_in2,
18192 rtx *high_dest, rtx *high_in1,
18193 rtx *high_in2)
18194 {
18195 *low_dest = gen_reg_rtx (DImode);
18196 *low_in1 = gen_lowpart (DImode, op1);
18197 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18198 subreg_lowpart_offset (DImode, TImode));
18199 *high_dest = gen_reg_rtx (DImode);
18200 *high_in1 = gen_highpart (DImode, op1);
18201 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18202 subreg_highpart_offset (DImode, TImode));
18203 }
18204
18205 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
18206
18207 This function differs from 'aarch64_addti_scratch_regs' in that
18208 OP1 can be an immediate constant (zero). We must call
18209 subreg_highpart_offset with DImode and TImode arguments, otherwise
18210 VOIDmode will be used for the const_int which generates an internal
18211 error from subreg_size_highpart_offset which does not expect a size of zero.
18212
18213 OP1 represents the TImode destination operand 1
18214 OP2 represents the TImode destination operand 2
18215 LOW_DEST represents the low half (DImode) of TImode operand 0
18216 LOW_IN1 represents the low half (DImode) of TImode operand 1
18217 LOW_IN2 represents the low half (DImode) of TImode operand 2
18218 HIGH_DEST represents the high half (DImode) of TImode operand 0
18219 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18220 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
18221
18222
18223 void
18224 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
18225 rtx *low_in1, rtx *low_in2,
18226 rtx *high_dest, rtx *high_in1,
18227 rtx *high_in2)
18228 {
18229 *low_dest = gen_reg_rtx (DImode);
18230 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
18231 subreg_lowpart_offset (DImode, TImode));
18232
18233 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
18234 subreg_lowpart_offset (DImode, TImode));
18235 *high_dest = gen_reg_rtx (DImode);
18236
18237 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
18238 subreg_highpart_offset (DImode, TImode));
18239 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
18240 subreg_highpart_offset (DImode, TImode));
18241 }
18242
18243 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
18244
18245 OP0 represents the TImode destination operand 0
18246 LOW_DEST represents the low half (DImode) of TImode operand 0
18247 LOW_IN1 represents the low half (DImode) of TImode operand 1
18248 LOW_IN2 represents the low half (DImode) of TImode operand 2
18249 HIGH_DEST represents the high half (DImode) of TImode operand 0
18250 HIGH_IN1 represents the high half (DImode) of TImode operand 1
18251 HIGH_IN2 represents the high half (DImode) of TImode operand 2
18252 UNSIGNED_P is true if the operation is being performed on unsigned
18253 values. */
18254 void
18255 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
18256 rtx low_in2, rtx high_dest, rtx high_in1,
18257 rtx high_in2, bool unsigned_p)
18258 {
18259 if (low_in2 == const0_rtx)
18260 {
18261 low_dest = low_in1;
18262 high_in2 = force_reg (DImode, high_in2);
18263 if (unsigned_p)
18264 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
18265 else
18266 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
18267 }
18268 else
18269 {
18270 if (CONST_INT_P (low_in2))
18271 {
18272 high_in2 = force_reg (DImode, high_in2);
18273 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
18274 GEN_INT (-INTVAL (low_in2))));
18275 }
18276 else
18277 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
18278
18279 if (unsigned_p)
18280 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
18281 else
18282 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
18283 }
18284
18285 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
18286 emit_move_insn (gen_highpart (DImode, op0), high_dest);
18287
18288 }
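
/* Illustrative output for the general (register) case above: the
   expansion corresponds roughly to

       subs  xlo_dest, xlo_in1, xlo_in2
       sbcs  xhi_dest, xhi_in1, xhi_in2

   where the SBCS consumes the borrow produced by the SUBS and sets the
   flags that the caller's overflow check reads (V for signed, C for
   unsigned).  The register names are purely illustrative.  */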
18289
18290 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
18291
18292 static unsigned HOST_WIDE_INT
18293 aarch64_asan_shadow_offset (void)
18294 {
18295 if (TARGET_ILP32)
18296 return (HOST_WIDE_INT_1 << 29);
18297 else
18298 return (HOST_WIDE_INT_1 << 36);
18299 }
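
/* For reference: libsanitizer maps an address A to its shadow byte at
   roughly (A >> 3) + offset, so the constants above are that offset,
   1<<29 for ILP32 and 1<<36 for LP64.  */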
18300
18301 static rtx
18302 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
18303 int code, tree treeop0, tree treeop1)
18304 {
18305 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18306 rtx op0, op1;
18307 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18308 insn_code icode;
18309 struct expand_operand ops[4];
18310
18311 start_sequence ();
18312 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18313
18314 op_mode = GET_MODE (op0);
18315 if (op_mode == VOIDmode)
18316 op_mode = GET_MODE (op1);
18317
18318 switch (op_mode)
18319 {
18320 case E_QImode:
18321 case E_HImode:
18322 case E_SImode:
18323 cmp_mode = SImode;
18324 icode = CODE_FOR_cmpsi;
18325 break;
18326
18327 case E_DImode:
18328 cmp_mode = DImode;
18329 icode = CODE_FOR_cmpdi;
18330 break;
18331
18332 case E_SFmode:
18333 cmp_mode = SFmode;
18334 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18335 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
18336 break;
18337
18338 case E_DFmode:
18339 cmp_mode = DFmode;
18340 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
18341 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
18342 break;
18343
18344 default:
18345 end_sequence ();
18346 return NULL_RTX;
18347 }
18348
18349 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
18350 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
18351 if (!op0 || !op1)
18352 {
18353 end_sequence ();
18354 return NULL_RTX;
18355 }
18356 *prep_seq = get_insns ();
18357 end_sequence ();
18358
18359 create_fixed_operand (&ops[0], op0);
18360 create_fixed_operand (&ops[1], op1);
18361
18362 start_sequence ();
18363 if (!maybe_expand_insn (icode, 2, ops))
18364 {
18365 end_sequence ();
18366 return NULL_RTX;
18367 }
18368 *gen_seq = get_insns ();
18369 end_sequence ();
18370
18371 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
18372 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
18373 }
18374
18375 static rtx
18376 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
18377 int cmp_code, tree treeop0, tree treeop1, int bit_code)
18378 {
18379 rtx op0, op1, target;
18380 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
18381 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
18382 insn_code icode;
18383 struct expand_operand ops[6];
18384 int aarch64_cond;
18385
18386 push_to_sequence (*prep_seq);
18387 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
18388
18389 op_mode = GET_MODE (op0);
18390 if (op_mode == VOIDmode)
18391 op_mode = GET_MODE (op1);
18392
18393 switch (op_mode)
18394 {
18395 case E_QImode:
18396 case E_HImode:
18397 case E_SImode:
18398 cmp_mode = SImode;
18399 icode = CODE_FOR_ccmpsi;
18400 break;
18401
18402 case E_DImode:
18403 cmp_mode = DImode;
18404 icode = CODE_FOR_ccmpdi;
18405 break;
18406
18407 case E_SFmode:
18408 cmp_mode = SFmode;
18409 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18410 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
18411 break;
18412
18413 case E_DFmode:
18414 cmp_mode = DFmode;
18415 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
18416 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
18417 break;
18418
18419 default:
18420 end_sequence ();
18421 return NULL_RTX;
18422 }
18423
18424 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
18425 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
18426 if (!op0 || !op1)
18427 {
18428 end_sequence ();
18429 return NULL_RTX;
18430 }
18431 *prep_seq = get_insns ();
18432 end_sequence ();
18433
18434 target = gen_rtx_REG (cc_mode, CC_REGNUM);
18435 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
18436
18437 if (bit_code != AND)
18438 {
18439 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
18440 GET_MODE (XEXP (prev, 0))),
18441 VOIDmode, XEXP (prev, 0), const0_rtx);
18442 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
18443 }
18444
18445 create_fixed_operand (&ops[0], XEXP (prev, 0));
18446 create_fixed_operand (&ops[1], target);
18447 create_fixed_operand (&ops[2], op0);
18448 create_fixed_operand (&ops[3], op1);
18449 create_fixed_operand (&ops[4], prev);
18450 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
18451
18452 push_to_sequence (*gen_seq);
18453 if (!maybe_expand_insn (icode, 6, ops))
18454 {
18455 end_sequence ();
18456 return NULL_RTX;
18457 }
18458
18459 *gen_seq = get_insns ();
18460 end_sequence ();
18461
18462 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
18463 }
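
/* For illustration, a chained comparison such as

       if (x == 3 && y > 5)

   is expanded through these two hooks into something like

       cmp   w0, 3
       ccmp  w1, 5, 4, eq
       b.gt  ...

   where the CCMP immediate (here 4, i.e. "Z set") supplies a flag state
   that makes the final condition fail whenever the first comparison is
   false.  Register numbers and the exact flag immediate are
   illustrative.  */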
18464
18465 #undef TARGET_GEN_CCMP_FIRST
18466 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
18467
18468 #undef TARGET_GEN_CCMP_NEXT
18469 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
18470
18471 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
18472 instruction fusion of some sort. */
18473
18474 static bool
18475 aarch64_macro_fusion_p (void)
18476 {
18477 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
18478 }
18479
18480
18481 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
18482 should be kept together during scheduling. */
18483
18484 static bool
18485 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
18486 {
18487 rtx set_dest;
18488 rtx prev_set = single_set (prev);
18489 rtx curr_set = single_set (curr);
18490 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
18491 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
18492
18493 if (!aarch64_macro_fusion_p ())
18494 return false;
18495
18496 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
18497 {
18498 /* We are trying to match:
18499 prev (mov) == (set (reg r0) (const_int imm16))
18500 curr (movk) == (set (zero_extract (reg r0)
18501 (const_int 16)
18502 (const_int 16))
18503 (const_int imm16_1)) */
18504
18505 set_dest = SET_DEST (curr_set);
18506
18507 if (GET_CODE (set_dest) == ZERO_EXTRACT
18508 && CONST_INT_P (SET_SRC (curr_set))
18509 && CONST_INT_P (SET_SRC (prev_set))
18510 && CONST_INT_P (XEXP (set_dest, 2))
18511 && INTVAL (XEXP (set_dest, 2)) == 16
18512 && REG_P (XEXP (set_dest, 0))
18513 && REG_P (SET_DEST (prev_set))
18514 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
18515 {
18516 return true;
18517 }
18518 }
18519
18520 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
18521 {
18522
18523 /* We're trying to match:
18524 prev (adrp) == (set (reg r1)
18525 (high (symbol_ref ("SYM"))))
18526 curr (add) == (set (reg r0)
18527 (lo_sum (reg r1)
18528 (symbol_ref ("SYM"))))
18529 Note that r0 need not necessarily be the same as r1, especially
18530 during pre-regalloc scheduling. */
18531
18532 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18533 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18534 {
18535 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
18536 && REG_P (XEXP (SET_SRC (curr_set), 0))
18537 && REGNO (XEXP (SET_SRC (curr_set), 0))
18538 == REGNO (SET_DEST (prev_set))
18539 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
18540 XEXP (SET_SRC (curr_set), 1)))
18541 return true;
18542 }
18543 }
18544
18545 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
18546 {
18547
18548 /* We're trying to match:
18549 prev (movk) == (set (zero_extract (reg r0)
18550 (const_int 16)
18551 (const_int 32))
18552 (const_int imm16_1))
18553 curr (movk) == (set (zero_extract (reg r0)
18554 (const_int 16)
18555 (const_int 48))
18556 (const_int imm16_2)) */
18557
18558 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
18559 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
18560 && REG_P (XEXP (SET_DEST (prev_set), 0))
18561 && REG_P (XEXP (SET_DEST (curr_set), 0))
18562 && REGNO (XEXP (SET_DEST (prev_set), 0))
18563 == REGNO (XEXP (SET_DEST (curr_set), 0))
18564 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
18565 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
18566 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
18567 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
18568 && CONST_INT_P (SET_SRC (prev_set))
18569 && CONST_INT_P (SET_SRC (curr_set)))
18570 return true;
18571
18572 }
18573 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
18574 {
18575 /* We're trying to match:
18576 prev (adrp) == (set (reg r0)
18577 (high (symbol_ref ("SYM"))))
18578 curr (ldr) == (set (reg r1)
18579 (mem (lo_sum (reg r0)
18580 (symbol_ref ("SYM")))))
18581 or
18582 curr (ldr) == (set (reg r1)
18583 (zero_extend (mem
18584 (lo_sum (reg r0)
18585 (symbol_ref ("SYM")))))) */
18586 if (satisfies_constraint_Ush (SET_SRC (prev_set))
18587 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
18588 {
18589 rtx curr_src = SET_SRC (curr_set);
18590
18591 if (GET_CODE (curr_src) == ZERO_EXTEND)
18592 curr_src = XEXP (curr_src, 0);
18593
18594 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
18595 && REG_P (XEXP (XEXP (curr_src, 0), 0))
18596 && REGNO (XEXP (XEXP (curr_src, 0), 0))
18597 == REGNO (SET_DEST (prev_set))
18598 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
18599 XEXP (SET_SRC (prev_set), 0)))
18600 return true;
18601 }
18602 }
18603
18604 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
18605 && any_condjump_p (curr))
18606 {
18607 unsigned int condreg1, condreg2;
18608 rtx cc_reg_1;
18609 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
18610 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
18611
18612 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
18613 && prev
18614 && modified_in_p (cc_reg_1, prev))
18615 {
18616 enum attr_type prev_type = get_attr_type (prev);
18617
18618 /* FIXME: this misses some instructions that ThunderX considers simple
18619 arithmetic instructions; simple shifts are missed here. */
18620 if (prev_type == TYPE_ALUS_SREG
18621 || prev_type == TYPE_ALUS_IMM
18622 || prev_type == TYPE_LOGICS_REG
18623 || prev_type == TYPE_LOGICS_IMM)
18624 return true;
18625 }
18626 }
18627
18628 if (prev_set
18629 && curr_set
18630 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
18631 && any_condjump_p (curr))
18632 {
18633 /* We're trying to match:
18634 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
18635 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
18636 (const_int 0))
18637 (label_ref ("SYM"))
18638 (pc)) */
18639 if (SET_DEST (curr_set) == (pc_rtx)
18640 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
18641 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
18642 && REG_P (SET_DEST (prev_set))
18643 && REGNO (SET_DEST (prev_set))
18644 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
18645 {
18646 /* Fuse ALU operations followed by conditional branch instruction. */
18647 switch (get_attr_type (prev))
18648 {
18649 case TYPE_ALU_IMM:
18650 case TYPE_ALU_SREG:
18651 case TYPE_ADC_REG:
18652 case TYPE_ADC_IMM:
18653 case TYPE_ADCS_REG:
18654 case TYPE_ADCS_IMM:
18655 case TYPE_LOGIC_REG:
18656 case TYPE_LOGIC_IMM:
18657 case TYPE_CSEL:
18658 case TYPE_ADR:
18659 case TYPE_MOV_IMM:
18660 case TYPE_SHIFT_REG:
18661 case TYPE_SHIFT_IMM:
18662 case TYPE_BFM:
18663 case TYPE_RBIT:
18664 case TYPE_REV:
18665 case TYPE_EXTEND:
18666 return true;
18667
18668 default:;
18669 }
18670 }
18671 }
18672
18673 return false;
18674 }
18675
18676 /* Return true iff the instruction fusion described by OP is enabled. */
18677
18678 bool
18679 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
18680 {
18681 return (aarch64_tune_params.fusible_ops & op) != 0;
18682 }
18683
18684 /* If MEM is in the form of [base+offset], extract the two parts
18685 of the address into BASE and OFFSET; otherwise return false
18686 after clearing BASE and OFFSET. */
18687
18688 bool
18689 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
18690 {
18691 rtx addr;
18692
18693 gcc_assert (MEM_P (mem));
18694
18695 addr = XEXP (mem, 0);
18696
18697 if (REG_P (addr))
18698 {
18699 *base = addr;
18700 *offset = const0_rtx;
18701 return true;
18702 }
18703
18704 if (GET_CODE (addr) == PLUS
18705 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
18706 {
18707 *base = XEXP (addr, 0);
18708 *offset = XEXP (addr, 1);
18709 return true;
18710 }
18711
18712 *base = NULL_RTX;
18713 *offset = NULL_RTX;
18714
18715 return false;
18716 }
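
/* For example, (mem (plus (reg x1) (const_int 16))) gives BASE == x1 and
   OFFSET == 16, whereas register-indexed or pre/post-modify addresses
   fail the match and clear both outputs.  */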
18717
18718 /* Types for scheduling fusion. */
18719 enum sched_fusion_type
18720 {
18721 SCHED_FUSION_NONE = 0,
18722 SCHED_FUSION_LD_SIGN_EXTEND,
18723 SCHED_FUSION_LD_ZERO_EXTEND,
18724 SCHED_FUSION_LD,
18725 SCHED_FUSION_ST,
18726 SCHED_FUSION_NUM
18727 };
18728
18729 /* If INSN is a load or store whose address has the form [base+offset],
18730 extract the two parts into BASE and OFFSET. Return the scheduling
18731 fusion type of this INSN. */
18732
18733 static enum sched_fusion_type
18734 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
18735 {
18736 rtx x, dest, src;
18737 enum sched_fusion_type fusion = SCHED_FUSION_LD;
18738
18739 gcc_assert (INSN_P (insn));
18740 x = PATTERN (insn);
18741 if (GET_CODE (x) != SET)
18742 return SCHED_FUSION_NONE;
18743
18744 src = SET_SRC (x);
18745 dest = SET_DEST (x);
18746
18747 machine_mode dest_mode = GET_MODE (dest);
18748
18749 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
18750 return SCHED_FUSION_NONE;
18751
18752 if (GET_CODE (src) == SIGN_EXTEND)
18753 {
18754 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
18755 src = XEXP (src, 0);
18756 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18757 return SCHED_FUSION_NONE;
18758 }
18759 else if (GET_CODE (src) == ZERO_EXTEND)
18760 {
18761 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
18762 src = XEXP (src, 0);
18763 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
18764 return SCHED_FUSION_NONE;
18765 }
18766
18767 if (GET_CODE (src) == MEM && REG_P (dest))
18768 extract_base_offset_in_addr (src, base, offset);
18769 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
18770 {
18771 fusion = SCHED_FUSION_ST;
18772 extract_base_offset_in_addr (dest, base, offset);
18773 }
18774 else
18775 return SCHED_FUSION_NONE;
18776
18777 if (*base == NULL_RTX || *offset == NULL_RTX)
18778 fusion = SCHED_FUSION_NONE;
18779
18780 return fusion;
18781 }
18782
18783 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
18784
18785 Currently we only support fusing ldr and str instructions, so FUSION_PRI
18786 and PRI are only calculated for these instructions. For other instructions,
18787 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
18788 types of instruction fusion can be added by returning different priorities.
18789
18790 It's important that irrelevant instructions get the largest FUSION_PRI. */
18791
18792 static void
18793 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
18794 int *fusion_pri, int *pri)
18795 {
18796 int tmp, off_val;
18797 rtx base, offset;
18798 enum sched_fusion_type fusion;
18799
18800 gcc_assert (INSN_P (insn));
18801
18802 tmp = max_pri - 1;
18803 fusion = fusion_load_store (insn, &base, &offset);
18804 if (fusion == SCHED_FUSION_NONE)
18805 {
18806 *pri = tmp;
18807 *fusion_pri = tmp;
18808 return;
18809 }
18810
18811 /* Set FUSION_PRI according to fusion type and base register. */
18812 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
18813
18814 /* Calculate PRI. */
18815 tmp /= 2;
18816
18817 /* INSN with smaller offset goes first. */
18818 off_val = (int)(INTVAL (offset));
18819 if (off_val >= 0)
18820 tmp -= (off_val & 0xfffff);
18821 else
18822 tmp += ((- off_val) & 0xfffff);
18823
18824 *pri = tmp;
18825 return;
18826 }
18827
18828 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
18829 Adjust priority of sha1h instructions so they are scheduled before
18830 other SHA1 instructions. */
18831
18832 static int
18833 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
18834 {
18835 rtx x = PATTERN (insn);
18836
18837 if (GET_CODE (x) == SET)
18838 {
18839 x = SET_SRC (x);
18840
18841 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
18842 return priority + 10;
18843 }
18844
18845 return priority;
18846 }
18847
18848 /* Given OPERANDS of consecutive load/store, check if we can merge
18849 them into ldp/stp. LOAD is true if they are load instructions.
18850 MODE is the mode of memory operands. */
18851
18852 bool
18853 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
18854 machine_mode mode)
18855 {
18856 HOST_WIDE_INT offval_1, offval_2, msize;
18857 enum reg_class rclass_1, rclass_2;
18858 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
18859
18860 if (load)
18861 {
18862 mem_1 = operands[1];
18863 mem_2 = operands[3];
18864 reg_1 = operands[0];
18865 reg_2 = operands[2];
18866 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
18867 if (REGNO (reg_1) == REGNO (reg_2))
18868 return false;
18869 }
18870 else
18871 {
18872 mem_1 = operands[0];
18873 mem_2 = operands[2];
18874 reg_1 = operands[1];
18875 reg_2 = operands[3];
18876 }
18877
18878 /* The mems cannot be volatile. */
18879 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
18880 return false;
18881
18882 /* If we have SImode and slow unaligned ldp,
18883 check that the alignment is at least 8 bytes. */
18884 if (mode == SImode
18885 && (aarch64_tune_params.extra_tuning_flags
18886 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18887 && !optimize_size
18888 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
18889 return false;
18890
18891 /* Check if the addresses are in the form of [base+offset]. */
18892 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18893 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
18894 return false;
18895 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18896 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
18897 return false;
18898
18899 /* Check if the bases are the same. */
18900 if (!rtx_equal_p (base_1, base_2))
18901 return false;
18902
18903 /* The operands must be of the same size. */
18904 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
18905 GET_MODE_SIZE (GET_MODE (mem_2))));
18906
18907 offval_1 = INTVAL (offset_1);
18908 offval_2 = INTVAL (offset_2);
18909 /* We should only be trying this for fixed-sized modes. There is no
18910 SVE LDP/STP instruction. */
18911 msize = GET_MODE_SIZE (mode).to_constant ();
18912 /* Check if the offsets are consecutive. */
18913 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
18914 return false;
18915
18916 /* Check if the addresses are clobbered by load. */
18917 if (load)
18918 {
18919 if (reg_mentioned_p (reg_1, mem_1))
18920 return false;
18921
18922 /* In increasing order, the last load can clobber the address. */
18923 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
18924 return false;
18925 }
18926
18927 /* One of the memory accesses must be a mempair operand.
18928 If it is not the first one, they need to be swapped by the
18929 peephole. */
18930 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
18931 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
18932 return false;
18933
18934 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
18935 rclass_1 = FP_REGS;
18936 else
18937 rclass_1 = GENERAL_REGS;
18938
18939 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
18940 rclass_2 = FP_REGS;
18941 else
18942 rclass_2 = GENERAL_REGS;
18943
18944 /* Check if the registers are of the same class. */
18945 if (rclass_1 != rclass_2)
18946 return false;
18947
18948 return true;
18949 }
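
/* For example (register names illustrative), the pair

       ldr  x0, [x2, 8]
       ldr  x1, [x2, 16]

   passes these checks: same base, consecutive 8-byte offsets, neither
   destination mentioned in an address, and both registers in
   GENERAL_REGS, so it can become ldp x0, x1, [x2, 8].  */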
18950
18951 /* Given OPERANDS of consecutive load/store that can be merged,
18952 swap them if they are not in ascending order. */
18953 void
18954 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
18955 {
18956 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
18957 HOST_WIDE_INT offval_1, offval_2;
18958
18959 if (load)
18960 {
18961 mem_1 = operands[1];
18962 mem_2 = operands[3];
18963 }
18964 else
18965 {
18966 mem_1 = operands[0];
18967 mem_2 = operands[2];
18968 }
18969
18970 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
18971 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
18972
18973 offval_1 = INTVAL (offset_1);
18974 offval_2 = INTVAL (offset_2);
18975
18976 if (offval_1 > offval_2)
18977 {
18978 /* Irrespective of whether this is a load or a store,
18979 we do the same swap. */
18980 std::swap (operands[0], operands[2]);
18981 std::swap (operands[1], operands[3]);
18982 }
18983 }
18984
18985 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18986 comparison between the two. */
18987 int
18988 aarch64_host_wide_int_compare (const void *x, const void *y)
18989 {
18990 return wi::cmps (* ((const HOST_WIDE_INT *) x),
18991 * ((const HOST_WIDE_INT *) y));
18992 }
18993
18994 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18995 other pointing to a REG rtx containing an offset, compare the offsets
18996 of the two pairs.
18997
18998 Return:
18999
19000 1 iff offset (X) > offset (Y)
19001 0 iff offset (X) == offset (Y)
19002 -1 iff offset (X) < offset (Y) */
19003 int
19004 aarch64_ldrstr_offset_compare (const void *x, const void *y)
19005 {
19006 const rtx * operands_1 = (const rtx *) x;
19007 const rtx * operands_2 = (const rtx *) y;
19008 rtx mem_1, mem_2, base, offset_1, offset_2;
19009
19010 if (MEM_P (operands_1[0]))
19011 mem_1 = operands_1[0];
19012 else
19013 mem_1 = operands_1[1];
19014
19015 if (MEM_P (operands_2[0]))
19016 mem_2 = operands_2[0];
19017 else
19018 mem_2 = operands_2[1];
19019
19020 /* Extract the offsets. */
19021 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19022 extract_base_offset_in_addr (mem_2, &base, &offset_2);
19023
19024 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
19025
19026 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
19027 }
19028
19029 /* Given OPERANDS of consecutive load/store, check if we can merge
19030 them into ldp/stp by adjusting the offset. LOAD is true if they
19031 are load instructions. MODE is the mode of memory operands.
19032
19033 Given below consecutive stores:
19034
19035 str w1, [xb, 0x100]
19036 str w1, [xb, 0x104]
19037 str w1, [xb, 0x108]
19038 str w1, [xb, 0x10c]
19039
19040 Though the offsets are out of the range supported by stp, we can
19041 still pair them after adjusting the offset, like:
19042
19043 add scratch, xb, 0x100
19044 stp w1, w1, [scratch]
19045 stp w1, w1, [scratch, 0x8]
19046
19047 The peephole patterns detecting this opportunity should guarantee
19048 the scratch register is available. */
19049
19050 bool
19051 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
19052 scalar_mode mode)
19053 {
19054 const int num_insns = 4;
19055 enum reg_class rclass;
19056 HOST_WIDE_INT offvals[num_insns], msize;
19057 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
19058
19059 if (load)
19060 {
19061 for (int i = 0; i < num_insns; i++)
19062 {
19063 reg[i] = operands[2 * i];
19064 mem[i] = operands[2 * i + 1];
19065
19066 gcc_assert (REG_P (reg[i]));
19067 }
19068
19069 /* Do not attempt to merge the loads if the loads clobber each other. */
19070 for (int i = 0; i < 8; i += 2)
19071 for (int j = i + 2; j < 8; j += 2)
19072 if (reg_overlap_mentioned_p (operands[i], operands[j]))
19073 return false;
19074 }
19075 else
19076 for (int i = 0; i < num_insns; i++)
19077 {
19078 mem[i] = operands[2 * i];
19079 reg[i] = operands[2 * i + 1];
19080 }
19081
19082 /* Skip if memory operand is by itself valid for ldp/stp. */
19083 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
19084 return false;
19085
19086 for (int i = 0; i < num_insns; i++)
19087 {
19088 /* The mems cannot be volatile. */
19089 if (MEM_VOLATILE_P (mem[i]))
19090 return false;
19091
19092 /* Check if the addresses are in the form of [base+offset]. */
19093 extract_base_offset_in_addr (mem[i], base + i, offset + i);
19094 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
19095 return false;
19096 }
19097
19098 /* Check if the registers are of the same class. */
19099 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
19100 ? FP_REGS : GENERAL_REGS;
19101
19102 for (int i = 1; i < num_insns; i++)
19103 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
19104 {
19105 if (rclass != FP_REGS)
19106 return false;
19107 }
19108 else
19109 {
19110 if (rclass != GENERAL_REGS)
19111 return false;
19112 }
19113
19114 /* Only the last register in the order in which they occur
19115 may be clobbered by the load. */
19116 if (rclass == GENERAL_REGS && load)
19117 for (int i = 0; i < num_insns - 1; i++)
19118 if (reg_mentioned_p (reg[i], mem[i]))
19119 return false;
19120
19121 /* Check if the bases are the same. */
19122 for (int i = 0; i < num_insns - 1; i++)
19123 if (!rtx_equal_p (base[i], base[i + 1]))
19124 return false;
19125
19126 for (int i = 0; i < num_insns; i++)
19127 offvals[i] = INTVAL (offset[i]);
19128
19129 msize = GET_MODE_SIZE (mode);
19130
19131 /* Check if the offsets can be put in the right order to do a ldp/stp. */
19132 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
19133 aarch64_host_wide_int_compare);
19134
19135 if (!(offvals[1] == offvals[0] + msize
19136 && offvals[3] == offvals[2] + msize))
19137 return false;
19138
19139 /* Check that offsets are within range of each other. The ldp/stp
19140 instructions have 7-bit immediate offsets, so use 0x80. */
19141 if (offvals[2] - offvals[0] >= msize * 0x80)
19142 return false;
19143
19144 /* The offsets must be aligned with respect to each other. */
19145 if (offvals[0] % msize != offvals[2] % msize)
19146 return false;
19147
19148 /* If we have SImode and slow unaligned ldp,
19149 check that the alignment is at least 8 bytes. */
19150 if (mode == SImode
19151 && (aarch64_tune_params.extra_tuning_flags
19152 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
19153 && !optimize_size
19154 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
19155 return false;
19156
19157 return true;
19158 }
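
/* Taking the four SImode stores from the comment above (offsets 0x100,
   0x104, 0x108, 0x10c; msize == 4): after sorting, offvals[1] ==
   offvals[0] + 4 and offvals[3] == offvals[2] + 4, the span
   offvals[2] - offvals[0] == 8 is well below msize * 0x80, and all
   offsets share the same alignment, so the group is accepted for the
   adjusted ldp/stp peephole.  */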
19159
19160 /* Given OPERANDS of consecutive load/store, this function pairs them
19161 into LDP/STP after adjusting the offset. It depends on the fact
19162 that the operands can be sorted so the offsets are correct for STP.
19163 MODE is the mode of memory operands. CODE is the rtl operator
19164 which should be applied to all memory operands; it is SIGN_EXTEND,
19165 ZERO_EXTEND or UNKNOWN. */
19166
19167 bool
19168 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
19169 scalar_mode mode, RTX_CODE code)
19170 {
19171 rtx base, offset_1, offset_3, t1, t2;
19172 rtx mem_1, mem_2, mem_3, mem_4;
19173 rtx temp_operands[8];
19174 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
19175 stp_off_upper_limit, stp_off_lower_limit, msize;
19176
19177 /* We make changes on a copy as we may still bail out. */
19178 for (int i = 0; i < 8; i ++)
19179 temp_operands[i] = operands[i];
19180
19181 /* Sort the operands. */
19182 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
19183
19184 /* Copy the memory operands so that if we have to bail for some
19185 reason the original addresses are unchanged. */
19186 if (load)
19187 {
19188 mem_1 = copy_rtx (temp_operands[1]);
19189 mem_2 = copy_rtx (temp_operands[3]);
19190 mem_3 = copy_rtx (temp_operands[5]);
19191 mem_4 = copy_rtx (temp_operands[7]);
19192 }
19193 else
19194 {
19195 mem_1 = copy_rtx (temp_operands[0]);
19196 mem_2 = copy_rtx (temp_operands[2]);
19197 mem_3 = copy_rtx (temp_operands[4]);
19198 mem_4 = copy_rtx (temp_operands[6]);
19199 gcc_assert (code == UNKNOWN);
19200 }
19201
19202 extract_base_offset_in_addr (mem_1, &base, &offset_1);
19203 extract_base_offset_in_addr (mem_3, &base, &offset_3);
19204 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
19205 && offset_3 != NULL_RTX);
19206
19207 /* Adjust offset so it can fit in LDP/STP instruction. */
19208 msize = GET_MODE_SIZE (mode);
19209 stp_off_upper_limit = msize * (0x40 - 1);
19210 stp_off_lower_limit = - msize * 0x40;
19211
19212 off_val_1 = INTVAL (offset_1);
19213 off_val_3 = INTVAL (offset_3);
19214
19215 /* The base offset is optimally half way between the two STP/LDP offsets. */
19216 if (msize <= 4)
19217 base_off = (off_val_1 + off_val_3) / 2;
19218 else
19219 /* However, due to issues with negative LDP/STP offset generation for
19220 larger modes (DF, DI and vector modes), we must not use negative
19221 addresses smaller than 9 signed unadjusted bits can store. This
19222 provides the most range in this case. */
19223 base_off = off_val_1;
19224
19225 /* Adjust the base so that it is aligned with the addresses but still
19226 optimal. */
19227 if (base_off % msize != off_val_1 % msize)
19228 /* Fix the offset, bearing in mind we want to make it bigger not
19229 smaller. */
19230 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19231 else if (msize <= 4)
19232 /* The negative range of LDP/STP is one larger than the positive range. */
19233 base_off += msize;
19234
19235 /* Check if base offset is too big or too small. We can attempt to resolve
19236 this issue by setting it to the maximum value and seeing if the offsets
19237 still fit. */
19238 if (base_off >= 0x1000)
19239 {
19240 base_off = 0x1000 - 1;
19241 /* We must still make sure that the base offset is aligned with respect
19242 to the address, but it may not be made any bigger. */
19243 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19244 }
19245
19246 /* Likewise for the case where the base is too small. */
19247 if (base_off <= -0x1000)
19248 {
19249 base_off = -0x1000 + 1;
19250 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
19251 }
19252
19253 /* Offset of the first STP/LDP. */
19254 new_off_1 = off_val_1 - base_off;
19255
19256 /* Offset of the second STP/LDP. */
19257 new_off_3 = off_val_3 - base_off;
19258
19259 /* The offsets must be within the range of the LDP/STP instructions. */
19260 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
19261 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
19262 return false;
19263
19264 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
19265 new_off_1), true);
19266 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
19267 new_off_1 + msize), true);
19268 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
19269 new_off_3), true);
19270 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
19271 new_off_3 + msize), true);
19272
19273 if (!aarch64_mem_pair_operand (mem_1, mode)
19274 || !aarch64_mem_pair_operand (mem_3, mode))
19275 return false;
19276
19277 if (code == ZERO_EXTEND)
19278 {
19279 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
19280 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
19281 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
19282 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
19283 }
19284 else if (code == SIGN_EXTEND)
19285 {
19286 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
19287 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
19288 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
19289 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
19290 }
19291
19292 if (load)
19293 {
19294 operands[0] = temp_operands[0];
19295 operands[1] = mem_1;
19296 operands[2] = temp_operands[2];
19297 operands[3] = mem_2;
19298 operands[4] = temp_operands[4];
19299 operands[5] = mem_3;
19300 operands[6] = temp_operands[6];
19301 operands[7] = mem_4;
19302 }
19303 else
19304 {
19305 operands[0] = mem_1;
19306 operands[1] = temp_operands[1];
19307 operands[2] = mem_2;
19308 operands[3] = temp_operands[3];
19309 operands[4] = mem_3;
19310 operands[5] = temp_operands[5];
19311 operands[6] = mem_4;
19312 operands[7] = temp_operands[7];
19313 }
19314
19315 /* Emit adjusting instruction. */
19316 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
19317 /* Emit ldp/stp instructions. */
19318 t1 = gen_rtx_SET (operands[0], operands[1]);
19319 t2 = gen_rtx_SET (operands[2], operands[3]);
19320 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19321 t1 = gen_rtx_SET (operands[4], operands[5]);
19322 t2 = gen_rtx_SET (operands[6], operands[7]);
19323 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
19324 return true;
19325 }
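
/* Continuing the SImode example (off_val_1 == 0x100, off_val_3 == 0x108,
   msize == 4): base_off starts at the midpoint 0x104, is already
   suitably aligned, and is then bumped by msize to 0x108 to exploit the
   larger negative range, giving new_off_1 == -8 and new_off_3 == 0, both
   within the STP range [-0x100, 0xfc].  The emitted code is therefore
   roughly

       add  scratch, xb, 0x108
       stp  w1, w1, [scratch, -8]
       stp  w1, w1, [scratch]

   with scratch and xb as in the earlier comment.  */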
19326
19327 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
19328 it isn't worth branching around empty masked ops (including masked
19329 stores). */
19330
19331 static bool
19332 aarch64_empty_mask_is_expensive (unsigned)
19333 {
19334 return false;
19335 }
19336
19337 /* Return true if a pseudo register should be created and used to hold
19338 the GOT address for PIC code. */
19339
19340 bool
19341 aarch64_use_pseudo_pic_reg (void)
19342 {
19343 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
19344 }
19345
19346 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
19347
19348 static int
19349 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
19350 {
19351 switch (XINT (x, 1))
19352 {
19353 case UNSPEC_GOTSMALLPIC:
19354 case UNSPEC_GOTSMALLPIC28K:
19355 case UNSPEC_GOTTINYPIC:
19356 return 0;
19357 default:
19358 break;
19359 }
19360
19361 return default_unspec_may_trap_p (x, flags);
19362 }
19363
19364
19365 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
19366 return the log2 of that value. Otherwise return -1. */
19367
19368 int
19369 aarch64_fpconst_pow_of_2 (rtx x)
19370 {
19371 const REAL_VALUE_TYPE *r;
19372
19373 if (!CONST_DOUBLE_P (x))
19374 return -1;
19375
19376 r = CONST_DOUBLE_REAL_VALUE (x);
19377
19378 if (REAL_VALUE_NEGATIVE (*r)
19379 || REAL_VALUE_ISNAN (*r)
19380 || REAL_VALUE_ISINF (*r)
19381 || !real_isinteger (r, DFmode))
19382 return -1;
19383
19384 return exact_log2 (real_to_integer (r));
19385 }
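
/* For example, (const_double 4.0) yields 2, while 3.0 (not a power of
   two), 0.5 (not an integer) and -4.0 (negative) all yield -1.  */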
19386
19387 /* If X is a vector of equal CONST_DOUBLE values and that value is
19388 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
19389
19390 int
19391 aarch64_vec_fpconst_pow_of_2 (rtx x)
19392 {
19393 int nelts;
19394 if (GET_CODE (x) != CONST_VECTOR
19395 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
19396 return -1;
19397
19398 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
19399 return -1;
19400
19401 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
19402 if (firstval <= 0)
19403 return -1;
19404
19405 for (int i = 1; i < nelts; i++)
19406 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
19407 return -1;
19408
19409 return firstval;
19410 }
19411
19412 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
19413 to float.
19414
19415 __fp16 always promotes through this hook.
19416 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
19417 through the generic excess precision logic rather than here. */
19418
19419 static tree
19420 aarch64_promoted_type (const_tree t)
19421 {
19422 if (SCALAR_FLOAT_TYPE_P (t)
19423 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
19424 return float_type_node;
19425
19426 return NULL_TREE;
19427 }
19428
19429 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
19430
19431 static bool
19432 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
19433 optimization_type opt_type)
19434 {
19435 switch (op)
19436 {
19437 case rsqrt_optab:
19438 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
19439
19440 default:
19441 return true;
19442 }
19443 }
19444
19445 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
19446
19447 static unsigned int
19448 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
19449 int *offset)
19450 {
19451 /* Polynomial invariant 1 == (VG / 2) - 1. */
19452 gcc_assert (i == 1);
19453 *factor = 2;
19454 *offset = 1;
19455 return AARCH64_DWARF_VG;
19456 }
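/* Worked example, assuming the usual SVE DWARF encoding in which VG holds
   the vector length in 64-bit granules: for a 256-bit vector length VG is
   4, so indeterminate 1 evaluates to 4 / 2 - 1 == 1, and a poly_int size
   such as 16 + 16x bytes resolves to 32 bytes.  */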
19457
19458 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
19459 if MODE is HFmode, and punt to the generic implementation otherwise. */
19460
19461 static bool
19462 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
19463 {
19464 return (mode == HFmode
19465 ? true
19466 : default_libgcc_floating_mode_supported_p (mode));
19467 }
19468
19469 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
19470 if MODE is HFmode, and punt to the generic implementation otherwise. */
19471
19472 static bool
19473 aarch64_scalar_mode_supported_p (scalar_mode mode)
19474 {
19475 return (mode == HFmode
19476 ? true
19477 : default_scalar_mode_supported_p (mode));
19478 }
19479
19480 /* Set the value of FLT_EVAL_METHOD.
19481 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
19482
19483 0: evaluate all operations and constants, whose semantic type has at
19484 most the range and precision of type float, to the range and
19485 precision of float; evaluate all other operations and constants to
19486 the range and precision of the semantic type;
19487
19488 N, where _FloatN is a supported interchange floating type:
19489 evaluate all operations and constants, whose semantic type has at
19490 most the range and precision of _FloatN type, to the range and
19491 precision of the _FloatN type; evaluate all other operations and
19492 constants to the range and precision of the semantic type;
19493
19494 If we have the ARMv8.2-A extensions then we support _Float16 in native
19495 precision, so we should set this to 16. Otherwise, we support the type,
19496 but want to evaluate expressions in float precision, so set this to
19497 0. */
19498
19499 static enum flt_eval_method
19500 aarch64_excess_precision (enum excess_precision_type type)
19501 {
19502 switch (type)
19503 {
19504 case EXCESS_PRECISION_TYPE_FAST:
19505 case EXCESS_PRECISION_TYPE_STANDARD:
19506 /* We can calculate either in 16-bit range and precision or
19507 32-bit range and precision. Make that decision based on whether
19508 we have native support for the ARMv8.2-A 16-bit floating-point
19509 instructions or not. */
19510 return (TARGET_FP_F16INST
19511 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
19512 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
19513 case EXCESS_PRECISION_TYPE_IMPLICIT:
19514 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
19515 default:
19516 gcc_unreachable ();
19517 }
19518 return FLT_EVAL_METHOD_UNPREDICTABLE;
19519 }
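/* A hypothetical example: for "_Float16 a, b; a + b", having the ARMv8.2-A
   FP16 instructions (TARGET_FP_F16INST) keeps the addition in half
   precision (FLT_EVAL_METHOD == 16); without them the operands are
   promoted and the addition is evaluated in float (FLT_EVAL_METHOD == 0).  */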
19520
19521 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
19522 scheduled for speculative execution. Reject the long-running division
19523 and square-root instructions. */
19524
19525 static bool
19526 aarch64_sched_can_speculate_insn (rtx_insn *insn)
19527 {
19528 switch (get_attr_type (insn))
19529 {
19530 case TYPE_SDIV:
19531 case TYPE_UDIV:
19532 case TYPE_FDIVS:
19533 case TYPE_FDIVD:
19534 case TYPE_FSQRTS:
19535 case TYPE_FSQRTD:
19536 case TYPE_NEON_FP_SQRT_S:
19537 case TYPE_NEON_FP_SQRT_D:
19538 case TYPE_NEON_FP_SQRT_S_Q:
19539 case TYPE_NEON_FP_SQRT_D_Q:
19540 case TYPE_NEON_FP_DIV_S:
19541 case TYPE_NEON_FP_DIV_D:
19542 case TYPE_NEON_FP_DIV_S_Q:
19543 case TYPE_NEON_FP_DIV_D_Q:
19544 return false;
19545 default:
19546 return true;
19547 }
19548 }
19549
19550 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
19551
19552 static int
19553 aarch64_compute_pressure_classes (reg_class *classes)
19554 {
19555 int i = 0;
19556 classes[i++] = GENERAL_REGS;
19557 classes[i++] = FP_REGS;
19558 /* PR_REGS isn't a useful pressure class because many predicate pseudo
19559 registers need to go in PR_LO_REGS at some point during their
19560 lifetime. Splitting it into two halves has the effect of making
19561 all predicates count against PR_LO_REGS, so that we try whenever
19562 possible to restrict the number of live predicates to 8. This
19563 greatly reduces the amount of spilling in certain loops. */
19564 classes[i++] = PR_LO_REGS;
19565 classes[i++] = PR_HI_REGS;
19566 return i;
19567 }
19568
19569 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
19570
19571 static bool
19572 aarch64_can_change_mode_class (machine_mode from,
19573 machine_mode to, reg_class_t)
19574 {
19575 if (BYTES_BIG_ENDIAN)
19576 {
19577 bool from_sve_p = aarch64_sve_data_mode_p (from);
19578 bool to_sve_p = aarch64_sve_data_mode_p (to);
19579
19580 /* Don't allow changes between SVE data modes and non-SVE modes.
19581 See the comment at the head of aarch64-sve.md for details. */
19582 if (from_sve_p != to_sve_p)
19583 return false;
19584
19585 /* Don't allow changes in element size: lane 0 of the new vector
19586 would not then be lane 0 of the old vector. See the comment
19587 above aarch64_maybe_expand_sve_subreg_move for a more detailed
19588 description.
19589
19590 In the worst case, this forces a register to be spilled in
19591 one mode and reloaded in the other, which handles the
19592 endianness correctly. */
19593 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
19594 return false;
19595 }
19596 return true;
19597 }
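/* Illustrative big-endian examples: changing between the SVE modes VNx4SI
   and VNx8HI alters the element size (4 bytes vs 2 bytes) and so is
   rejected, forcing a spill/reload; changing between VNx4SI and VNx4SF
   keeps the 4-byte element size and is allowed.  */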
19598
19599 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
19600
19601 static void
19602 aarch64_select_early_remat_modes (sbitmap modes)
19603 {
19604 /* SVE values are not normally live across a call, so it should be
19605 worth doing early rematerialization even in VL-specific mode. */
19606 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
19607 {
19608 machine_mode mode = (machine_mode) i;
19609 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19610 if (vec_flags & VEC_ANY_SVE)
19611 bitmap_set_bit (modes, i);
19612 }
19613 }
19614
19615 /* Override the default target speculation_safe_value. */
19616 static rtx
19617 aarch64_speculation_safe_value (machine_mode mode,
19618 rtx result, rtx val, rtx failval)
19619 {
19620 /* Maybe we should warn if falling back to hard barriers. They are
19621 likely to be noticeably more expensive than the alternative below. */
19622 if (!aarch64_track_speculation)
19623 return default_speculation_safe_value (mode, result, val, failval);
19624
19625 if (!REG_P (val))
19626 val = copy_to_mode_reg (mode, val);
19627
19628 if (!aarch64_reg_or_zero (failval, mode))
19629 failval = copy_to_mode_reg (mode, failval);
19630
19631 emit_insn (gen_despeculate_copy (mode, result, val, failval));
19632 return result;
19633 }
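/* Sketch of the intended use (hedged): with -mtrack-speculation, a call to
   __builtin_speculation_safe_value reaches this hook and is expanded to
   the target's despeculate_copy pattern, which is keyed off the
   speculation-tracking state, instead of the full speculation barrier
   that default_speculation_safe_value would emit.  */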
19634
19635 /* Implement TARGET_ESTIMATED_POLY_VALUE.
19636 Look into the tuning structure for an estimate.
19637 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
19638 Advanced SIMD 128 bits. */
19639
19640 static HOST_WIDE_INT
19641 aarch64_estimated_poly_value (poly_int64 val)
19642 {
19643 enum aarch64_sve_vector_bits_enum width_source
19644 = aarch64_tune_params.sve_width;
19645
19646 /* If the tuning structure doesn't provide an estimate, use the default. */
19647 if (width_source == SVE_SCALABLE)
19648 return default_estimated_poly_value (val);
19649
19650 HOST_WIDE_INT over_128 = width_source - 128;
19651 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
19652 }
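/* Worked example: with sve_width == SVE_256 (value 256), over_128 is 128,
   so a poly_int64 of 16 + 16x (coeffs {16, 16}) is estimated as
   16 + 16 * 128 / 128 == 32.  This relies on the non-SVE_SCALABLE
   enumerators having the vector width in bits as their value.  */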
19653
19654
19655 /* Return true for types that could be supported as SIMD return or
19656 argument types. */
19657
19658 static bool
19659 supported_simd_type (tree t)
19660 {
19661 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
19662 {
19663 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
19664 return s == 1 || s == 2 || s == 4 || s == 8;
19665 }
19666 return false;
19667 }
19668
19669 /* Return true for types that are currently supported as SIMD return
19670 or argument types. */
19671
19672 static bool
19673 currently_supported_simd_type (tree t, tree b)
19674 {
19675 if (COMPLEX_FLOAT_TYPE_P (t))
19676 return false;
19677
19678 if (TYPE_SIZE (t) != TYPE_SIZE (b))
19679 return false;
19680
19681 return supported_simd_type (t);
19682 }
19683
19684 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
19685
19686 static int
19687 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
19688 struct cgraph_simd_clone *clonei,
19689 tree base_type, int num)
19690 {
19691 tree t, ret_type, arg_type;
19692 unsigned int elt_bits, vec_bits, count;
19693
19694 if (!TARGET_SIMD)
19695 return 0;
19696
19697 if (clonei->simdlen
19698 && (clonei->simdlen < 2
19699 || clonei->simdlen > 1024
19700 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
19701 {
19702 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19703 "unsupported simdlen %d", clonei->simdlen);
19704 return 0;
19705 }
19706
19707 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
19708 if (TREE_CODE (ret_type) != VOID_TYPE
19709 && !currently_supported_simd_type (ret_type, base_type))
19710 {
19711 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
19712 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19713 "GCC does not currently support mixed size types "
19714 "for %<simd%> functions");
19715 else if (supported_simd_type (ret_type))
19716 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19717 "GCC does not currently support return type %qT "
19718 "for %<simd%> functions", ret_type);
19719 else
19720 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19721 "unsupported return type %qT for %<simd%> functions",
19722 ret_type);
19723 return 0;
19724 }
19725
19726 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
19727 {
19728 arg_type = TREE_TYPE (t);
19729
19730 if (!currently_supported_simd_type (arg_type, base_type))
19731 {
19732 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
19733 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19734 "GCC does not currently support mixed size types "
19735 "for %<simd%> functions");
19736 else
19737 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19738 "GCC does not currently support argument type %qT "
19739 "for %<simd%> functions", arg_type);
19740 return 0;
19741 }
19742 }
19743
19744 clonei->vecsize_mangle = 'n';
19745 clonei->mask_mode = VOIDmode;
19746 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
19747 if (clonei->simdlen == 0)
19748 {
19749 count = 2;
19750 vec_bits = (num == 0 ? 64 : 128);
19751 clonei->simdlen = vec_bits / elt_bits;
19752 }
19753 else
19754 {
19755 count = 1;
19756 vec_bits = clonei->simdlen * elt_bits;
19757 if (vec_bits != 64 && vec_bits != 128)
19758 {
19759 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
19760 "GCC does not currently support simdlen %d for type %qT",
19761 clonei->simdlen, base_type);
19762 return 0;
19763 }
19764 }
19765 clonei->vecsize_int = vec_bits;
19766 clonei->vecsize_float = vec_bits;
19767 return count;
19768 }
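/* Hedged example of the computation above: for a "declare simd" function
   whose base type is float (elt_bits == 32) and which has no explicit
   simdlen, two clones are produced: the num == 0 clone uses 64-bit
   vectors (simdlen 2) and the num == 1 clone uses 128-bit vectors
   (simdlen 4).  An explicit simdlen of 8 for a float base type would
   imply 256-bit vectors and is diagnosed and rejected above.  */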
19769
19770 /* Implement TARGET_SIMD_CLONE_ADJUST. */
19771
19772 static void
19773 aarch64_simd_clone_adjust (struct cgraph_node *node)
19774 {
19775 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
19776 use the correct ABI. */
19777
19778 tree t = TREE_TYPE (node->decl);
19779 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
19780 TYPE_ATTRIBUTES (t));
19781 }
19782
19783 /* Implement TARGET_SIMD_CLONE_USABLE. */
19784
19785 static int
19786 aarch64_simd_clone_usable (struct cgraph_node *node)
19787 {
19788 switch (node->simdclone->vecsize_mangle)
19789 {
19790 case 'n':
19791 if (!TARGET_SIMD)
19792 return -1;
19793 return 0;
19794 default:
19795 gcc_unreachable ();
19796 }
19797 }
19798
19799 /* Implement TARGET_COMP_TYPE_ATTRIBUTES. */
19800
19801 static int
19802 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
19803 {
19804 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
19805 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
19806 return 0;
19807 return 1;
19808 }
19809
19810 /* Implement TARGET_GET_MULTILIB_ABI_NAME. */
19811
19812 static const char *
19813 aarch64_get_multilib_abi_name (void)
19814 {
19815 if (TARGET_BIG_END)
19816 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
19817 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
19818 }
19819
19820 /* Implement TARGET_STACK_PROTECT_GUARD. When the guard is a global
19821 variable, use the default implementation; otherwise return a null
19822 tree. */
19823 static tree
19824 aarch64_stack_protect_guard (void)
19825 {
19826 if (aarch64_stack_protector_guard == SSP_GLOBAL)
19827 return default_stack_protect_guard ();
19828
19829 return NULL_TREE;
19830 }
19831
19832 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
19833 section at the end if needed. */
19834 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
19835 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
19836 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
19837 void
19838 aarch64_file_end_indicate_exec_stack ()
19839 {
19840 file_end_indicate_exec_stack ();
19841
19842 unsigned feature_1_and = 0;
19843 if (aarch64_bti_enabled ())
19844 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
19845
19846 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
19847 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
19848
19849 if (feature_1_and)
19850 {
19851 /* Generate .note.gnu.property section. */
19852 switch_to_section (get_section (".note.gnu.property",
19853 SECTION_NOTYPE, NULL));
19854
19855 /* PT_NOTE header: namesz, descsz, type.
19856 namesz = 4 ("GNU\0")
19857 descsz = 16 (Size of the program property array)
19858 [(12 + padding) * Number of array elements]
19859 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
19860 assemble_align (POINTER_SIZE);
19861 assemble_integer (GEN_INT (4), 4, 32, 1);
19862 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
19863 assemble_integer (GEN_INT (5), 4, 32, 1);
19864
19865 /* PT_NOTE name. */
19866 assemble_string ("GNU", 4);
19867
19868 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
19869 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
19870 datasz = 4
19871 data = feature_1_and. */
19872 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
19873 assemble_integer (GEN_INT (4), 4, 32, 1);
19874 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
19875
19876 /* Pad the size of the note to the required alignment. */
19877 assemble_align (POINTER_SIZE);
19878 }
19879 }
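/* A sketch of the resulting note for LP64 (POINTER_BYTES == 8, so descsz
   rounds up to 16) with both BTI and PAC-RET enabled (feature_1_and == 3);
   the exact directives depend on the assembler output routines:

       .word   4                   namesz
       .word   16                  descsz
       .word   5                   NT_GNU_PROPERTY_TYPE_0
       .asciz  "GNU"
       .word   0xc0000000          GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word   4                   datasz
       .word   3                   BTI | PAC
       .p2align 3                  pad to 8-byte alignment  */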
19880 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
19881 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
19882 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
19883
19884 /* Target-specific selftests. */
19885
19886 #if CHECKING_P
19887
19888 namespace selftest {
19889
19890 /* Selftest for the RTL loader.
19891 Verify that the RTL loader copes with a dump from
19892 print_rtx_function. This is essentially just a test that class
19893 function_reader can handle a real dump, but it also verifies
19894 that lookup_reg_by_dump_name correctly handles hard regs.
19895 The presence of hard reg names in the dump means that the test is
19896 target-specific, hence it is in this file. */
19897
19898 static void
19899 aarch64_test_loading_full_dump ()
19900 {
19901 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
19902
19903 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
19904
19905 rtx_insn *insn_1 = get_insn_by_uid (1);
19906 ASSERT_EQ (NOTE, GET_CODE (insn_1));
19907
19908 rtx_insn *insn_15 = get_insn_by_uid (15);
19909 ASSERT_EQ (INSN, GET_CODE (insn_15));
19910 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
19911
19912 /* Verify crtl->return_rtx. */
19913 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
19914 ASSERT_EQ (0, REGNO (crtl->return_rtx));
19915 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
19916 }
19917
19918 /* Run all target-specific selftests. */
19919
19920 static void
19921 aarch64_run_selftests (void)
19922 {
19923 aarch64_test_loading_full_dump ();
19924 }
19925
19926 } // namespace selftest
19927
19928 #endif /* #if CHECKING_P */
19929
19930 #undef TARGET_STACK_PROTECT_GUARD
19931 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
19932
19933 #undef TARGET_ADDRESS_COST
19934 #define TARGET_ADDRESS_COST aarch64_address_cost
19935
19936 /* This hook determines whether unnamed bitfields affect the alignment
19937 of the containing structure. The hook returns true if the structure
19938 should inherit the alignment requirements of an unnamed bitfield's
19939 type. */
19940 #undef TARGET_ALIGN_ANON_BITFIELD
19941 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
19942
19943 #undef TARGET_ASM_ALIGNED_DI_OP
19944 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
19945
19946 #undef TARGET_ASM_ALIGNED_HI_OP
19947 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
19948
19949 #undef TARGET_ASM_ALIGNED_SI_OP
19950 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
19951
19952 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
19953 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
19954 hook_bool_const_tree_hwi_hwi_const_tree_true
19955
19956 #undef TARGET_ASM_FILE_START
19957 #define TARGET_ASM_FILE_START aarch64_start_file
19958
19959 #undef TARGET_ASM_OUTPUT_MI_THUNK
19960 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
19961
19962 #undef TARGET_ASM_SELECT_RTX_SECTION
19963 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
19964
19965 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
19966 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
19967
19968 #undef TARGET_BUILD_BUILTIN_VA_LIST
19969 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
19970
19971 #undef TARGET_CALLEE_COPIES
19972 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
19973
19974 #undef TARGET_CAN_ELIMINATE
19975 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
19976
19977 #undef TARGET_CAN_INLINE_P
19978 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19979
19980 #undef TARGET_CANNOT_FORCE_CONST_MEM
19981 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19982
19983 #undef TARGET_CASE_VALUES_THRESHOLD
19984 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19985
19986 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19987 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19988
19989 /* Only the least significant bit is used for initialization guard
19990 variables. */
19991 #undef TARGET_CXX_GUARD_MASK_BIT
19992 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19993
19994 #undef TARGET_C_MODE_FOR_SUFFIX
19995 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19996
19997 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19998 #undef TARGET_DEFAULT_TARGET_FLAGS
19999 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
20000 #endif
20001
20002 #undef TARGET_CLASS_MAX_NREGS
20003 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
20004
20005 #undef TARGET_BUILTIN_DECL
20006 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
20007
20008 #undef TARGET_BUILTIN_RECIPROCAL
20009 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
20010
20011 #undef TARGET_C_EXCESS_PRECISION
20012 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
20013
20014 #undef TARGET_EXPAND_BUILTIN
20015 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
20016
20017 #undef TARGET_EXPAND_BUILTIN_VA_START
20018 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
20019
20020 #undef TARGET_FOLD_BUILTIN
20021 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
20022
20023 #undef TARGET_FUNCTION_ARG
20024 #define TARGET_FUNCTION_ARG aarch64_function_arg
20025
20026 #undef TARGET_FUNCTION_ARG_ADVANCE
20027 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
20028
20029 #undef TARGET_FUNCTION_ARG_BOUNDARY
20030 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
20031
20032 #undef TARGET_FUNCTION_ARG_PADDING
20033 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
20034
20035 #undef TARGET_GET_RAW_RESULT_MODE
20036 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
20037 #undef TARGET_GET_RAW_ARG_MODE
20038 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
20039
20040 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
20041 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
20042
20043 #undef TARGET_FUNCTION_VALUE
20044 #define TARGET_FUNCTION_VALUE aarch64_function_value
20045
20046 #undef TARGET_FUNCTION_VALUE_REGNO_P
20047 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
20048
20049 #undef TARGET_GIMPLE_FOLD_BUILTIN
20050 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
20051
20052 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
20053 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
20054
20055 #undef TARGET_INIT_BUILTINS
20056 #define TARGET_INIT_BUILTINS aarch64_init_builtins
20057
20058 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
20059 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
20060 aarch64_ira_change_pseudo_allocno_class
20061
20062 #undef TARGET_LEGITIMATE_ADDRESS_P
20063 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
20064
20065 #undef TARGET_LEGITIMATE_CONSTANT_P
20066 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
20067
20068 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
20069 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
20070 aarch64_legitimize_address_displacement
20071
20072 #undef TARGET_LIBGCC_CMP_RETURN_MODE
20073 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
20074
20075 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
20076 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
20077 aarch64_libgcc_floating_mode_supported_p
20078
20079 #undef TARGET_MANGLE_TYPE
20080 #define TARGET_MANGLE_TYPE aarch64_mangle_type
20081
20082 #undef TARGET_MEMORY_MOVE_COST
20083 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
20084
20085 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
20086 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
20087
20088 #undef TARGET_MUST_PASS_IN_STACK
20089 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
20090
20091 /* This target hook should return true if accesses to volatile bitfields
20092 should use the narrowest mode possible. It should return false if these
20093 accesses should use the bitfield container type. */
20094 #undef TARGET_NARROW_VOLATILE_BITFIELD
20095 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
20096
20097 #undef TARGET_OPTION_OVERRIDE
20098 #define TARGET_OPTION_OVERRIDE aarch64_override_options
20099
20100 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
20101 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
20102 aarch64_override_options_after_change
20103
20104 #undef TARGET_OPTION_SAVE
20105 #define TARGET_OPTION_SAVE aarch64_option_save
20106
20107 #undef TARGET_OPTION_RESTORE
20108 #define TARGET_OPTION_RESTORE aarch64_option_restore
20109
20110 #undef TARGET_OPTION_PRINT
20111 #define TARGET_OPTION_PRINT aarch64_option_print
20112
20113 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
20114 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
20115
20116 #undef TARGET_SET_CURRENT_FUNCTION
20117 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
20118
20119 #undef TARGET_PASS_BY_REFERENCE
20120 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
20121
20122 #undef TARGET_PREFERRED_RELOAD_CLASS
20123 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
20124
20125 #undef TARGET_SCHED_REASSOCIATION_WIDTH
20126 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
20127
20128 #undef TARGET_PROMOTED_TYPE
20129 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
20130
20131 #undef TARGET_SECONDARY_RELOAD
20132 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
20133
20134 #undef TARGET_SHIFT_TRUNCATION_MASK
20135 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
20136
20137 #undef TARGET_SETUP_INCOMING_VARARGS
20138 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
20139
20140 #undef TARGET_STRUCT_VALUE_RTX
20141 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
20142
20143 #undef TARGET_REGISTER_MOVE_COST
20144 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
20145
20146 #undef TARGET_RETURN_IN_MEMORY
20147 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
20148
20149 #undef TARGET_RETURN_IN_MSB
20150 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
20151
20152 #undef TARGET_RTX_COSTS
20153 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
20154
20155 #undef TARGET_SCALAR_MODE_SUPPORTED_P
20156 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
20157
20158 #undef TARGET_SCHED_ISSUE_RATE
20159 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
20160
20161 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
20162 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
20163 aarch64_sched_first_cycle_multipass_dfa_lookahead
20164
20165 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
20166 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
20167 aarch64_first_cycle_multipass_dfa_lookahead_guard
20168
20169 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
20170 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
20171 aarch64_get_separate_components
20172
20173 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
20174 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
20175 aarch64_components_for_bb
20176
20177 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
20178 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
20179 aarch64_disqualify_components
20180
20181 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
20182 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
20183 aarch64_emit_prologue_components
20184
20185 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
20186 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
20187 aarch64_emit_epilogue_components
20188
20189 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
20190 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
20191 aarch64_set_handled_components
20192
20193 #undef TARGET_TRAMPOLINE_INIT
20194 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
20195
20196 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
20197 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
20198
20199 #undef TARGET_VECTOR_MODE_SUPPORTED_P
20200 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
20201
20202 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
20203 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
20204 aarch64_builtin_support_vector_misalignment
20205
20206 #undef TARGET_ARRAY_MODE
20207 #define TARGET_ARRAY_MODE aarch64_array_mode
20208
20209 #undef TARGET_ARRAY_MODE_SUPPORTED_P
20210 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
20211
20212 #undef TARGET_VECTORIZE_ADD_STMT_COST
20213 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
20214
20215 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
20216 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
20217 aarch64_builtin_vectorization_cost
20218
20219 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
20220 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
20221
20222 #undef TARGET_VECTORIZE_BUILTINS
20223 #define TARGET_VECTORIZE_BUILTINS
20224
20225 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
20226 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
20227 aarch64_builtin_vectorized_function
20228
20229 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
20230 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
20231 aarch64_autovectorize_vector_sizes
20232
20233 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
20234 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
20235 aarch64_atomic_assign_expand_fenv
20236
20237 /* Section anchor support. */
20238
20239 #undef TARGET_MIN_ANCHOR_OFFSET
20240 #define TARGET_MIN_ANCHOR_OFFSET -256
20241
20242 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
20243 byte offset; we can do much more for larger data types, but have no way
20244 to determine the size of the access. We assume accesses are aligned. */
20245 #undef TARGET_MAX_ANCHOR_OFFSET
20246 #define TARGET_MAX_ANCHOR_OFFSET 4095
20247
20248 #undef TARGET_VECTOR_ALIGNMENT
20249 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
20250
20251 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
20252 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
20253 aarch64_vectorize_preferred_vector_alignment
20254 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
20255 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
20256 aarch64_simd_vector_alignment_reachable
20257
20258 /* vec_perm support. */
20259
20260 #undef TARGET_VECTORIZE_VEC_PERM_CONST
20261 #define TARGET_VECTORIZE_VEC_PERM_CONST \
20262 aarch64_vectorize_vec_perm_const
20263
20264 #undef TARGET_VECTORIZE_GET_MASK_MODE
20265 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
20266 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
20267 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
20268 aarch64_empty_mask_is_expensive
20269 #undef TARGET_PREFERRED_ELSE_VALUE
20270 #define TARGET_PREFERRED_ELSE_VALUE \
20271 aarch64_preferred_else_value
20272
20273 #undef TARGET_INIT_LIBFUNCS
20274 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
20275
20276 #undef TARGET_FIXED_CONDITION_CODE_REGS
20277 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
20278
20279 #undef TARGET_FLAGS_REGNUM
20280 #define TARGET_FLAGS_REGNUM CC_REGNUM
20281
20282 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
20283 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
20284
20285 #undef TARGET_ASAN_SHADOW_OFFSET
20286 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
20287
20288 #undef TARGET_LEGITIMIZE_ADDRESS
20289 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
20290
20291 #undef TARGET_SCHED_CAN_SPECULATE_INSN
20292 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
20293
20294 #undef TARGET_CAN_USE_DOLOOP_P
20295 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
20296
20297 #undef TARGET_SCHED_ADJUST_PRIORITY
20298 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
20299
20300 #undef TARGET_SCHED_MACRO_FUSION_P
20301 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
20302
20303 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
20304 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
20305
20306 #undef TARGET_SCHED_FUSION_PRIORITY
20307 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
20308
20309 #undef TARGET_UNSPEC_MAY_TRAP_P
20310 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
20311
20312 #undef TARGET_USE_PSEUDO_PIC_REG
20313 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
20314
20315 #undef TARGET_PRINT_OPERAND
20316 #define TARGET_PRINT_OPERAND aarch64_print_operand
20317
20318 #undef TARGET_PRINT_OPERAND_ADDRESS
20319 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
20320
20321 #undef TARGET_OPTAB_SUPPORTED_P
20322 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
20323
20324 #undef TARGET_OMIT_STRUCT_RETURN_REG
20325 #define TARGET_OMIT_STRUCT_RETURN_REG true
20326
20327 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
20328 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
20329 aarch64_dwarf_poly_indeterminate_value
20330
20331 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
20332 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
20333 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
20334
20335 #undef TARGET_HARD_REGNO_NREGS
20336 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
20337 #undef TARGET_HARD_REGNO_MODE_OK
20338 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
20339
20340 #undef TARGET_MODES_TIEABLE_P
20341 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
20342
20343 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
20344 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
20345 aarch64_hard_regno_call_part_clobbered
20346
20347 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
20348 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
20349 aarch64_remove_extra_call_preserved_regs
20350
20351 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
20352 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
20353 aarch64_return_call_with_max_clobbers
20354
20355 #undef TARGET_CONSTANT_ALIGNMENT
20356 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
20357
20358 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
20359 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
20360 aarch64_stack_clash_protection_alloca_probe_range
20361
20362 #undef TARGET_COMPUTE_PRESSURE_CLASSES
20363 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
20364
20365 #undef TARGET_CAN_CHANGE_MODE_CLASS
20366 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
20367
20368 #undef TARGET_SELECT_EARLY_REMAT_MODES
20369 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
20370
20371 #undef TARGET_SPECULATION_SAFE_VALUE
20372 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
20373
20374 #undef TARGET_ESTIMATED_POLY_VALUE
20375 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
20376
20377 #undef TARGET_ATTRIBUTE_TABLE
20378 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
20379
20380 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
20381 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
20382 aarch64_simd_clone_compute_vecsize_and_simdlen
20383
20384 #undef TARGET_SIMD_CLONE_ADJUST
20385 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
20386
20387 #undef TARGET_SIMD_CLONE_USABLE
20388 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
20389
20390 #undef TARGET_COMP_TYPE_ATTRIBUTES
20391 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
20392
20393 #undef TARGET_GET_MULTILIB_ABI_NAME
20394 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
20395
20396 #if CHECKING_P
20397 #undef TARGET_RUN_TARGET_SELFTESTS
20398 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
20399 #endif /* #if CHECKING_P */
20400
20401 #undef TARGET_ASM_POST_CFI_STARTPROC
20402 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
20403
20404 struct gcc_target targetm = TARGET_INITIALIZER;
20405
20406 #include "gt-aarch64.h"