gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
11
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
21 #define IN_TARGET_CODE 1
22
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77
78 /* This file should be included last. */
79 #include "target-def.h"
80
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
83
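/* For example, under the default LP64 ABI POINTER_SIZE is 64 bits, giving
   POINTER_BYTES == 8; under -mabi=ilp32 it is 32 bits, giving 4.  */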
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
86 {
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
89
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
97
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
100
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
103
104 union
105 {
106 /* For MOV and MVN. */
107 struct
108 {
109 /* The value of each element. */
110 rtx value;
111
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
117
118 /* For INDEX. */
119 struct
120 {
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
125
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
129 };
130
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
136 {
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
140 }
141
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
151 {
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
155 }
156
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
162 {
163 u.index.base = base_in;
164 u.index.step = step_in;
165 }
166
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
173 {
174 u.pattern = pattern_in;
175 }
176
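/* As an illustrative example, a constant with each 32-bit element equal
   to ~0xff00 (materializable as "mvni vD.4s, #0xff, lsl #8") could be
   described by:

     simd_immediate_info info (SImode, 0xff, simd_immediate_info::MVN,
			       simd_immediate_info::LSL, 8);

   i.e. elt_mode == SImode, insn == MVN, u.mov.value == 0xff,
   u.mov.modifier == LSL and u.mov.shift == 8.  */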
177 /* The current code model. */
178 enum aarch64_code_model aarch64_cmodel;
179
180 /* The number of 64-bit elements in an SVE vector. */
181 poly_uint16 aarch64_sve_vg;
182
183 #ifdef HAVE_AS_TLS
184 #undef TARGET_HAVE_TLS
185 #define TARGET_HAVE_TLS 1
186 #endif
187
188 static bool aarch64_composite_type_p (const_tree, machine_mode);
189 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
190 const_tree,
191 machine_mode *, int *,
192 bool *);
193 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
194 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
195 static void aarch64_override_options_after_change (void);
196 static bool aarch64_vector_mode_supported_p (machine_mode);
197 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
198 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
199 const_tree type,
200 int misalignment,
201 bool is_packed);
202 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
203 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
204 aarch64_addr_query_type);
205 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
206
207 /* Major revision number of the ARM Architecture implemented by the target. */
208 unsigned aarch64_architecture_version;
209
210 /* The processor for which instructions should be scheduled. */
211 enum aarch64_processor aarch64_tune = cortexa53;
212
213 /* Mask to specify which instruction scheduling options should be used. */
214 uint64_t aarch64_tune_flags = 0;
215
216 /* Global flag for PC-relative loads. */
217 bool aarch64_pcrelative_literal_loads;
218
219 /* Global flag for whether frame pointer is enabled. */
220 bool aarch64_use_frame_pointer;
221
222 #define BRANCH_PROTECT_STR_MAX 255
223 char *accepted_branch_protection_string = NULL;
224
225 static enum aarch64_parse_opt_result
226 aarch64_parse_branch_protection (const char*, char**);
227
228 /* Support for command line parsing of boolean flags in the tuning
229 structures. */
230 struct aarch64_flag_desc
231 {
232 const char* name;
233 unsigned int flag;
234 };
235
236 #define AARCH64_FUSION_PAIR(name, internal_name) \
237 { name, AARCH64_FUSE_##internal_name },
238 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 {
240 { "none", AARCH64_FUSE_NOTHING },
241 #include "aarch64-fusion-pairs.def"
242 { "all", AARCH64_FUSE_ALL },
243 { NULL, AARCH64_FUSE_NOTHING }
244 };
245
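/* For instance, assuming the usual entry
   AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) in aarch64-fusion-pairs.def,
   the table above gains the element { "mov+movk", AARCH64_FUSE_MOV_MOVK },
   which lets the user-visible name be mapped back to its internal flag,
   e.g. by the fuse= override parsing in aarch64_parse_fuse_string below.  */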
246 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
247 { name, AARCH64_EXTRA_TUNE_##internal_name },
248 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 {
250 { "none", AARCH64_EXTRA_TUNE_NONE },
251 #include "aarch64-tuning-flags.def"
252 { "all", AARCH64_EXTRA_TUNE_ALL },
253 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 };
255
256 /* Tuning parameters. */
257
258 static const struct cpu_addrcost_table generic_addrcost_table =
259 {
260 {
261 1, /* hi */
262 0, /* si */
263 0, /* di */
264 1, /* ti */
265 },
266 0, /* pre_modify */
267 0, /* post_modify */
268 0, /* register_offset */
269 0, /* register_sextend */
270 0, /* register_zextend */
271 0 /* imm_offset */
272 };
273
274 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 {
276 {
277 0, /* hi */
278 0, /* si */
279 0, /* di */
280 2, /* ti */
281 },
282 0, /* pre_modify */
283 0, /* post_modify */
284 1, /* register_offset */
285 1, /* register_sextend */
286 2, /* register_zextend */
287 0, /* imm_offset */
288 };
289
290 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 {
292 {
293 1, /* hi */
294 0, /* si */
295 0, /* di */
296 1, /* ti */
297 },
298 1, /* pre_modify */
299 1, /* post_modify */
300 0, /* register_offset */
301 1, /* register_sextend */
302 1, /* register_zextend */
303 0, /* imm_offset */
304 };
305
306 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 {
308 {
309 1, /* hi */
310 1, /* si */
311 1, /* di */
312 2, /* ti */
313 },
314 0, /* pre_modify */
315 0, /* post_modify */
316 2, /* register_offset */
317 3, /* register_sextend */
318 3, /* register_zextend */
319 0, /* imm_offset */
320 };
321
322 static const struct cpu_addrcost_table tsv110_addrcost_table =
323 {
324 {
325 1, /* hi */
326 0, /* si */
327 0, /* di */
328 1, /* ti */
329 },
330 0, /* pre_modify */
331 0, /* post_modify */
332 0, /* register_offset */
333 1, /* register_sextend */
334 1, /* register_zextend */
335 0, /* imm_offset */
336 };
337
338 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
339 {
340 {
341 1, /* hi */
342 1, /* si */
343 1, /* di */
344 2, /* ti */
345 },
346 1, /* pre_modify */
347 1, /* post_modify */
348 3, /* register_offset */
349 3, /* register_sextend */
350 3, /* register_zextend */
351 2, /* imm_offset */
352 };
353
354 static const struct cpu_regmove_cost generic_regmove_cost =
355 {
356 1, /* GP2GP */
357 /* Avoid the use of slow int<->fp moves for spilling by setting
358 their cost higher than memmov_cost. */
359 5, /* GP2FP */
360 5, /* FP2GP */
361 2 /* FP2FP */
362 };
363
364 static const struct cpu_regmove_cost cortexa57_regmove_cost =
365 {
366 1, /* GP2GP */
367 /* Avoid the use of slow int<->fp moves for spilling by setting
368 their cost higher than memmov_cost. */
369 5, /* GP2FP */
370 5, /* FP2GP */
371 2 /* FP2FP */
372 };
373
374 static const struct cpu_regmove_cost cortexa53_regmove_cost =
375 {
376 1, /* GP2GP */
377 /* Avoid the use of slow int<->fp moves for spilling by setting
378 their cost higher than memmov_cost. */
379 5, /* GP2FP */
380 5, /* FP2GP */
381 2 /* FP2FP */
382 };
383
384 static const struct cpu_regmove_cost exynosm1_regmove_cost =
385 {
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost (actual costs: 4 and 9). */
389 9, /* GP2FP */
390 9, /* FP2GP */
391 1 /* FP2FP */
392 };
393
394 static const struct cpu_regmove_cost thunderx_regmove_cost =
395 {
396 2, /* GP2GP */
397 2, /* GP2FP */
398 6, /* FP2GP */
399 4 /* FP2FP */
400 };
401
402 static const struct cpu_regmove_cost xgene1_regmove_cost =
403 {
404 1, /* GP2GP */
405 /* Avoid the use of slow int<->fp moves for spilling by setting
406 their cost higher than memmov_cost. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 2 /* FP2FP */
410 };
411
412 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
413 {
414 2, /* GP2GP */
415 /* Avoid the use of int<->fp moves for spilling. */
416 6, /* GP2FP */
417 6, /* FP2GP */
418 4 /* FP2FP */
419 };
420
421 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
422 {
423 1, /* GP2GP */
424 /* Avoid the use of int<->fp moves for spilling. */
425 8, /* GP2FP */
426 8, /* FP2GP */
427 4 /* FP2FP */
428 };
429
430 static const struct cpu_regmove_cost tsv110_regmove_cost =
431 {
432 1, /* GP2GP */
433 /* Avoid the use of slow int<->fp moves for spilling by setting
434 their cost higher than memmov_cost. */
435 2, /* GP2FP */
436 3, /* FP2GP */
437 2 /* FP2FP */
438 };
439
440 /* Generic costs for vector insn classes. */
441 static const struct cpu_vector_cost generic_vector_cost =
442 {
443 1, /* scalar_int_stmt_cost */
444 1, /* scalar_fp_stmt_cost */
445 1, /* scalar_load_cost */
446 1, /* scalar_store_cost */
447 1, /* vec_int_stmt_cost */
448 1, /* vec_fp_stmt_cost */
449 2, /* vec_permute_cost */
450 2, /* vec_to_scalar_cost */
451 1, /* scalar_to_vec_cost */
452 1, /* vec_align_load_cost */
453 1, /* vec_unalign_load_cost */
454 1, /* vec_unalign_store_cost */
455 1, /* vec_store_cost */
456 3, /* cond_taken_branch_cost */
457 1 /* cond_not_taken_branch_cost */
458 };
459
460 /* QDF24XX costs for vector insn classes. */
461 static const struct cpu_vector_cost qdf24xx_vector_cost =
462 {
463 1, /* scalar_int_stmt_cost */
464 1, /* scalar_fp_stmt_cost */
465 1, /* scalar_load_cost */
466 1, /* scalar_store_cost */
467 1, /* vec_int_stmt_cost */
468 3, /* vec_fp_stmt_cost */
469 2, /* vec_permute_cost */
470 1, /* vec_to_scalar_cost */
471 1, /* scalar_to_vec_cost */
472 1, /* vec_align_load_cost */
473 1, /* vec_unalign_load_cost */
474 1, /* vec_unalign_store_cost */
475 1, /* vec_store_cost */
476 3, /* cond_taken_branch_cost */
477 1 /* cond_not_taken_branch_cost */
478 };
479
480 /* ThunderX costs for vector insn classes. */
481 static const struct cpu_vector_cost thunderx_vector_cost =
482 {
483 1, /* scalar_int_stmt_cost */
484 1, /* scalar_fp_stmt_cost */
485 3, /* scalar_load_cost */
486 1, /* scalar_store_cost */
487 4, /* vec_int_stmt_cost */
488 1, /* vec_fp_stmt_cost */
489 4, /* vec_permute_cost */
490 2, /* vec_to_scalar_cost */
491 2, /* scalar_to_vec_cost */
492 3, /* vec_align_load_cost */
493 5, /* vec_unalign_load_cost */
494 5, /* vec_unalign_store_cost */
495 1, /* vec_store_cost */
496 3, /* cond_taken_branch_cost */
497 3 /* cond_not_taken_branch_cost */
498 };
499
500 static const struct cpu_vector_cost tsv110_vector_cost =
501 {
502 1, /* scalar_int_stmt_cost */
503 1, /* scalar_fp_stmt_cost */
504 5, /* scalar_load_cost */
505 1, /* scalar_store_cost */
506 2, /* vec_int_stmt_cost */
507 2, /* vec_fp_stmt_cost */
508 2, /* vec_permute_cost */
509 3, /* vec_to_scalar_cost */
510 2, /* scalar_to_vec_cost */
511 5, /* vec_align_load_cost */
512 5, /* vec_unalign_load_cost */
513 1, /* vec_unalign_store_cost */
514 1, /* vec_store_cost */
515 1, /* cond_taken_branch_cost */
516 1 /* cond_not_taken_branch_cost */
517 };
518
519 /* Generic costs for vector insn classes. */
520 static const struct cpu_vector_cost cortexa57_vector_cost =
521 {
522 1, /* scalar_int_stmt_cost */
523 1, /* scalar_fp_stmt_cost */
524 4, /* scalar_load_cost */
525 1, /* scalar_store_cost */
526 2, /* vec_int_stmt_cost */
527 2, /* vec_fp_stmt_cost */
528 3, /* vec_permute_cost */
529 8, /* vec_to_scalar_cost */
530 8, /* scalar_to_vec_cost */
531 4, /* vec_align_load_cost */
532 4, /* vec_unalign_load_cost */
533 1, /* vec_unalign_store_cost */
534 1, /* vec_store_cost */
535 1, /* cond_taken_branch_cost */
536 1 /* cond_not_taken_branch_cost */
537 };
538
539 static const struct cpu_vector_cost exynosm1_vector_cost =
540 {
541 1, /* scalar_int_stmt_cost */
542 1, /* scalar_fp_stmt_cost */
543 5, /* scalar_load_cost */
544 1, /* scalar_store_cost */
545 3, /* vec_int_stmt_cost */
546 3, /* vec_fp_stmt_cost */
547 3, /* vec_permute_cost */
548 3, /* vec_to_scalar_cost */
549 3, /* scalar_to_vec_cost */
550 5, /* vec_align_load_cost */
551 5, /* vec_unalign_load_cost */
552 1, /* vec_unalign_store_cost */
553 1, /* vec_store_cost */
554 1, /* cond_taken_branch_cost */
555 1 /* cond_not_taken_branch_cost */
556 };
557
558 /* Generic costs for vector insn classes. */
559 static const struct cpu_vector_cost xgene1_vector_cost =
560 {
561 1, /* scalar_int_stmt_cost */
562 1, /* scalar_fp_stmt_cost */
563 5, /* scalar_load_cost */
564 1, /* scalar_store_cost */
565 2, /* vec_int_stmt_cost */
566 2, /* vec_fp_stmt_cost */
567 2, /* vec_permute_cost */
568 4, /* vec_to_scalar_cost */
569 4, /* scalar_to_vec_cost */
570 10, /* vec_align_load_cost */
571 10, /* vec_unalign_load_cost */
572 2, /* vec_unalign_store_cost */
573 2, /* vec_store_cost */
574 2, /* cond_taken_branch_cost */
575 1 /* cond_not_taken_branch_cost */
576 };
577
578 /* Costs for vector insn classes for Vulcan. */
579 static const struct cpu_vector_cost thunderx2t99_vector_cost =
580 {
581 1, /* scalar_int_stmt_cost */
582 6, /* scalar_fp_stmt_cost */
583 4, /* scalar_load_cost */
584 1, /* scalar_store_cost */
585 5, /* vec_int_stmt_cost */
586 6, /* vec_fp_stmt_cost */
587 10, /* vec_permute_cost */
588 6, /* vec_to_scalar_cost */
589 5, /* scalar_to_vec_cost */
590 8, /* vec_align_load_cost */
591 8, /* vec_unalign_load_cost */
592 4, /* vec_unalign_store_cost */
593 4, /* vec_store_cost */
594 2, /* cond_taken_branch_cost */
595 1 /* cond_not_taken_branch_cost */
596 };
597
598 /* Generic costs for branch instructions. */
599 static const struct cpu_branch_cost generic_branch_cost =
600 {
601 1, /* Predictable. */
602 3 /* Unpredictable. */
603 };
604
605 /* Generic approximation modes. */
606 static const cpu_approx_modes generic_approx_modes =
607 {
608 AARCH64_APPROX_NONE, /* division */
609 AARCH64_APPROX_NONE, /* sqrt */
610 AARCH64_APPROX_NONE /* recip_sqrt */
611 };
612
613 /* Approximation modes for Exynos M1. */
614 static const cpu_approx_modes exynosm1_approx_modes =
615 {
616 AARCH64_APPROX_NONE, /* division */
617 AARCH64_APPROX_ALL, /* sqrt */
618 AARCH64_APPROX_ALL /* recip_sqrt */
619 };
620
621 /* Approximation modes for X-Gene 1. */
622 static const cpu_approx_modes xgene1_approx_modes =
623 {
624 AARCH64_APPROX_NONE, /* division */
625 AARCH64_APPROX_NONE, /* sqrt */
626 AARCH64_APPROX_ALL /* recip_sqrt */
627 };
628
629 /* Generic prefetch settings (which disable prefetch). */
630 static const cpu_prefetch_tune generic_prefetch_tune =
631 {
632 0, /* num_slots */
633 -1, /* l1_cache_size */
634 -1, /* l1_cache_line_size */
635 -1, /* l2_cache_size */
636 true, /* prefetch_dynamic_strides */
637 -1, /* minimum_stride */
638 -1 /* default_opt_level */
639 };
640
641 static const cpu_prefetch_tune exynosm1_prefetch_tune =
642 {
643 0, /* num_slots */
644 -1, /* l1_cache_size */
645 64, /* l1_cache_line_size */
646 -1, /* l2_cache_size */
647 true, /* prefetch_dynamic_strides */
648 -1, /* minimum_stride */
649 -1 /* default_opt_level */
650 };
651
652 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
653 {
654 4, /* num_slots */
655 32, /* l1_cache_size */
656 64, /* l1_cache_line_size */
657 512, /* l2_cache_size */
658 false, /* prefetch_dynamic_strides */
659 2048, /* minimum_stride */
660 3 /* default_opt_level */
661 };
662
663 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
664 {
665 8, /* num_slots */
666 32, /* l1_cache_size */
667 128, /* l1_cache_line_size */
668 16*1024, /* l2_cache_size */
669 true, /* prefetch_dynamic_strides */
670 -1, /* minimum_stride */
671 3 /* default_opt_level */
672 };
673
674 static const cpu_prefetch_tune thunderx_prefetch_tune =
675 {
676 8, /* num_slots */
677 32, /* l1_cache_size */
678 128, /* l1_cache_line_size */
679 -1, /* l2_cache_size */
680 true, /* prefetch_dynamic_strides */
681 -1, /* minimum_stride */
682 -1 /* default_opt_level */
683 };
684
685 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
686 {
687 8, /* num_slots */
688 32, /* l1_cache_size */
689 64, /* l1_cache_line_size */
690 256, /* l2_cache_size */
691 true, /* prefetch_dynamic_strides */
692 -1, /* minimum_stride */
693 -1 /* default_opt_level */
694 };
695
696 static const cpu_prefetch_tune tsv110_prefetch_tune =
697 {
698 0, /* num_slots */
699 64, /* l1_cache_size */
700 64, /* l1_cache_line_size */
701 512, /* l2_cache_size */
702 true, /* prefetch_dynamic_strides */
703 -1, /* minimum_stride */
704 -1 /* default_opt_level */
705 };
706
707 static const cpu_prefetch_tune xgene1_prefetch_tune =
708 {
709 8, /* num_slots */
710 32, /* l1_cache_size */
711 64, /* l1_cache_line_size */
712 256, /* l2_cache_size */
713 true, /* prefetch_dynamic_strides */
714 -1, /* minimum_stride */
715 -1 /* default_opt_level */
716 };
717
718 static const struct tune_params generic_tunings =
719 {
720 &cortexa57_extra_costs,
721 &generic_addrcost_table,
722 &generic_regmove_cost,
723 &generic_vector_cost,
724 &generic_branch_cost,
725 &generic_approx_modes,
726 SVE_NOT_IMPLEMENTED, /* sve_width */
727 4, /* memmov_cost */
728 2, /* issue_rate */
729 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
730 "16:12", /* function_align. */
731 "4", /* jump_align. */
732 "8", /* loop_align. */
733 2, /* int_reassoc_width. */
734 4, /* fp_reassoc_width. */
735 1, /* vec_reassoc_width. */
736 2, /* min_div_recip_mul_sf. */
737 2, /* min_div_recip_mul_df. */
738 0, /* max_case_values. */
739 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
741 &generic_prefetch_tune
742 };
743
744 static const struct tune_params cortexa35_tunings =
745 {
746 &cortexa53_extra_costs,
747 &generic_addrcost_table,
748 &cortexa53_regmove_cost,
749 &generic_vector_cost,
750 &generic_branch_cost,
751 &generic_approx_modes,
752 SVE_NOT_IMPLEMENTED, /* sve_width */
753 4, /* memmov_cost */
754 1, /* issue_rate */
755 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
756 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
757 "16", /* function_align. */
758 "4", /* jump_align. */
759 "8", /* loop_align. */
760 2, /* int_reassoc_width. */
761 4, /* fp_reassoc_width. */
762 1, /* vec_reassoc_width. */
763 2, /* min_div_recip_mul_sf. */
764 2, /* min_div_recip_mul_df. */
765 0, /* max_case_values. */
766 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
767 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
768 &generic_prefetch_tune
769 };
770
771 static const struct tune_params cortexa53_tunings =
772 {
773 &cortexa53_extra_costs,
774 &generic_addrcost_table,
775 &cortexa53_regmove_cost,
776 &generic_vector_cost,
777 &generic_branch_cost,
778 &generic_approx_modes,
779 SVE_NOT_IMPLEMENTED, /* sve_width */
780 4, /* memmov_cost */
781 2, /* issue_rate */
782 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
783 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
784 "16", /* function_align. */
785 "4", /* jump_align. */
786 "8", /* loop_align. */
787 2, /* int_reassoc_width. */
788 4, /* fp_reassoc_width. */
789 1, /* vec_reassoc_width. */
790 2, /* min_div_recip_mul_sf. */
791 2, /* min_div_recip_mul_df. */
792 0, /* max_case_values. */
793 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
794 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
795 &generic_prefetch_tune
796 };
797
798 static const struct tune_params cortexa57_tunings =
799 {
800 &cortexa57_extra_costs,
801 &generic_addrcost_table,
802 &cortexa57_regmove_cost,
803 &cortexa57_vector_cost,
804 &generic_branch_cost,
805 &generic_approx_modes,
806 SVE_NOT_IMPLEMENTED, /* sve_width */
807 4, /* memmov_cost */
808 3, /* issue_rate */
809 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
810 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
811 "16", /* function_align. */
812 "4", /* jump_align. */
813 "8", /* loop_align. */
814 2, /* int_reassoc_width. */
815 4, /* fp_reassoc_width. */
816 1, /* vec_reassoc_width. */
817 2, /* min_div_recip_mul_sf. */
818 2, /* min_div_recip_mul_df. */
819 0, /* max_case_values. */
820 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
821 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
822 &generic_prefetch_tune
823 };
824
825 static const struct tune_params cortexa72_tunings =
826 {
827 &cortexa57_extra_costs,
828 &generic_addrcost_table,
829 &cortexa57_regmove_cost,
830 &cortexa57_vector_cost,
831 &generic_branch_cost,
832 &generic_approx_modes,
833 SVE_NOT_IMPLEMENTED, /* sve_width */
834 4, /* memmov_cost */
835 3, /* issue_rate */
836 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
837 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
838 "16", /* function_align. */
839 "4", /* jump_align. */
840 "8", /* loop_align. */
841 2, /* int_reassoc_width. */
842 4, /* fp_reassoc_width. */
843 1, /* vec_reassoc_width. */
844 2, /* min_div_recip_mul_sf. */
845 2, /* min_div_recip_mul_df. */
846 0, /* max_case_values. */
847 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
848 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
849 &generic_prefetch_tune
850 };
851
852 static const struct tune_params cortexa73_tunings =
853 {
854 &cortexa57_extra_costs,
855 &generic_addrcost_table,
856 &cortexa57_regmove_cost,
857 &cortexa57_vector_cost,
858 &generic_branch_cost,
859 &generic_approx_modes,
860 SVE_NOT_IMPLEMENTED, /* sve_width */
861 4, /* memmov_cost. */
862 2, /* issue_rate. */
863 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
864 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
865 "16", /* function_align. */
866 "4", /* jump_align. */
867 "8", /* loop_align. */
868 2, /* int_reassoc_width. */
869 4, /* fp_reassoc_width. */
870 1, /* vec_reassoc_width. */
871 2, /* min_div_recip_mul_sf. */
872 2, /* min_div_recip_mul_df. */
873 0, /* max_case_values. */
874 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
875 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
876 &generic_prefetch_tune
877 };
878
879
880
881 static const struct tune_params exynosm1_tunings =
882 {
883 &exynosm1_extra_costs,
884 &exynosm1_addrcost_table,
885 &exynosm1_regmove_cost,
886 &exynosm1_vector_cost,
887 &generic_branch_cost,
888 &exynosm1_approx_modes,
889 SVE_NOT_IMPLEMENTED, /* sve_width */
890 4, /* memmov_cost */
891 3, /* issue_rate */
892 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
893 "4", /* function_align. */
894 "4", /* jump_align. */
895 "4", /* loop_align. */
896 2, /* int_reassoc_width. */
897 4, /* fp_reassoc_width. */
898 1, /* vec_reassoc_width. */
899 2, /* min_div_recip_mul_sf. */
900 2, /* min_div_recip_mul_df. */
901 48, /* max_case_values. */
902 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
903 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
904 &exynosm1_prefetch_tune
905 };
906
907 static const struct tune_params thunderxt88_tunings =
908 {
909 &thunderx_extra_costs,
910 &generic_addrcost_table,
911 &thunderx_regmove_cost,
912 &thunderx_vector_cost,
913 &generic_branch_cost,
914 &generic_approx_modes,
915 SVE_NOT_IMPLEMENTED, /* sve_width */
916 6, /* memmov_cost */
917 2, /* issue_rate */
918 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
919 "8", /* function_align. */
920 "8", /* jump_align. */
921 "8", /* loop_align. */
922 2, /* int_reassoc_width. */
923 4, /* fp_reassoc_width. */
924 1, /* vec_reassoc_width. */
925 2, /* min_div_recip_mul_sf. */
926 2, /* min_div_recip_mul_df. */
927 0, /* max_case_values. */
928 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
929 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
930 &thunderxt88_prefetch_tune
931 };
932
933 static const struct tune_params thunderx_tunings =
934 {
935 &thunderx_extra_costs,
936 &generic_addrcost_table,
937 &thunderx_regmove_cost,
938 &thunderx_vector_cost,
939 &generic_branch_cost,
940 &generic_approx_modes,
941 SVE_NOT_IMPLEMENTED, /* sve_width */
942 6, /* memmov_cost */
943 2, /* issue_rate */
944 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
945 "8", /* function_align. */
946 "8", /* jump_align. */
947 "8", /* loop_align. */
948 2, /* int_reassoc_width. */
949 4, /* fp_reassoc_width. */
950 1, /* vec_reassoc_width. */
951 2, /* min_div_recip_mul_sf. */
952 2, /* min_div_recip_mul_df. */
953 0, /* max_case_values. */
954 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
955 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
956 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
957 &thunderx_prefetch_tune
958 };
959
960 static const struct tune_params tsv110_tunings =
961 {
962 &tsv110_extra_costs,
963 &tsv110_addrcost_table,
964 &tsv110_regmove_cost,
965 &tsv110_vector_cost,
966 &generic_branch_cost,
967 &generic_approx_modes,
968 SVE_NOT_IMPLEMENTED, /* sve_width */
969 4, /* memmov_cost */
970 4, /* issue_rate */
971 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
972 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
973 "16", /* function_align. */
974 "4", /* jump_align. */
975 "8", /* loop_align. */
976 2, /* int_reassoc_width. */
977 4, /* fp_reassoc_width. */
978 1, /* vec_reassoc_width. */
979 2, /* min_div_recip_mul_sf. */
980 2, /* min_div_recip_mul_df. */
981 0, /* max_case_values. */
982 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
983 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
984 &tsv110_prefetch_tune
985 };
986
987 static const struct tune_params xgene1_tunings =
988 {
989 &xgene1_extra_costs,
990 &xgene1_addrcost_table,
991 &xgene1_regmove_cost,
992 &xgene1_vector_cost,
993 &generic_branch_cost,
994 &xgene1_approx_modes,
995 SVE_NOT_IMPLEMENTED, /* sve_width */
996 6, /* memmov_cost */
997 4, /* issue_rate */
998 AARCH64_FUSE_NOTHING, /* fusible_ops */
999 "16", /* function_align. */
1000 "16", /* jump_align. */
1001 "16", /* loop_align. */
1002 2, /* int_reassoc_width. */
1003 4, /* fp_reassoc_width. */
1004 1, /* vec_reassoc_width. */
1005 2, /* min_div_recip_mul_sf. */
1006 2, /* min_div_recip_mul_df. */
1007 17, /* max_case_values. */
1008 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1009 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1010 &xgene1_prefetch_tune
1011 };
1012
1013 static const struct tune_params emag_tunings =
1014 {
1015 &xgene1_extra_costs,
1016 &xgene1_addrcost_table,
1017 &xgene1_regmove_cost,
1018 &xgene1_vector_cost,
1019 &generic_branch_cost,
1020 &xgene1_approx_modes,
1021 SVE_NOT_IMPLEMENTED, /* sve_width */
1022 6, /* memmov_cost */
1023 4, /* issue_rate */
1024 AARCH64_FUSE_NOTHING, /* fusible_ops */
1025 "16", /* function_align. */
1026 "16", /* jump_align. */
1027 "16", /* loop_align. */
1028 2, /* int_reassoc_width. */
1029 4, /* fp_reassoc_width. */
1030 1, /* vec_reassoc_width. */
1031 2, /* min_div_recip_mul_sf. */
1032 2, /* min_div_recip_mul_df. */
1033 17, /* max_case_values. */
1034 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1035 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1036 &xgene1_prefetch_tune
1037 };
1038
1039 static const struct tune_params qdf24xx_tunings =
1040 {
1041 &qdf24xx_extra_costs,
1042 &qdf24xx_addrcost_table,
1043 &qdf24xx_regmove_cost,
1044 &qdf24xx_vector_cost,
1045 &generic_branch_cost,
1046 &generic_approx_modes,
1047 SVE_NOT_IMPLEMENTED, /* sve_width */
1048 4, /* memmov_cost */
1049 4, /* issue_rate */
1050 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1051 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1052 "16", /* function_align. */
1053 "8", /* jump_align. */
1054 "16", /* loop_align. */
1055 2, /* int_reassoc_width. */
1056 4, /* fp_reassoc_width. */
1057 1, /* vec_reassoc_width. */
1058 2, /* min_div_recip_mul_sf. */
1059 2, /* min_div_recip_mul_df. */
1060 0, /* max_case_values. */
1061 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1062 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1063 &qdf24xx_prefetch_tune
1064 };
1065
1066 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1067 for now. */
1068 static const struct tune_params saphira_tunings =
1069 {
1070 &generic_extra_costs,
1071 &generic_addrcost_table,
1072 &generic_regmove_cost,
1073 &generic_vector_cost,
1074 &generic_branch_cost,
1075 &generic_approx_modes,
1076 SVE_NOT_IMPLEMENTED, /* sve_width */
1077 4, /* memmov_cost */
1078 4, /* issue_rate */
1079 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1080 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1081 "16", /* function_align. */
1082 "8", /* jump_align. */
1083 "16", /* loop_align. */
1084 2, /* int_reassoc_width. */
1085 4, /* fp_reassoc_width. */
1086 1, /* vec_reassoc_width. */
1087 2, /* min_div_recip_mul_sf. */
1088 2, /* min_div_recip_mul_df. */
1089 0, /* max_case_values. */
1090 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1091 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1092 &generic_prefetch_tune
1093 };
1094
1095 static const struct tune_params thunderx2t99_tunings =
1096 {
1097 &thunderx2t99_extra_costs,
1098 &thunderx2t99_addrcost_table,
1099 &thunderx2t99_regmove_cost,
1100 &thunderx2t99_vector_cost,
1101 &generic_branch_cost,
1102 &generic_approx_modes,
1103 SVE_NOT_IMPLEMENTED, /* sve_width */
1104 4, /* memmov_cost. */
1105 4, /* issue_rate. */
1106 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1107 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1108 "16", /* function_align. */
1109 "8", /* jump_align. */
1110 "16", /* loop_align. */
1111 3, /* int_reassoc_width. */
1112 2, /* fp_reassoc_width. */
1113 2, /* vec_reassoc_width. */
1114 2, /* min_div_recip_mul_sf. */
1115 2, /* min_div_recip_mul_df. */
1116 0, /* max_case_values. */
1117 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1118 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1119 &thunderx2t99_prefetch_tune
1120 };
1121
1122 static const struct tune_params neoversen1_tunings =
1123 {
1124 &cortexa57_extra_costs,
1125 &generic_addrcost_table,
1126 &generic_regmove_cost,
1127 &cortexa57_vector_cost,
1128 &generic_branch_cost,
1129 &generic_approx_modes,
1130 SVE_NOT_IMPLEMENTED, /* sve_width */
1131 4, /* memmov_cost */
1132 3, /* issue_rate */
1133 AARCH64_FUSE_AES_AESMC, /* fusible_ops */
1134 "32:16", /* function_align. */
1135 "32:16", /* jump_align. */
1136 "32:16", /* loop_align. */
1137 2, /* int_reassoc_width. */
1138 4, /* fp_reassoc_width. */
1139 2, /* vec_reassoc_width. */
1140 2, /* min_div_recip_mul_sf. */
1141 2, /* min_div_recip_mul_df. */
1142 0, /* max_case_values. */
1143 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1144 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1145 &generic_prefetch_tune
1146 };
1147
1148 /* Support for fine-grained override of the tuning structures. */
1149 struct aarch64_tuning_override_function
1150 {
1151 const char* name;
1152 void (*parse_override)(const char*, struct tune_params*);
1153 };
1154
1155 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1156 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1157 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1158
1159 static const struct aarch64_tuning_override_function
1160 aarch64_tuning_override_functions[] =
1161 {
1162 { "fuse", aarch64_parse_fuse_string },
1163 { "tune", aarch64_parse_tune_string },
1164 { "sve_width", aarch64_parse_sve_width_string },
1165 { NULL, NULL }
1166 };
1167
1168 /* A processor implementing AArch64. */
1169 struct processor
1170 {
1171 const char *const name;
1172 enum aarch64_processor ident;
1173 enum aarch64_processor sched_core;
1174 enum aarch64_arch arch;
1175 unsigned architecture_version;
1176 const uint64_t flags;
1177 const struct tune_params *const tune;
1178 };
1179
1180 /* Architectures implementing AArch64. */
1181 static const struct processor all_architectures[] =
1182 {
1183 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1184 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1185 #include "aarch64-arches.def"
1186 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1187 };
1188
1189 /* Processor cores implementing AArch64. */
1190 static const struct processor all_cores[] =
1191 {
1192 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1193 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1194 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1195 FLAGS, &COSTS##_tunings},
1196 #include "aarch64-cores.def"
1197 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1198 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1199 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1200 };
1201
1202
1203 /* Target specification. These are populated by the -march, -mtune, -mcpu
1204 handling code or by target attributes. */
1205 static const struct processor *selected_arch;
1206 static const struct processor *selected_cpu;
1207 static const struct processor *selected_tune;
1208
1209 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1210
1211 /* The current tuning set. */
1212 struct tune_params aarch64_tune_params = generic_tunings;
1213
1214 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1215
1216 static tree
1217 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1218 int, bool *no_add_attrs)
1219 {
1220 /* Since we set fn_type_req to true, the caller should have checked
1221 this for us. */
1222 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1223 switch ((arm_pcs) fntype_abi (*node).id ())
1224 {
1225 case ARM_PCS_AAPCS64:
1226 case ARM_PCS_SIMD:
1227 return NULL_TREE;
1228
1229 case ARM_PCS_SVE:
1230 error ("the %qE attribute cannot be applied to an SVE function type",
1231 name);
1232 *no_add_attrs = true;
1233 return NULL_TREE;
1234
1235 case ARM_PCS_TLSDESC:
1236 case ARM_PCS_UNKNOWN:
1237 break;
1238 }
1239 gcc_unreachable ();
1240 }
1241
1242 /* Table of machine attributes. */
1243 static const struct attribute_spec aarch64_attribute_table[] =
1244 {
1245 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1246 affects_type_identity, handler, exclude } */
1247 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1248 handle_aarch64_vector_pcs_attribute, NULL },
1249 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1250 };
1251
1252 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1253
1254 /* An ISA extension in the co-processor and main instruction set space. */
1255 struct aarch64_option_extension
1256 {
1257 const char *const name;
1258 const unsigned long flags_on;
1259 const unsigned long flags_off;
1260 };
1261
1262 typedef enum aarch64_cond_code
1263 {
1264 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1265 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1266 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1267 }
1268 aarch64_cc;
1269
1270 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1271
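/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT: the
   enumeration above is laid out so that each condition and its inverse
   differ only in the least significant bit.  */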
1272 struct aarch64_branch_protect_type
1273 {
1274 /* The type's name that the user passes to the branch-protection option
1275 string. */
1276 const char* name;
1277 /* Function to handle the protection type and set global variables.
1278 First argument is the string token corresponding to this type and the
1279 second argument is the next token in the option string.
1280 Return values:
1281 * AARCH64_PARSE_OK: Handling was successful.
1282 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1283 should print an error.
1284 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1285 own error. */
1286 enum aarch64_parse_opt_result (*handler)(char*, char*);
1287 /* A list of types that can follow this type in the option string. */
1288 const aarch64_branch_protect_type* subtypes;
1289 unsigned int num_subtypes;
1290 };
1291
1292 static enum aarch64_parse_opt_result
1293 aarch64_handle_no_branch_protection (char* str, char* rest)
1294 {
1295 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1296 aarch64_enable_bti = 0;
1297 if (rest)
1298 {
1299 error ("unexpected %<%s%> after %<%s%>", rest, str);
1300 return AARCH64_PARSE_INVALID_FEATURE;
1301 }
1302 return AARCH64_PARSE_OK;
1303 }
1304
1305 static enum aarch64_parse_opt_result
1306 aarch64_handle_standard_branch_protection (char* str, char* rest)
1307 {
1308 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1309 aarch64_ra_sign_key = AARCH64_KEY_A;
1310 aarch64_enable_bti = 1;
1311 if (rest)
1312 {
1313 error ("unexpected %<%s%> after %<%s%>", rest, str);
1314 return AARCH64_PARSE_INVALID_FEATURE;
1315 }
1316 return AARCH64_PARSE_OK;
1317 }
1318
1319 static enum aarch64_parse_opt_result
1320 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1321 char* rest ATTRIBUTE_UNUSED)
1322 {
1323 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1324 aarch64_ra_sign_key = AARCH64_KEY_A;
1325 return AARCH64_PARSE_OK;
1326 }
1327
1328 static enum aarch64_parse_opt_result
1329 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1330 char* rest ATTRIBUTE_UNUSED)
1331 {
1332 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1333 return AARCH64_PARSE_OK;
1334 }
1335
1336 static enum aarch64_parse_opt_result
1337 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1338 char* rest ATTRIBUTE_UNUSED)
1339 {
1340 aarch64_ra_sign_key = AARCH64_KEY_B;
1341 return AARCH64_PARSE_OK;
1342 }
1343
1344 static enum aarch64_parse_opt_result
1345 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1346 char* rest ATTRIBUTE_UNUSED)
1347 {
1348 aarch64_enable_bti = 1;
1349 return AARCH64_PARSE_OK;
1350 }
1351
1352 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1353 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1354 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1355 { NULL, NULL, NULL, 0 }
1356 };
1357
1358 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1359 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1360 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1361 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1362 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1363 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1364 { NULL, NULL, NULL, 0 }
1365 };
1366
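/* As an illustration, parsing "pac-ret+leaf+b-key+bti" against these
   tables first invokes aarch64_handle_pac_ret_protection (non-leaf
   scope, A key), then the "leaf" and "b-key" subtype handlers (which
   widen the scope to all functions and switch to the B key), and
   finally aarch64_handle_bti_protection to enable BTI.  */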
1367 /* The condition codes of the processor, and the inverse function. */
1368 static const char * const aarch64_condition_codes[] =
1369 {
1370 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1371 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1372 };
1373
1374 /* The preferred condition codes for SVE conditions. */
1375 static const char *const aarch64_sve_condition_codes[] =
1376 {
1377 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1378 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1379 };
1380
1381 /* Return the assembly token for svpattern value VALUE. */
1382
1383 static const char *
1384 svpattern_token (enum aarch64_svpattern pattern)
1385 {
1386 switch (pattern)
1387 {
1388 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1389 AARCH64_FOR_SVPATTERN (CASE)
1390 #undef CASE
1391 case AARCH64_NUM_SVPATTERNS:
1392 break;
1393 }
1394 gcc_unreachable ();
1395 }
1396
1397 /* Return the descriptor of the SIMD ABI. */
1398
1399 static const predefined_function_abi &
1400 aarch64_simd_abi (void)
1401 {
1402 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1403 if (!simd_abi.initialized_p ())
1404 {
1405 HARD_REG_SET full_reg_clobbers
1406 = default_function_abi.full_reg_clobbers ();
1407 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1408 if (FP_SIMD_SAVED_REGNUM_P (regno))
1409 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1410 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1411 }
1412 return simd_abi;
1413 }
1414
1415 /* Return the descriptor of the SVE PCS. */
1416
1417 static const predefined_function_abi &
1418 aarch64_sve_abi (void)
1419 {
1420 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1421 if (!sve_abi.initialized_p ())
1422 {
1423 HARD_REG_SET full_reg_clobbers
1424 = default_function_abi.full_reg_clobbers ();
1425 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1426 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1427 for (int regno = P4_REGNUM; regno <= P11_REGNUM; ++regno)
1428 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1429 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1430 }
1431 return sve_abi;
1432 }
1433
1434 /* Generate code to enable conditional branches in functions over 1 MiB. */
1435 const char *
1436 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1437 const char * branch_format)
1438 {
1439 rtx_code_label * tmp_label = gen_label_rtx ();
1440 char label_buf[256];
1441 char buffer[128];
1442 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1443 CODE_LABEL_NUMBER (tmp_label));
1444 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1445 rtx dest_label = operands[pos_label];
1446 operands[pos_label] = tmp_label;
1447
1448 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1449 output_asm_insn (buffer, operands);
1450
1451 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1452 operands[pos_label] = dest_label;
1453 output_asm_insn (buffer, operands);
1454 return "";
1455 }
1456
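/* For example, when a conditional branch is too far from its target,
   the caller passes the inverted condition in BRANCH_FORMAT, so the
   emitted sequence is roughly:

	<inverted conditional branch>	.L<DEST><N>
	b	<original target>
   .L<DEST><N>:

   i.e. the limited-range conditional branch only needs to skip the
   unconditional "b", which can reach the real target.  */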
1457 void
1458 aarch64_err_no_fpadvsimd (machine_mode mode)
1459 {
1460 if (TARGET_GENERAL_REGS_ONLY)
1461 if (FLOAT_MODE_P (mode))
1462 error ("%qs is incompatible with the use of floating-point types",
1463 "-mgeneral-regs-only");
1464 else
1465 error ("%qs is incompatible with the use of vector types",
1466 "-mgeneral-regs-only");
1467 else
1468 if (FLOAT_MODE_P (mode))
1469 error ("%qs feature modifier is incompatible with the use of"
1470 " floating-point types", "+nofp");
1471 else
1472 error ("%qs feature modifier is incompatible with the use of"
1473 " vector types", "+nofp");
1474 }
1475
1476 /* Report when we try to do something that requires SVE when SVE is disabled.
1477 This is an error of last resort and isn't very high-quality. It usually
1478 involves attempts to measure the vector length in some way. */
1479 static void
1480 aarch64_report_sve_required (void)
1481 {
1482 static bool reported_p = false;
1483
1484 /* Avoid reporting a slew of messages for a single oversight. */
1485 if (reported_p)
1486 return;
1487
1488 error ("this operation requires the SVE ISA extension");
1489 inform (input_location, "you can enable SVE using the command-line"
1490 " option %<-march%>, or by using the %<target%>"
1491 " attribute or pragma");
1492 reported_p = true;
1493 }
1494
1495 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1496 registers. */
1497 inline bool
1498 pr_or_ffr_regnum_p (unsigned int regno)
1499 {
1500 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1501 }
1502
1503 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1504 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1505 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1506 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1507 and GENERAL_REGS is lower than the memory cost (in this case the best class
1508 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1509 cost results in bad allocations with many redundant int<->FP moves which
1510 are expensive on various cores.
1511 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1512 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1513 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1514 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1515 The result of this is that it is no longer inefficient to have a higher
1516 memory move cost than the register move cost.
1517 */
1518
1519 static reg_class_t
1520 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1521 reg_class_t best_class)
1522 {
1523 machine_mode mode;
1524
1525 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1526 || !reg_class_subset_p (FP_REGS, allocno_class))
1527 return allocno_class;
1528
1529 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1530 || !reg_class_subset_p (FP_REGS, best_class))
1531 return best_class;
1532
1533 mode = PSEUDO_REGNO_MODE (regno);
1534 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1535 }
1536
1537 static unsigned int
1538 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1539 {
1540 if (GET_MODE_UNIT_SIZE (mode) == 4)
1541 return aarch64_tune_params.min_div_recip_mul_sf;
1542 return aarch64_tune_params.min_div_recip_mul_df;
1543 }
1544
1545 /* Return the reassociation width of treeop OPC with mode MODE. */
1546 static int
1547 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1548 {
1549 if (VECTOR_MODE_P (mode))
1550 return aarch64_tune_params.vec_reassoc_width;
1551 if (INTEGRAL_MODE_P (mode))
1552 return aarch64_tune_params.int_reassoc_width;
1553 /* Avoid reassociating floating point addition so we emit more FMAs. */
1554 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1555 return aarch64_tune_params.fp_reassoc_width;
1556 return 1;
1557 }
1558
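/* With generic_tunings, for example, this gives a reassociation width of
   2 for integer modes, 4 for floating-point modes other than additions
   (which return 1 so that FMA formation is not hindered) and 1 for
   vector modes.  */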
1559 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1560 unsigned
1561 aarch64_dbx_register_number (unsigned regno)
1562 {
1563 if (GP_REGNUM_P (regno))
1564 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1565 else if (regno == SP_REGNUM)
1566 return AARCH64_DWARF_SP;
1567 else if (FP_REGNUM_P (regno))
1568 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1569 else if (PR_REGNUM_P (regno))
1570 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1571 else if (regno == VG_REGNUM)
1572 return AARCH64_DWARF_VG;
1573
1574 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1575 equivalent DWARF register. */
1576 return DWARF_FRAME_REGISTERS;
1577 }
1578
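/* Under this mapping x0-x30 become DWARF registers 0-30, sp becomes 31,
   v0-v31 become 64-95, p0-p15 become 48-63 and the SVE vector granule
   register VG becomes 46, following the AArch64 DWARF register
   numbering.  */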
1579 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1580 integer, otherwise return X unmodified. */
1581 static rtx
1582 aarch64_bit_representation (rtx x)
1583 {
1584 if (CONST_DOUBLE_P (x))
1585 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1586 return x;
1587 }
1588
1589 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1590 static bool
1591 aarch64_advsimd_struct_mode_p (machine_mode mode)
1592 {
1593 return (TARGET_SIMD
1594 && (mode == OImode || mode == CImode || mode == XImode));
1595 }
1596
1597 /* Return true if MODE is an SVE predicate mode. */
1598 static bool
1599 aarch64_sve_pred_mode_p (machine_mode mode)
1600 {
1601 return (TARGET_SVE
1602 && (mode == VNx16BImode
1603 || mode == VNx8BImode
1604 || mode == VNx4BImode
1605 || mode == VNx2BImode));
1606 }
1607
1608 /* Three mutually-exclusive flags describing a vector or predicate type. */
1609 const unsigned int VEC_ADVSIMD = 1;
1610 const unsigned int VEC_SVE_DATA = 2;
1611 const unsigned int VEC_SVE_PRED = 4;
1612 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1613 a structure of 2, 3 or 4 vectors. */
1614 const unsigned int VEC_STRUCT = 8;
1615 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1616 vector has fewer significant bytes than a full SVE vector. */
1617 const unsigned int VEC_PARTIAL = 16;
1618 /* Useful combinations of the above. */
1619 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1620 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1621
1622 /* Return a set of flags describing the vector properties of mode MODE.
1623 Ignore modes that are not supported by the current target. */
1624 static unsigned int
1625 aarch64_classify_vector_mode (machine_mode mode)
1626 {
1627 if (aarch64_advsimd_struct_mode_p (mode))
1628 return VEC_ADVSIMD | VEC_STRUCT;
1629
1630 if (aarch64_sve_pred_mode_p (mode))
1631 return VEC_SVE_PRED;
1632
1633 /* Make the decision based on the mode's enum value rather than its
1634 properties, so that we keep the correct classification regardless
1635 of -msve-vector-bits. */
1636 switch (mode)
1637 {
1638 /* Partial SVE QI vectors. */
1639 case E_VNx2QImode:
1640 case E_VNx4QImode:
1641 case E_VNx8QImode:
1642 /* Partial SVE HI vectors. */
1643 case E_VNx2HImode:
1644 case E_VNx4HImode:
1645 /* Partial SVE SI vector. */
1646 case E_VNx2SImode:
1647 /* Partial SVE HF vectors. */
1648 case E_VNx2HFmode:
1649 case E_VNx4HFmode:
1650 /* Partial SVE SF vector. */
1651 case E_VNx2SFmode:
1652 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1653
1654 case E_VNx16QImode:
1655 case E_VNx8HImode:
1656 case E_VNx4SImode:
1657 case E_VNx2DImode:
1658 case E_VNx8HFmode:
1659 case E_VNx4SFmode:
1660 case E_VNx2DFmode:
1661 return TARGET_SVE ? VEC_SVE_DATA : 0;
1662
1663 /* x2 SVE vectors. */
1664 case E_VNx32QImode:
1665 case E_VNx16HImode:
1666 case E_VNx8SImode:
1667 case E_VNx4DImode:
1668 case E_VNx16HFmode:
1669 case E_VNx8SFmode:
1670 case E_VNx4DFmode:
1671 /* x3 SVE vectors. */
1672 case E_VNx48QImode:
1673 case E_VNx24HImode:
1674 case E_VNx12SImode:
1675 case E_VNx6DImode:
1676 case E_VNx24HFmode:
1677 case E_VNx12SFmode:
1678 case E_VNx6DFmode:
1679 /* x4 SVE vectors. */
1680 case E_VNx64QImode:
1681 case E_VNx32HImode:
1682 case E_VNx16SImode:
1683 case E_VNx8DImode:
1684 case E_VNx32HFmode:
1685 case E_VNx16SFmode:
1686 case E_VNx8DFmode:
1687 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
1688
1689 /* 64-bit Advanced SIMD vectors. */
1690 case E_V8QImode:
1691 case E_V4HImode:
1692 case E_V2SImode:
1693 /* ...E_V1DImode doesn't exist. */
1694 case E_V4HFmode:
1695 case E_V2SFmode:
1696 case E_V1DFmode:
1697 /* 128-bit Advanced SIMD vectors. */
1698 case E_V16QImode:
1699 case E_V8HImode:
1700 case E_V4SImode:
1701 case E_V2DImode:
1702 case E_V8HFmode:
1703 case E_V4SFmode:
1704 case E_V2DFmode:
1705 return TARGET_SIMD ? VEC_ADVSIMD : 0;
1706
1707 default:
1708 return 0;
1709 }
1710 }
1711
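/* For example, with SVE enabled the function above classifies VNx4SImode
   as VEC_SVE_DATA, VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL (32-bit
   elements in 64-bit containers) and VNx32QImode as
   VEC_SVE_DATA | VEC_STRUCT (a tuple of two vectors), while V4SImode is
   VEC_ADVSIMD when Advanced SIMD is enabled.  */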
1712 /* Return true if MODE is any of the data vector modes, including
1713 structure modes. */
1714 static bool
1715 aarch64_vector_data_mode_p (machine_mode mode)
1716 {
1717 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1718 }
1719
1720 /* Return true if MODE is any form of SVE mode, including predicates,
1721 vectors and structures. */
1722 bool
1723 aarch64_sve_mode_p (machine_mode mode)
1724 {
1725 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1726 }
1727
1728 /* Return true if MODE is an SVE data vector mode; either a single vector
1729 or a structure of vectors. */
1730 static bool
1731 aarch64_sve_data_mode_p (machine_mode mode)
1732 {
1733 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1734 }
1735
1736 /* Return the number of defined bytes in one constituent vector of
1737 SVE mode MODE, which has vector flags VEC_FLAGS. */
1738 static poly_int64
1739 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1740 {
1741 if (vec_flags & VEC_PARTIAL)
1742 /* A single partial vector. */
1743 return GET_MODE_SIZE (mode);
1744
1745 if (vec_flags & VEC_SVE_DATA)
1746 /* A single vector or a tuple. */
1747 return BYTES_PER_SVE_VECTOR;
1748
1749 /* A single predicate. */
1750 gcc_assert (vec_flags & VEC_SVE_PRED);
1751 return BYTES_PER_SVE_PRED;
1752 }
1753
1754 /* Implement target hook TARGET_ARRAY_MODE. */
1755 static opt_machine_mode
1756 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1757 {
1758 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1759 && IN_RANGE (nelems, 2, 4))
1760 return mode_for_vector (GET_MODE_INNER (mode),
1761 GET_MODE_NUNITS (mode) * nelems);
1762
1763 return opt_machine_mode ();
1764 }
1765
1766 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1767 static bool
1768 aarch64_array_mode_supported_p (machine_mode mode,
1769 unsigned HOST_WIDE_INT nelems)
1770 {
1771 if (TARGET_SIMD
1772 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1773 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1774 && (nelems >= 2 && nelems <= 4))
1775 return true;
1776
1777 return false;
1778 }
1779
1780 /* MODE is some form of SVE vector mode. For data modes, return the number
1781 of vector register bits that each element of MODE occupies, such as 64
1782 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1783 in a 64-bit container). For predicate modes, return the number of
1784 data bits controlled by each significant predicate bit. */
1785
1786 static unsigned int
1787 aarch64_sve_container_bits (machine_mode mode)
1788 {
1789 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1790 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1791 ? BITS_PER_SVE_VECTOR
1792 : GET_MODE_BITSIZE (mode));
1793 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1794 }
1795
1796 /* Return the SVE predicate mode to use for elements that have
1797 ELEM_NBYTES bytes, if such a mode exists. */
1798
1799 opt_machine_mode
1800 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1801 {
1802 if (TARGET_SVE)
1803 {
1804 if (elem_nbytes == 1)
1805 return VNx16BImode;
1806 if (elem_nbytes == 2)
1807 return VNx8BImode;
1808 if (elem_nbytes == 4)
1809 return VNx4BImode;
1810 if (elem_nbytes == 8)
1811 return VNx2BImode;
1812 }
1813 return opt_machine_mode ();
1814 }
1815
1816 /* Return the SVE predicate mode that should be used to control
1817 SVE mode MODE. */
1818
1819 machine_mode
1820 aarch64_sve_pred_mode (machine_mode mode)
1821 {
1822 unsigned int bits = aarch64_sve_container_bits (mode);
1823 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
1824 }
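
/* As an illustration: full SVE vectors such as VNx16QImode, VNx8HImode,
VNx4SImode and VNx2DImode map to VNx16BImode, VNx8BImode, VNx4BImode
and VNx2BImode respectively, while a partial vector such as VNx2SImode
(32-bit values stored in 64-bit containers) also maps to VNx2BImode,
because the predicate mode is chosen per container rather than per
element. */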
1825
1826 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1827
1828 static opt_machine_mode
1829 aarch64_get_mask_mode (machine_mode mode)
1830 {
1831 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1832 if (vec_flags & VEC_SVE_DATA)
1833 return aarch64_sve_pred_mode (mode);
1834
1835 return default_get_mask_mode (mode);
1836 }
1837
1838 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1839
1840 opt_machine_mode
1841 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1842 {
1843 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1844 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1845 machine_mode mode;
1846 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1847 if (inner_mode == GET_MODE_INNER (mode)
1848 && known_eq (nunits, GET_MODE_NUNITS (mode))
1849 && aarch64_sve_data_mode_p (mode))
1850 return mode;
1851 return opt_machine_mode ();
1852 }
1853
1854 /* Return the integer element mode associated with SVE mode MODE. */
1855
1856 static scalar_int_mode
1857 aarch64_sve_element_int_mode (machine_mode mode)
1858 {
1859 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1860 ? BITS_PER_SVE_VECTOR
1861 : GET_MODE_BITSIZE (mode));
1862 unsigned int elt_bits = vector_element_size (vector_bits,
1863 GET_MODE_NUNITS (mode));
1864 return int_mode_for_size (elt_bits, 0).require ();
1865 }
1866
1867 /* Return an integer element mode that contains exactly
1868 aarch64_sve_container_bits (MODE) bits. This is wider than
1869 aarch64_sve_element_int_mode if MODE is a partial vector,
1870 otherwise it's the same. */
1871
1872 static scalar_int_mode
1873 aarch64_sve_container_int_mode (machine_mode mode)
1874 {
1875 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1876 }
1877
1878 /* Return the integer vector mode associated with SVE mode MODE.
1879 Unlike related_int_vector_mode, this can handle the case in which
1880 MODE is a predicate (and thus has a different total size). */
1881
1882 machine_mode
1883 aarch64_sve_int_mode (machine_mode mode)
1884 {
1885 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1886 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1887 }
1888
1889 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1890
1891 static opt_machine_mode
1892 aarch64_vectorize_related_mode (machine_mode vector_mode,
1893 scalar_mode element_mode,
1894 poly_uint64 nunits)
1895 {
1896 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1897
1898 /* If we're operating on SVE vectors, try to return an SVE mode. */
1899 poly_uint64 sve_nunits;
1900 if ((vec_flags & VEC_SVE_DATA)
1901 && multiple_p (BYTES_PER_SVE_VECTOR,
1902 GET_MODE_SIZE (element_mode), &sve_nunits))
1903 {
1904 machine_mode sve_mode;
1905 if (maybe_ne (nunits, 0U))
1906 {
1907 /* Try to find a full or partial SVE mode with exactly
1908 NUNITS units. */
1909 if (multiple_p (sve_nunits, nunits)
1910 && aarch64_sve_data_mode (element_mode,
1911 nunits).exists (&sve_mode))
1912 return sve_mode;
1913 }
1914 else
1915 {
1916 /* Take the preferred number of units from the number of bytes
1917 that fit in VECTOR_MODE. We always start by "autodetecting"
1918 a full vector mode with preferred_simd_mode, so vectors
1919 chosen here will also be full vector modes. Then
1920 autovectorize_vector_modes tries smaller starting modes
1921 and thus smaller preferred numbers of units. */
1922 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1923 if (aarch64_sve_data_mode (element_mode,
1924 sve_nunits).exists (&sve_mode))
1925 return sve_mode;
1926 }
1927 }
1928
1929 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1930 if ((vec_flags & VEC_ADVSIMD)
1931 && known_eq (nunits, 0U)
1932 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1933 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1934 * GET_MODE_NUNITS (vector_mode), 128U))
1935 {
1936 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1937 if (VECTOR_MODE_P (res))
1938 return res;
1939 }
1940
1941 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
1942 }
1943
1944 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1945 prefer to use the first arithmetic operand as the else value if
1946 the else value doesn't matter, since that exactly matches the SVE
1947 destructive merging form. For ternary operations we could either
1948 pick the first operand and use FMAD-like instructions or the last
1949 operand and use FMLA-like instructions; the latter seems more
1950 natural. */
1951
1952 static tree
1953 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1954 {
1955 return nops == 3 ? ops[2] : ops[0];
1956 }
1957
1958 /* Implement TARGET_HARD_REGNO_NREGS. */
1959
1960 static unsigned int
1961 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1962 {
1963 /* ??? Logically we should only need to provide a value when
1964 HARD_REGNO_MODE_OK says that the combination is valid,
1965 but at the moment we need to handle all modes. Just ignore
1966 any runtime parts for registers that can't store them. */
1967 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1968 switch (aarch64_regno_regclass (regno))
1969 {
1970 case FP_REGS:
1971 case FP_LO_REGS:
1972 case FP_LO8_REGS:
1973 {
1974 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1975 if (vec_flags & VEC_SVE_DATA)
1976 return exact_div (GET_MODE_SIZE (mode),
1977 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
1978 return CEIL (lowest_size, UNITS_PER_VREG);
1979 }
1980 case PR_REGS:
1981 case PR_LO_REGS:
1982 case PR_HI_REGS:
1983 case FFR_REGS:
1984 case PR_AND_FFR_REGS:
1985 return 1;
1986 default:
1987 return CEIL (lowest_size, UNITS_PER_WORD);
1988 }
1989 gcc_unreachable ();
1990 }
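
/* Rough examples of the above: TImode in the general registers takes
CEIL (16, UNITS_PER_WORD) == 2 registers; TFmode in a vector register
takes CEIL (16, UNITS_PER_VREG) == 1; an SVE data mode made up of N
constituent vectors takes exactly N Z registers, regardless of the
runtime vector length; and any predicate mode takes a single P
register. */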
1991
1992 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1993
1994 static bool
1995 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1996 {
1997 if (GET_MODE_CLASS (mode) == MODE_CC)
1998 return regno == CC_REGNUM;
1999
2000 if (regno == VG_REGNUM)
2001 /* This must have the same size as _Unwind_Word. */
2002 return mode == DImode;
2003
2004 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2005 if (vec_flags & VEC_SVE_PRED)
2006 return pr_or_ffr_regnum_p (regno);
2007
2008 if (pr_or_ffr_regnum_p (regno))
2009 return false;
2010
2011 if (regno == SP_REGNUM)
2012 /* The purpose of comparing with ptr_mode is to support the
2013 global register variable associated with the stack pointer
2014 register via the syntax of asm ("wsp") in ILP32. */
2015 return mode == Pmode || mode == ptr_mode;
2016
2017 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2018 return mode == Pmode;
2019
2020 if (GP_REGNUM_P (regno))
2021 {
2022 if (vec_flags & VEC_ANY_SVE)
2023 return false;
2024 if (known_le (GET_MODE_SIZE (mode), 8))
2025 return true;
2026 if (known_le (GET_MODE_SIZE (mode), 16))
2027 return (regno & 1) == 0;
2028 }
2029 else if (FP_REGNUM_P (regno))
2030 {
2031 if (vec_flags & VEC_STRUCT)
2032 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2033 else
2034 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2035 }
2036
2037 return false;
2038 }
2039
2040 /* Return true if TYPE is a type that should be passed or returned in
2041 SVE registers, assuming enough registers are available. When returning
2042 true, set *NUM_ZR and *NUM_PR to the number of required Z and P registers
2043 respectively. */
2044
2045 static bool
2046 aarch64_sve_argument_p (const_tree type, unsigned int *num_zr,
2047 unsigned int *num_pr)
2048 {
2049 if (aarch64_sve::svbool_type_p (type))
2050 {
2051 *num_pr = 1;
2052 *num_zr = 0;
2053 return true;
2054 }
2055
2056 if (unsigned int nvectors = aarch64_sve::nvectors_if_data_type (type))
2057 {
2058 *num_pr = 0;
2059 *num_zr = nvectors;
2060 return true;
2061 }
2062
2063 return false;
2064 }
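
/* For instance, an svbool_t argument should give *NUM_PR == 1 and
*NUM_ZR == 0, an svint32_t argument should give *NUM_ZR == 1, and a
tuple type such as svfloat64x4_t should give *NUM_ZR == 4 (assuming
the usual ACLE mapping of tuple types to multiple Z registers). */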
2065
2066 /* Return true if a function with type FNTYPE returns its value in
2067 SVE vector or predicate registers. */
2068
2069 static bool
2070 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2071 {
2072 unsigned int num_zr, num_pr;
2073 tree return_type = TREE_TYPE (fntype);
2074 return (return_type != error_mark_node
2075 && aarch64_sve_argument_p (return_type, &num_zr, &num_pr));
2076 }
2077
2078 /* Return true if a function with type FNTYPE takes arguments in
2079 SVE vector or predicate registers. */
2080
2081 static bool
2082 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2083 {
2084 CUMULATIVE_ARGS args_so_far_v;
2085 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2086 NULL_TREE, 0, true);
2087 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2088
2089 for (tree chain = TYPE_ARG_TYPES (fntype);
2090 chain && chain != void_list_node;
2091 chain = TREE_CHAIN (chain))
2092 {
2093 tree arg_type = TREE_VALUE (chain);
2094 if (arg_type == error_mark_node)
2095 return false;
2096
2097 function_arg_info arg (arg_type, /*named=*/true);
2098 apply_pass_by_reference_rules (&args_so_far_v, arg);
2099 unsigned int num_zr, num_pr;
2100 if (aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
2101 return true;
2102
2103 targetm.calls.function_arg_advance (args_so_far, arg);
2104 }
2105 return false;
2106 }
2107
2108 /* Implement TARGET_FNTYPE_ABI. */
2109
2110 static const predefined_function_abi &
2111 aarch64_fntype_abi (const_tree fntype)
2112 {
2113 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2114 return aarch64_simd_abi ();
2115
2116 if (aarch64_returns_value_in_sve_regs_p (fntype)
2117 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2118 return aarch64_sve_abi ();
2119
2120 return default_function_abi;
2121 }
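
/* A sketch of how this is expected to play out, using ACLE types purely
for illustration:

void f (float32x4_t) __attribute__ ((aarch64_vector_pcs));
-> aarch64_simd_abi ()
svint32_t g (svint32_t, svbool_t); -> aarch64_sve_abi ()
int h (int); -> default_function_abi */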
2122
2123 /* Return true if we should emit CFI for register REGNO. */
2124
2125 static bool
2126 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2127 {
2128 return (GP_REGNUM_P (regno)
2129 || !default_function_abi.clobbers_full_reg_p (regno));
2130 }
2131
2132 /* Return the mode we should use to save and restore register REGNO. */
2133
2134 static machine_mode
2135 aarch64_reg_save_mode (unsigned int regno)
2136 {
2137 if (GP_REGNUM_P (regno))
2138 return DImode;
2139
2140 if (FP_REGNUM_P (regno))
2141 switch (crtl->abi->id ())
2142 {
2143 case ARM_PCS_AAPCS64:
2144 /* Only the low 64 bits are saved by the base PCS. */
2145 return DFmode;
2146
2147 case ARM_PCS_SIMD:
2148 /* The vector PCS saves the low 128 bits (which is the full
2149 register on non-SVE targets). */
2150 return TFmode;
2151
2152 case ARM_PCS_SVE:
2153 /* Use vectors of DImode for registers that need frame
2154 information, so that the first 64 bits of the save slot
2155 are always the equivalent of what storing D<n> would give. */
2156 if (aarch64_emit_cfi_for_reg_p (regno))
2157 return VNx2DImode;
2158
2159 /* Use vectors of bytes otherwise, so that the layout is
2160 endian-agnostic, and so that we can use LDR and STR for
2161 big-endian targets. */
2162 return VNx16QImode;
2163
2164 case ARM_PCS_TLSDESC:
2165 case ARM_PCS_UNKNOWN:
2166 break;
2167 }
2168
2169 if (PR_REGNUM_P (regno))
2170 /* Save the full predicate register. */
2171 return VNx16BImode;
2172
2173 gcc_unreachable ();
2174 }
2175
2176 /* Implement TARGET_INSN_CALLEE_ABI. */
2177
2178 const predefined_function_abi &
2179 aarch64_insn_callee_abi (const rtx_insn *insn)
2180 {
2181 rtx pat = PATTERN (insn);
2182 gcc_assert (GET_CODE (pat) == PARALLEL);
2183 rtx unspec = XVECEXP (pat, 0, 1);
2184 gcc_assert (GET_CODE (unspec) == UNSPEC
2185 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2186 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2187 }
2188
2189 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2190 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2191 clobbers the top 64 bits when restoring the bottom 64 bits. */
2192
2193 static bool
2194 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2195 unsigned int regno,
2196 machine_mode mode)
2197 {
2198 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2199 {
2200 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2201 unsigned int nregs = hard_regno_nregs (regno, mode);
2202 if (nregs > 1)
2203 per_register_size = exact_div (per_register_size, nregs);
2204 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2205 return maybe_gt (per_register_size, 16);
2206 return maybe_gt (per_register_size, 8);
2207 }
2208 return false;
2209 }
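
/* For example, a TFmode value (16 bytes) held in V8 is part-clobbered
across a call under the base ARM_PCS_AAPCS64 ABI, because only the low
8 bytes of the register are preserved, but not under ARM_PCS_SIMD,
which preserves the low 16 bytes. SVE modes, whose per-register size
may exceed 16 bytes, are treated as part-clobbered under both of those
ABIs and are only fully preserved under ARM_PCS_SVE. */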
2210
2211 /* Implement REGMODE_NATURAL_SIZE. */
2212 poly_uint64
2213 aarch64_regmode_natural_size (machine_mode mode)
2214 {
2215 /* The natural size for SVE data modes is one SVE data vector,
2216 and similarly for predicates. We can't independently modify
2217 anything smaller than that. */
2218 /* ??? For now, only do this for variable-width SVE registers.
2219 Doing it for constant-sized registers breaks lower-subreg.c. */
2220 /* ??? And once that's fixed, we should probably have similar
2221 code for Advanced SIMD. */
2222 if (!aarch64_sve_vg.is_constant ())
2223 {
2224 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2225 if (vec_flags & VEC_SVE_PRED)
2226 return BYTES_PER_SVE_PRED;
2227 if (vec_flags & VEC_SVE_DATA)
2228 return BYTES_PER_SVE_VECTOR;
2229 }
2230 return UNITS_PER_WORD;
2231 }
2232
2233 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2234 machine_mode
2235 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2236 machine_mode mode)
2237 {
2238 /* The predicate mode determines which bits are significant and
2239 which are "don't care". Decreasing the number of lanes would
2240 lose data while increasing the number of lanes would make bits
2241 unnecessarily significant. */
2242 if (PR_REGNUM_P (regno))
2243 return mode;
2244 if (known_ge (GET_MODE_SIZE (mode), 4))
2245 return mode;
2246 else
2247 return SImode;
2248 }
2249
2250 /* Return true if I's bits are consecutive ones from the MSB. */
2251 bool
2252 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2253 {
2254 return exact_log2 (-i) != HOST_WIDE_INT_M1;
2255 }
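
/* Worked examples: for I == 0xfffffffffffff000, -I == 0x1000 and
exact_log2 returns 12, so the function returns true (the top 52 bits
are consecutive ones). For I == 0xffffffff00000001, -I ==
0x00000000ffffffff, which is not a power of two, so exact_log2 returns
-1 and the function returns false. */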
2256
2257 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2258 that strcpy from constants will be faster. */
2259
2260 static HOST_WIDE_INT
2261 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2262 {
2263 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2264 return MAX (align, BITS_PER_WORD);
2265 return align;
2266 }
2267
2268 /* Return true if calls to DECL should be treated as
2269 long-calls (i.e. called via a register). */
2270 static bool
2271 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2272 {
2273 return false;
2274 }
2275
2276 /* Return true if calls to symbol-ref SYM should be treated as
2277 long-calls (i.e. called via a register). */
2278 bool
2279 aarch64_is_long_call_p (rtx sym)
2280 {
2281 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2282 }
2283
2284 /* Return true if calls to symbol-ref SYM should not go through
2285 plt stubs. */
2286
2287 bool
2288 aarch64_is_noplt_call_p (rtx sym)
2289 {
2290 const_tree decl = SYMBOL_REF_DECL (sym);
2291
2292 if (flag_pic
2293 && decl
2294 && (!flag_plt
2295 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2296 && !targetm.binds_local_p (decl))
2297 return true;
2298
2299 return false;
2300 }
2301
2302 /* Return true if the offsets to a zero/sign-extract operation
2303 represent an expression that matches an extend operation. The
2304 operands represent the parameters from
2305
2306 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2307 bool
2308 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2309 rtx extract_imm)
2310 {
2311 HOST_WIDE_INT mult_val, extract_val;
2312
2313 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2314 return false;
2315
2316 mult_val = INTVAL (mult_imm);
2317 extract_val = INTVAL (extract_imm);
2318
2319 if (extract_val > 8
2320 && extract_val < GET_MODE_BITSIZE (mode)
2321 && exact_log2 (extract_val & ~7) > 0
2322 && (extract_val & 7) <= 4
2323 && mult_val == (1 << (extract_val & 7)))
2324 return true;
2325
2326 return false;
2327 }
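
/* As a rough worked example: with MODE == DImode, MULT_IMM == 4 and
EXTRACT_IMM == 34 we have 34 & ~7 == 32 and 34 & 7 == 2, and
4 == 1 << 2, so the function returns true. That combination appears to
describe a 32-bit value scaled by 4, i.e. the kind of extend-and-shift
operand used by extended-register address and add forms. */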
2328
2329 /* Emit an insn that's a simple single-set. Both the operands must be
2330 known to be valid. */
2331 inline static rtx_insn *
2332 emit_set_insn (rtx x, rtx y)
2333 {
2334 return emit_insn (gen_rtx_SET (x, y));
2335 }
2336
2337 /* X and Y are two things to compare using CODE. Emit the compare insn and
2338 return the rtx for register 0 in the proper mode. */
2339 rtx
2340 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2341 {
2342 machine_mode cmp_mode = GET_MODE (x);
2343 machine_mode cc_mode;
2344 rtx cc_reg;
2345
2346 if (cmp_mode == TImode)
2347 {
2348 gcc_assert (code == NE);
2349
2350 cc_mode = CCmode;
2351 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2352
2353 rtx x_lo = operand_subword (x, 0, 0, TImode);
2354 rtx y_lo = operand_subword (y, 0, 0, TImode);
2355 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2356
2357 rtx x_hi = operand_subword (x, 1, 0, TImode);
2358 rtx y_hi = operand_subword (y, 1, 0, TImode);
2359 emit_insn (gen_ccmpdi (cc_reg, cc_reg, x_hi, y_hi,
2360 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2361 GEN_INT (AARCH64_EQ)));
2362 }
2363 else
2364 {
2365 cc_mode = SELECT_CC_MODE (code, x, y);
2366 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2367 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2368 }
2369 return cc_reg;
2370 }
2371
2372 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2373
2374 static rtx
2375 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2376 machine_mode y_mode)
2377 {
2378 if (y_mode == E_QImode || y_mode == E_HImode)
2379 {
2380 if (CONST_INT_P (y))
2381 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2382 else
2383 {
2384 rtx t, cc_reg;
2385 machine_mode cc_mode;
2386
2387 t = gen_rtx_ZERO_EXTEND (SImode, y);
2388 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2389 cc_mode = CC_SWPmode;
2390 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2391 emit_set_insn (cc_reg, t);
2392 return cc_reg;
2393 }
2394 }
2395
2396 if (!aarch64_plus_operand (y, y_mode))
2397 y = force_reg (y_mode, y);
2398
2399 return aarch64_gen_compare_reg (code, x, y);
2400 }
2401
2402 /* Build the SYMBOL_REF for __tls_get_addr. */
2403
2404 static GTY(()) rtx tls_get_addr_libfunc;
2405
2406 rtx
2407 aarch64_tls_get_addr (void)
2408 {
2409 if (!tls_get_addr_libfunc)
2410 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2411 return tls_get_addr_libfunc;
2412 }
2413
2414 /* Return the TLS model to use for ADDR. */
2415
2416 static enum tls_model
2417 tls_symbolic_operand_type (rtx addr)
2418 {
2419 enum tls_model tls_kind = TLS_MODEL_NONE;
2420 if (GET_CODE (addr) == CONST)
2421 {
2422 poly_int64 addend;
2423 rtx sym = strip_offset (addr, &addend);
2424 if (GET_CODE (sym) == SYMBOL_REF)
2425 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2426 }
2427 else if (GET_CODE (addr) == SYMBOL_REF)
2428 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2429
2430 return tls_kind;
2431 }
2432
2433 /* We allow LO_SUMs in addresses among our legitimate addresses,
2434 so that combine can take care of combining addresses where
2435 necessary, but for generation purposes we generate the
2436 address as:
2437 RTL Absolute
2438 tmp = hi (symbol_ref); adrp x1, foo
2439 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2440 nop
2441
2442 PIC TLS
2443 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2444 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2445 bl __tls_get_addr
2446 nop
2447
2448 Load TLS symbol, depending on TLS mechanism and TLS access model.
2449
2450 Global Dynamic - Traditional TLS:
2451 adrp tmp, :tlsgd:imm
2452 add dest, tmp, #:tlsgd_lo12:imm
2453 bl __tls_get_addr
2454
2455 Global Dynamic - TLS Descriptors:
2456 adrp dest, :tlsdesc:imm
2457 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2458 add dest, dest, #:tlsdesc_lo12:imm
2459 blr tmp
2460 mrs tp, tpidr_el0
2461 add dest, dest, tp
2462
2463 Initial Exec:
2464 mrs tp, tpidr_el0
2465 adrp tmp, :gottprel:imm
2466 ldr dest, [tmp, #:gottprel_lo12:imm]
2467 add dest, dest, tp
2468
2469 Local Exec:
2470 mrs tp, tpidr_el0
2471 add t0, tp, #:tprel_hi12:imm, lsl #12
2472 add t0, t0, #:tprel_lo12_nc:imm
2473 */
2474
2475 static void
2476 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2477 enum aarch64_symbol_type type)
2478 {
2479 switch (type)
2480 {
2481 case SYMBOL_SMALL_ABSOLUTE:
2482 {
2483 /* In ILP32, the mode of dest can be either SImode or DImode. */
2484 rtx tmp_reg = dest;
2485 machine_mode mode = GET_MODE (dest);
2486
2487 gcc_assert (mode == Pmode || mode == ptr_mode);
2488
2489 if (can_create_pseudo_p ())
2490 tmp_reg = gen_reg_rtx (mode);
2491
2492 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2493 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2494 return;
2495 }
2496
2497 case SYMBOL_TINY_ABSOLUTE:
2498 emit_insn (gen_rtx_SET (dest, imm));
2499 return;
2500
2501 case SYMBOL_SMALL_GOT_28K:
2502 {
2503 machine_mode mode = GET_MODE (dest);
2504 rtx gp_rtx = pic_offset_table_rtx;
2505 rtx insn;
2506 rtx mem;
2507
2508 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2509 here before RTL expansion. Tree IVOPTS generates RTL patterns to
2510 estimate rtx costs, in which case pic_offset_table_rtx is not
2511 initialized. In that case there is no need to generate the first
2512 adrp instruction, since the final cost of a global variable access
2513 is one instruction. */
2514 if (gp_rtx != NULL)
2515 {
2516 /* With -fpic and -mcmodel=small the GOT can be up to 32K in size
2517 (but since we use the page base as the GOT base, the first page
2518 may be wasted; in the worst case only 28K is left for the GOT).
2519
2520 The instruction sequence generated for accessing a global
2521 variable is:
2522
2523 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2524
2525 Only one instruction is needed, but pic_offset_table_rtx must be
2526 initialized properly. We generate an initialization insn for
2527 every global access and rely on CSE to remove the redundant ones.
2528
2529 The final instruction sequence for multiple global variable
2530 accesses will look like:
2531
2532 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2533
2534 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2535 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2536 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2537 ... */
2538
2539 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2540 crtl->uses_pic_offset_table = 1;
2541 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2542
2543 if (mode != GET_MODE (gp_rtx))
2544 gp_rtx = gen_lowpart (mode, gp_rtx);
2545
2546 }
2547
2548 if (mode == ptr_mode)
2549 {
2550 if (mode == DImode)
2551 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2552 else
2553 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2554
2555 mem = XVECEXP (SET_SRC (insn), 0, 0);
2556 }
2557 else
2558 {
2559 gcc_assert (mode == Pmode);
2560
2561 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2562 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2563 }
2564
2565 /* The operand is expected to be a MEM. Whenever the related insn
2566 pattern changes, the code above that computes MEM should be
2567 updated. */
2568 gcc_assert (GET_CODE (mem) == MEM);
2569 MEM_READONLY_P (mem) = 1;
2570 MEM_NOTRAP_P (mem) = 1;
2571 emit_insn (insn);
2572 return;
2573 }
2574
2575 case SYMBOL_SMALL_GOT_4G:
2576 {
2577 /* In ILP32, the mode of dest can be either SImode or DImode,
2578 while the GOT entry is always of SImode size. The mode of
2579 dest depends on how dest is used: if dest is assigned to a
2580 pointer (e.g. stored in memory), it has SImode; it may have
2581 DImode if dest is dereferenced to access memory.
2582 This is why we have to handle three different ldr_got_small
2583 patterns here (two patterns for ILP32). */
2584
2585 rtx insn;
2586 rtx mem;
2587 rtx tmp_reg = dest;
2588 machine_mode mode = GET_MODE (dest);
2589
2590 if (can_create_pseudo_p ())
2591 tmp_reg = gen_reg_rtx (mode);
2592
2593 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2594 if (mode == ptr_mode)
2595 {
2596 if (mode == DImode)
2597 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2598 else
2599 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2600
2601 mem = XVECEXP (SET_SRC (insn), 0, 0);
2602 }
2603 else
2604 {
2605 gcc_assert (mode == Pmode);
2606
2607 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2608 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2609 }
2610
2611 gcc_assert (GET_CODE (mem) == MEM);
2612 MEM_READONLY_P (mem) = 1;
2613 MEM_NOTRAP_P (mem) = 1;
2614 emit_insn (insn);
2615 return;
2616 }
2617
2618 case SYMBOL_SMALL_TLSGD:
2619 {
2620 rtx_insn *insns;
2621 machine_mode mode = GET_MODE (dest);
2622 rtx result = gen_rtx_REG (mode, R0_REGNUM);
2623
2624 start_sequence ();
2625 if (TARGET_ILP32)
2626 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2627 else
2628 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2629 insns = get_insns ();
2630 end_sequence ();
2631
2632 RTL_CONST_CALL_P (insns) = 1;
2633 emit_libcall_block (insns, dest, result, imm);
2634 return;
2635 }
2636
2637 case SYMBOL_SMALL_TLSDESC:
2638 {
2639 machine_mode mode = GET_MODE (dest);
2640 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2641 rtx tp;
2642
2643 gcc_assert (mode == Pmode || mode == ptr_mode);
2644
2645 /* In ILP32, the GOT entry is always of SImode size. Unlike
2646 small GOT, the dest is fixed at reg 0. */
2647 if (TARGET_ILP32)
2648 emit_insn (gen_tlsdesc_small_si (imm));
2649 else
2650 emit_insn (gen_tlsdesc_small_di (imm));
2651 tp = aarch64_load_tp (NULL);
2652
2653 if (mode != Pmode)
2654 tp = gen_lowpart (mode, tp);
2655
2656 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2657 if (REG_P (dest))
2658 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2659 return;
2660 }
2661
2662 case SYMBOL_SMALL_TLSIE:
2663 {
2664 /* In ILP32, the mode of dest can be either SImode or DImode,
2665 while the GOT entry is always of SImode size. The mode of
2666 dest depends on how dest is used: if dest is assigned to a
2667 pointer (e.g. stored in memory), it has SImode; it may have
2668 DImode if dest is dereferenced to access memory.
2669 This is why we have to handle three different tlsie_small
2670 patterns here (two patterns for ILP32). */
2671 machine_mode mode = GET_MODE (dest);
2672 rtx tmp_reg = gen_reg_rtx (mode);
2673 rtx tp = aarch64_load_tp (NULL);
2674
2675 if (mode == ptr_mode)
2676 {
2677 if (mode == DImode)
2678 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2679 else
2680 {
2681 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2682 tp = gen_lowpart (mode, tp);
2683 }
2684 }
2685 else
2686 {
2687 gcc_assert (mode == Pmode);
2688 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2689 }
2690
2691 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2692 if (REG_P (dest))
2693 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2694 return;
2695 }
2696
2697 case SYMBOL_TLSLE12:
2698 case SYMBOL_TLSLE24:
2699 case SYMBOL_TLSLE32:
2700 case SYMBOL_TLSLE48:
2701 {
2702 machine_mode mode = GET_MODE (dest);
2703 rtx tp = aarch64_load_tp (NULL);
2704
2705 if (mode != Pmode)
2706 tp = gen_lowpart (mode, tp);
2707
2708 switch (type)
2709 {
2710 case SYMBOL_TLSLE12:
2711 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2712 (dest, tp, imm));
2713 break;
2714 case SYMBOL_TLSLE24:
2715 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2716 (dest, tp, imm));
2717 break;
2718 case SYMBOL_TLSLE32:
2719 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2720 (dest, imm));
2721 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2722 (dest, dest, tp));
2723 break;
2724 case SYMBOL_TLSLE48:
2725 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2726 (dest, imm));
2727 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2728 (dest, dest, tp));
2729 break;
2730 default:
2731 gcc_unreachable ();
2732 }
2733
2734 if (REG_P (dest))
2735 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2736 return;
2737 }
2738
2739 case SYMBOL_TINY_GOT:
2740 emit_insn (gen_ldr_got_tiny (dest, imm));
2741 return;
2742
2743 case SYMBOL_TINY_TLSIE:
2744 {
2745 machine_mode mode = GET_MODE (dest);
2746 rtx tp = aarch64_load_tp (NULL);
2747
2748 if (mode == ptr_mode)
2749 {
2750 if (mode == DImode)
2751 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2752 else
2753 {
2754 tp = gen_lowpart (mode, tp);
2755 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2756 }
2757 }
2758 else
2759 {
2760 gcc_assert (mode == Pmode);
2761 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2762 }
2763
2764 if (REG_P (dest))
2765 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2766 return;
2767 }
2768
2769 default:
2770 gcc_unreachable ();
2771 }
2772 }
2773
2774 /* Emit a move from SRC to DEST. Assume that the move expanders can
2775 handle all moves if !can_create_pseudo_p (). The distinction is
2776 important because, unlike emit_move_insn, the move expanders know
2777 how to force Pmode objects into the constant pool even when the
2778 constant pool address is not itself legitimate. */
2779 static rtx
2780 aarch64_emit_move (rtx dest, rtx src)
2781 {
2782 return (can_create_pseudo_p ()
2783 ? emit_move_insn (dest, src)
2784 : emit_move_insn_1 (dest, src));
2785 }
2786
2787 /* Apply UNOPTAB to OP and store the result in DEST. */
2788
2789 static void
2790 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2791 {
2792 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2793 if (dest != tmp)
2794 emit_move_insn (dest, tmp);
2795 }
2796
2797 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2798
2799 static void
2800 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2801 {
2802 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2803 OPTAB_DIRECT);
2804 if (dest != tmp)
2805 emit_move_insn (dest, tmp);
2806 }
2807
2808 /* Split a 128-bit move operation into two 64-bit move operations,
2809 taking care to handle partial overlap of register to register
2810 copies. Special cases are needed when moving between GP regs and
2811 FP regs. SRC can be a register, constant or memory; DST a register
2812 or memory. If either operand is memory it must not have any side
2813 effects. */
2814 void
2815 aarch64_split_128bit_move (rtx dst, rtx src)
2816 {
2817 rtx dst_lo, dst_hi;
2818 rtx src_lo, src_hi;
2819
2820 machine_mode mode = GET_MODE (dst);
2821
2822 gcc_assert (mode == TImode || mode == TFmode);
2823 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2824 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2825
2826 if (REG_P (dst) && REG_P (src))
2827 {
2828 int src_regno = REGNO (src);
2829 int dst_regno = REGNO (dst);
2830
2831 /* Handle FP <-> GP regs. */
2832 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2833 {
2834 src_lo = gen_lowpart (word_mode, src);
2835 src_hi = gen_highpart (word_mode, src);
2836
2837 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2838 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2839 return;
2840 }
2841 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2842 {
2843 dst_lo = gen_lowpart (word_mode, dst);
2844 dst_hi = gen_highpart (word_mode, dst);
2845
2846 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2847 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2848 return;
2849 }
2850 }
2851
2852 dst_lo = gen_lowpart (word_mode, dst);
2853 dst_hi = gen_highpart (word_mode, dst);
2854 src_lo = gen_lowpart (word_mode, src);
2855 src_hi = gen_highpart_mode (word_mode, mode, src);
2856
2857 /* At most one pairing may overlap. */
2858 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2859 {
2860 aarch64_emit_move (dst_hi, src_hi);
2861 aarch64_emit_move (dst_lo, src_lo);
2862 }
2863 else
2864 {
2865 aarch64_emit_move (dst_lo, src_lo);
2866 aarch64_emit_move (dst_hi, src_hi);
2867 }
2868 }
2869
2870 bool
2871 aarch64_split_128bit_move_p (rtx dst, rtx src)
2872 {
2873 return (! REG_P (src)
2874 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2875 }
2876
2877 /* Split a complex SIMD combine. */
2878
2879 void
2880 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2881 {
2882 machine_mode src_mode = GET_MODE (src1);
2883 machine_mode dst_mode = GET_MODE (dst);
2884
2885 gcc_assert (VECTOR_MODE_P (dst_mode));
2886 gcc_assert (register_operand (dst, dst_mode)
2887 && register_operand (src1, src_mode)
2888 && register_operand (src2, src_mode));
2889
2890 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2891 return;
2892 }
2893
2894 /* Split a complex SIMD move. */
2895
2896 void
2897 aarch64_split_simd_move (rtx dst, rtx src)
2898 {
2899 machine_mode src_mode = GET_MODE (src);
2900 machine_mode dst_mode = GET_MODE (dst);
2901
2902 gcc_assert (VECTOR_MODE_P (dst_mode));
2903
2904 if (REG_P (dst) && REG_P (src))
2905 {
2906 gcc_assert (VECTOR_MODE_P (src_mode));
2907 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2908 }
2909 }
2910
2911 bool
2912 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2913 machine_mode ymode, rtx y)
2914 {
2915 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2916 gcc_assert (r != NULL);
2917 return rtx_equal_p (x, r);
2918 }
2919
2920 /* Return TARGET if it is nonnull and a register of mode MODE.
2921 Otherwise, return a fresh register of mode MODE if we can,
2922 or TARGET reinterpreted as MODE if we can't. */
2923
2924 static rtx
2925 aarch64_target_reg (rtx target, machine_mode mode)
2926 {
2927 if (target && REG_P (target) && GET_MODE (target) == mode)
2928 return target;
2929 if (!can_create_pseudo_p ())
2930 {
2931 gcc_assert (target);
2932 return gen_lowpart (mode, target);
2933 }
2934 return gen_reg_rtx (mode);
2935 }
2936
2937 /* Return a register that contains the constant in BUILDER, given that
2938 the constant is a legitimate move operand. Use TARGET as the register
2939 if it is nonnull and convenient. */
2940
2941 static rtx
2942 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
2943 {
2944 rtx src = builder.build ();
2945 target = aarch64_target_reg (target, GET_MODE (src));
2946 emit_insn (gen_rtx_SET (target, src));
2947 return target;
2948 }
2949
2950 static rtx
2951 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2952 {
2953 if (can_create_pseudo_p ())
2954 return force_reg (mode, value);
2955 else
2956 {
2957 gcc_assert (x);
2958 aarch64_emit_move (x, value);
2959 return x;
2960 }
2961 }
2962
2963 /* Return true if predicate value X is a constant in which every element
2964 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
2965 value, i.e. as a predicate in which all bits are significant. */
2966
2967 static bool
2968 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
2969 {
2970 if (GET_CODE (x) != CONST_VECTOR)
2971 return false;
2972
2973 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
2974 GET_MODE_NUNITS (GET_MODE (x)));
2975 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
2976 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
2977 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
2978
2979 unsigned int nelts = const_vector_encoded_nelts (x);
2980 for (unsigned int i = 0; i < nelts; ++i)
2981 {
2982 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
2983 if (!CONST_INT_P (elt))
2984 return false;
2985
2986 builder.quick_push (elt);
2987 for (unsigned int j = 1; j < factor; ++j)
2988 builder.quick_push (const0_rtx);
2989 }
2990 builder.finalize ();
2991 return true;
2992 }
2993
2994 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
2995 widest predicate element size it can have (that is, the largest size
2996 for which each element would still be 0 or 1). */
2997
2998 unsigned int
2999 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3000 {
3001 /* Start with the most optimistic assumption: that we only need
3002 one bit per pattern. This is what we will use if only the first
3003 bit in each pattern is ever set. */
3004 unsigned int mask = GET_MODE_SIZE (DImode);
3005 mask |= builder.npatterns ();
3006
3007 /* Look for set bits. */
3008 unsigned int nelts = builder.encoded_nelts ();
3009 for (unsigned int i = 1; i < nelts; ++i)
3010 if (INTVAL (builder.elt (i)) != 0)
3011 {
3012 if (i & 1)
3013 return 1;
3014 mask |= i;
3015 }
3016 return mask & -mask;
3017 }
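
/* Worked example: MASK starts as 8 (the widest possible element size
in bytes). If BUILDER has 4 patterns and the only set bits appear at
indices that are multiples of 4, then MASK becomes 8 | 4 | (multiples
of 4) and MASK & -MASK is 4, so the predicate can be treated as having
4-byte (.s) elements. A set bit at any odd index forces the answer
down to 1 immediately. */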
3018
3019 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3020 return that predicate mode, otherwise return opt_machine_mode (). */
3021
3022 opt_machine_mode
3023 aarch64_ptrue_all_mode (rtx x)
3024 {
3025 gcc_assert (GET_MODE (x) == VNx16BImode);
3026 if (GET_CODE (x) != CONST_VECTOR
3027 || !CONST_VECTOR_DUPLICATE_P (x)
3028 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3029 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3030 return opt_machine_mode ();
3031
3032 unsigned int nelts = const_vector_encoded_nelts (x);
3033 for (unsigned int i = 1; i < nelts; ++i)
3034 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3035 return opt_machine_mode ();
3036
3037 return aarch64_sve_pred_mode (nelts);
3038 }
3039
3040 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3041 that the constant would have with predicate element size ELT_SIZE
3042 (ignoring the upper bits in each element) and return:
3043
3044 * -1 if all bits are set
3045 * N if the predicate has N leading set bits followed by all clear bits
3046 * 0 if the predicate does not have any of these forms. */
3047
3048 int
3049 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3050 unsigned int elt_size)
3051 {
3052 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3053 followed by set bits. */
3054 if (builder.nelts_per_pattern () == 3)
3055 return 0;
3056
3057 /* Skip over leading set bits. */
3058 unsigned int nelts = builder.encoded_nelts ();
3059 unsigned int i = 0;
3060 for (; i < nelts; i += elt_size)
3061 if (INTVAL (builder.elt (i)) == 0)
3062 break;
3063 unsigned int vl = i / elt_size;
3064
3065 /* Check for the all-true case. */
3066 if (i == nelts)
3067 return -1;
3068
3069 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3070 repeating pattern of set bits followed by clear bits. */
3071 if (builder.nelts_per_pattern () != 2)
3072 return 0;
3073
3074 /* We have a "foreground" value and a duplicated "background" value.
3075 If the background might repeat and the last set bit belongs to it,
3076 we might have set bits followed by clear bits followed by set bits. */
3077 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3078 return 0;
3079
3080 /* Make sure that the rest are all clear. */
3081 for (; i < nelts; i += elt_size)
3082 if (INTVAL (builder.elt (i)) != 0)
3083 return 0;
3084
3085 return vl;
3086 }
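
/* For example, with ELT_SIZE == 1, a constant whose first 6 bits are
set and whose remaining bits are clear gives 6, an all-ones constant
gives -1, and a constant in which a set bit reappears after the first
clear bit gives 0. With ELT_SIZE == 2, only every other bit is
inspected, so the result counts 16-bit predicate elements instead. */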
3087
3088 /* See if there is an svpattern that encodes an SVE predicate of mode
3089 PRED_MODE in which the first VL bits are set and the rest are clear.
3090 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3091 A VL of -1 indicates an all-true vector. */
3092
3093 aarch64_svpattern
3094 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3095 {
3096 if (vl < 0)
3097 return AARCH64_SV_ALL;
3098
3099 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3100 return AARCH64_NUM_SVPATTERNS;
3101
3102 if (vl >= 1 && vl <= 8)
3103 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3104
3105 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3106 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3107
3108 int max_vl;
3109 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3110 {
3111 if (vl == (max_vl / 3) * 3)
3112 return AARCH64_SV_MUL3;
3113 /* These would only trigger for non-power-of-2 lengths. */
3114 if (vl == (max_vl & -4))
3115 return AARCH64_SV_MUL4;
3116 if (vl == (1 << floor_log2 (max_vl)))
3117 return AARCH64_SV_POW2;
3118 if (vl == max_vl)
3119 return AARCH64_SV_ALL;
3120 }
3121 return AARCH64_NUM_SVPATTERNS;
3122 }
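
/* Some sample mappings: VL == -1 gives AARCH64_SV_ALL, VL == 3 gives
AARCH64_SV_VL3 and VL == 64 gives AARCH64_SV_VL64. A length such as
VL == 11, which matches no pattern (unless one of the constant-length
checks above happens to apply), gives AARCH64_NUM_SVPATTERNS. */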
3123
3124 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3125 bits has the lowest bit set and the upper bits clear. This is the
3126 VNx16BImode equivalent of a PTRUE for controlling elements of
3127 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3128 all bits are significant, even the upper zeros. */
3129
3130 rtx
3131 aarch64_ptrue_all (unsigned int elt_size)
3132 {
3133 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3134 builder.quick_push (const1_rtx);
3135 for (unsigned int i = 1; i < elt_size; ++i)
3136 builder.quick_push (const0_rtx);
3137 return builder.build ();
3138 }
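
/* For instance, aarch64_ptrue_all (4) should build the repeating
VNx16BImode pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }, which is the
value that a PTRUE with .s (4-byte) elements leaves in a predicate
register, with the upper bits of each element explicitly zero. */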
3139
3140 /* Return an all-true predicate register of mode MODE. */
3141
3142 rtx
3143 aarch64_ptrue_reg (machine_mode mode)
3144 {
3145 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3146 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3147 return gen_lowpart (mode, reg);
3148 }
3149
3150 /* Return an all-false predicate register of mode MODE. */
3151
3152 rtx
3153 aarch64_pfalse_reg (machine_mode mode)
3154 {
3155 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3156 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3157 return gen_lowpart (mode, reg);
3158 }
3159
3160 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3161 true, or alternatively if we know that the operation predicated by
3162 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
3163 aarch64_sve_gp_strictness operand that describes the operation
3164 predicated by PRED1[0]. */
3165
3166 bool
3167 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3168 {
3169 machine_mode mode = GET_MODE (pred2);
3170 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3171 && mode == GET_MODE (pred1[0])
3172 && aarch64_sve_gp_strictness (pred1[1], SImode));
3173 return (pred1[0] == CONSTM1_RTX (mode)
3174 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3175 || rtx_equal_p (pred1[0], pred2));
3176 }
3177
3178 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3179 for it. PRED2[0] is the predicate for the instruction whose result
3180 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3181 for it. Return true if we can prove that the two predicates are
3182 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3183 with PRED1[0] without changing behavior. */
3184
3185 bool
3186 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3187 {
3188 machine_mode mode = GET_MODE (pred1[0]);
3189 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3190 && mode == GET_MODE (pred2[0])
3191 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3192 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3193
3194 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3195 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3196 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3197 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3198 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3199 }
3200
3201 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
3202 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3203 Use TARGET as the target register if nonnull and convenient. */
3204
3205 static rtx
3206 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3207 machine_mode data_mode, rtx op1, rtx op2)
3208 {
3209 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3210 expand_operand ops[5];
3211 create_output_operand (&ops[0], target, pred_mode);
3212 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3213 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3214 create_input_operand (&ops[3], op1, data_mode);
3215 create_input_operand (&ops[4], op2, data_mode);
3216 expand_insn (icode, 5, ops);
3217 return ops[0].value;
3218 }
3219
3220 /* Use a comparison to convert integer vector SRC into MODE, which is
3221 the corresponding SVE predicate mode. Use TARGET for the result
3222 if it's nonnull and convenient. */
3223
3224 rtx
3225 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3226 {
3227 machine_mode src_mode = GET_MODE (src);
3228 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3229 src, CONST0_RTX (src_mode));
3230 }
3231
3232 /* Return the assembly token for svprfop value PRFOP. */
3233
3234 static const char *
3235 svprfop_token (enum aarch64_svprfop prfop)
3236 {
3237 switch (prfop)
3238 {
3239 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3240 AARCH64_FOR_SVPRFOP (CASE)
3241 #undef CASE
3242 case AARCH64_NUM_SVPRFOPS:
3243 break;
3244 }
3245 gcc_unreachable ();
3246 }
3247
3248 /* Return the assembly string for an SVE prefetch operation with
3249 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3250 and that SUFFIX is the format for the remaining operands. */
3251
3252 char *
3253 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3254 const char *suffix)
3255 {
3256 static char buffer[128];
3257 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3258 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3259 mnemonic, svprfop_token (prfop), suffix);
3260 gcc_assert (written < sizeof (buffer));
3261 return buffer;
3262 }
3263
3264 /* Check whether we can calculate the number of elements in PATTERN
3265 at compile time, given that there are NELTS_PER_VQ elements per
3266 128-bit block. Return the value if so, otherwise return -1. */
3267
3268 HOST_WIDE_INT
3269 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3270 {
3271 unsigned int vl, const_vg;
3272 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3273 vl = 1 + (pattern - AARCH64_SV_VL1);
3274 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3275 vl = 16 << (pattern - AARCH64_SV_VL16);
3276 else if (aarch64_sve_vg.is_constant (&const_vg))
3277 {
3278 /* There are two vector granules per quadword. */
3279 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3280 switch (pattern)
3281 {
3282 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3283 case AARCH64_SV_MUL4: return nelts & -4;
3284 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3285 case AARCH64_SV_ALL: return nelts;
3286 default: gcc_unreachable ();
3287 }
3288 }
3289 else
3290 return -1;
3291
3292 /* There are two vector granules per quadword. */
3293 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3294 if (known_le (vl, nelts_all))
3295 return vl;
3296
3297 /* Requesting more elements than are available results in a PFALSE. */
3298 if (known_gt (vl, nelts_all))
3299 return 0;
3300
3301 return -1;
3302 }
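
/* Worked example, assuming 256-bit vectors (so aarch64_sve_vg is the
constant 4) and NELTS_PER_VQ == 4 (.s elements): there are 8 elements
per vector, so AARCH64_SV_ALL folds to 8, AARCH64_SV_MUL3 to 6,
AARCH64_SV_POW2 to 8 and AARCH64_SV_VL5 to 5, while AARCH64_SV_VL16
asks for more elements than exist and therefore folds to 0. */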
3303
3304 /* Return true if we can move VALUE into a register using a single
3305 CNT[BHWD] instruction. */
3306
3307 static bool
3308 aarch64_sve_cnt_immediate_p (poly_int64 value)
3309 {
3310 HOST_WIDE_INT factor = value.coeffs[0];
3311 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3312 return (value.coeffs[1] == factor
3313 && IN_RANGE (factor, 2, 16 * 16)
3314 && (factor & 1) == 0
3315 && factor <= 16 * (factor & -factor));
3316 }
3317
3318 /* Likewise for rtx X. */
3319
3320 bool
3321 aarch64_sve_cnt_immediate_p (rtx x)
3322 {
3323 poly_int64 value;
3324 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3325 }
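
/* Examples of values that satisfy this predicate: BYTES_PER_SVE_VECTOR
itself (the poly_int 16 + 16x, loadable with a plain CNTB), 2 + 2x
(the number of doublewords, i.e. CNTD) and 12 + 12x (CNTW with
"mul #3"). A value such as 3 + 3x fails because the coefficient is
odd, and 16 + 8x fails because it is not a whole multiple of the
vector length. */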
3326
3327 /* Return the asm string for an instruction with a CNT-like vector size
3328 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3329 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3330 first part of the operands template (the part that comes before the
3331 vector size itself). PATTERN is the pattern to use. FACTOR is the
3332 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3333 in each quadword. If it is zero, we can use any element size. */
3334
3335 static char *
3336 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3337 aarch64_svpattern pattern,
3338 unsigned int factor,
3339 unsigned int nelts_per_vq)
3340 {
3341 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3342
3343 if (nelts_per_vq == 0)
3344 /* There is some overlap in the ranges of the four CNT instructions.
3345 Here we always use the smallest possible element size, so that the
3346 multiplier is 1 wherever possible. */
3347 nelts_per_vq = factor & -factor;
3348 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3349 gcc_assert (IN_RANGE (shift, 1, 4));
3350 char suffix = "dwhb"[shift - 1];
3351
3352 factor >>= shift;
3353 unsigned int written;
3354 if (pattern == AARCH64_SV_ALL && factor == 1)
3355 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3356 prefix, suffix, operands);
3357 else if (factor == 1)
3358 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3359 prefix, suffix, operands, svpattern_token (pattern));
3360 else
3361 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3362 prefix, suffix, operands, svpattern_token (pattern),
3363 factor);
3364 gcc_assert (written < sizeof (buffer));
3365 return buffer;
3366 }
3367
3368 /* Return the asm string for an instruction with a CNT-like vector size
3369 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3370 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3371 first part of the operands template (the part that comes before the
3372 vector size itself). X is the value of the vector size operand,
3373 as a polynomial integer rtx; we need to convert this into an "all"
3374 pattern with a multiplier. */
3375
3376 char *
3377 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3378 rtx x)
3379 {
3380 poly_int64 value = rtx_to_poly_int64 (x);
3381 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3382 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3383 value.coeffs[1], 0);
3384 }
3385
3386 /* Return the asm string for an instruction with a CNT-like vector size
3387 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3388 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3389 first part of the operands template (the part that comes before the
3390 vector size itself). CNT_PAT[0..2] are the operands of the
3391 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3392
3393 char *
3394 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3395 const char *operands, rtx *cnt_pat)
3396 {
3397 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3398 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3399 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3400 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3401 factor, nelts_per_vq);
3402 }
3403
3404 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3405
3406 bool
3407 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3408 {
3409 poly_int64 value;
3410 return (poly_int_rtx_p (x, &value)
3411 && (aarch64_sve_cnt_immediate_p (value)
3412 || aarch64_sve_cnt_immediate_p (-value)));
3413 }
3414
3415 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3416 operand 0. */
3417
3418 char *
3419 aarch64_output_sve_scalar_inc_dec (rtx offset)
3420 {
3421 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3422 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3423 if (offset_value.coeffs[1] > 0)
3424 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3425 offset_value.coeffs[1], 0);
3426 else
3427 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3428 -offset_value.coeffs[1], 0);
3429 }
3430
3431 /* Return true if we can add VALUE to a register using a single ADDVL
3432 or ADDPL instruction. */
3433
3434 static bool
3435 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3436 {
3437 HOST_WIDE_INT factor = value.coeffs[0];
3438 if (factor == 0 || value.coeffs[1] != factor)
3439 return false;
3440 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3441 and a value of 16 is one vector width. */
3442 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3443 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
3444 }
3445
3446 /* Likewise for rtx X. */
3447
3448 bool
3449 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3450 {
3451 poly_int64 value;
3452 return (poly_int_rtx_p (x, &value)
3453 && aarch64_sve_addvl_addpl_immediate_p (value));
3454 }
3455
3456 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3457 to operand 1 and storing the result in operand 0. */
3458
3459 char *
3460 aarch64_output_sve_addvl_addpl (rtx offset)
3461 {
3462 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3463 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3464 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3465
3466 int factor = offset_value.coeffs[1];
3467 if ((factor & 15) == 0)
3468 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3469 else
3470 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3471 return buffer;
3472 }
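
/* Illustrative outputs: an OFFSET of 16 + 16x (one vector length in
bytes) produces "addvl\t%x0, %x1, #1", an OFFSET of 6 + 6x produces
"addpl\t%x0, %x1, #3", and an OFFSET of -2 - 2x (minus one predicate
length) produces "addpl\t%x0, %x1, #-1". */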
3473
3474 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3475 instruction. If it is, store the number of elements in each vector
3476 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3477 factor in *FACTOR_OUT (if nonnull). */
3478
3479 bool
3480 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3481 unsigned int *nelts_per_vq_out)
3482 {
3483 rtx elt;
3484 poly_int64 value;
3485
3486 if (!const_vec_duplicate_p (x, &elt)
3487 || !poly_int_rtx_p (elt, &value))
3488 return false;
3489
3490 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3491 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3492 /* There's no vector INCB. */
3493 return false;
3494
3495 HOST_WIDE_INT factor = value.coeffs[0];
3496 if (value.coeffs[1] != factor)
3497 return false;
3498
3499 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3500 if ((factor % nelts_per_vq) != 0
3501 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3502 return false;
3503
3504 if (factor_out)
3505 *factor_out = factor;
3506 if (nelts_per_vq_out)
3507 *nelts_per_vq_out = nelts_per_vq;
3508 return true;
3509 }
3510
3511 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3512 instruction. */
3513
3514 bool
3515 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3516 {
3517 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3518 }
3519
3520 /* Return the asm template for an SVE vector INC or DEC instruction.
3521 OPERANDS gives the operands before the vector count and X is the
3522 value of the vector count operand itself. */
3523
3524 char *
3525 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3526 {
3527 int factor;
3528 unsigned int nelts_per_vq;
3529 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3530 gcc_unreachable ();
3531 if (factor < 0)
3532 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3533 -factor, nelts_per_vq);
3534 else
3535 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3536 factor, nelts_per_vq);
3537 }
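
/* For example, if X is a VNx4SImode vector in which every element is
the poly_int 8 + 8x, then NELTS_PER_VQ is 4 and FACTOR is 8, giving
"incw\t<operands>, all, mul #2"; the corresponding negative constant
-4 - 4x gives a plain "decw\t<operands>". */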
3538
3539 static int
3540 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
3541 scalar_int_mode mode)
3542 {
3543 int i;
3544 unsigned HOST_WIDE_INT val, val2, mask;
3545 int one_match, zero_match;
3546 int num_insns;
3547
3548 val = INTVAL (imm);
3549
3550 if (aarch64_move_imm (val, mode))
3551 {
3552 if (generate)
3553 emit_insn (gen_rtx_SET (dest, imm));
3554 return 1;
3555 }
3556
3557 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
3558 (with XXXX non-zero). In that case check to see if the move can be done in
3559 a smaller mode. */
3560 val2 = val & 0xffffffff;
3561 if (mode == DImode
3562 && aarch64_move_imm (val2, SImode)
3563 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
3564 {
3565 if (generate)
3566 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3567
3568 /* Check if we have to emit a second instruction by checking to see
3569 if any of the upper 32 bits of the original DI mode value is set. */
3570 if (val == val2)
3571 return 1;
3572
3573 i = (val >> 48) ? 48 : 32;
3574
3575 if (generate)
3576 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3577 GEN_INT ((val >> i) & 0xffff)));
3578
3579 return 2;
3580 }
3581
3582 if ((val >> 32) == 0 || mode == SImode)
3583 {
3584 if (generate)
3585 {
3586 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
3587 if (mode == SImode)
3588 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
3589 GEN_INT ((val >> 16) & 0xffff)));
3590 else
3591 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
3592 GEN_INT ((val >> 16) & 0xffff)));
3593 }
3594 return 2;
3595 }
3596
3597 /* Remaining cases are all for DImode. */
3598
3599 mask = 0xffff;
3600 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
3601 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
3602 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
3603 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
3604
3605 if (zero_match != 2 && one_match != 2)
3606 {
3607 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
3608 For a 64-bit bitmask try whether changing 16 bits to all ones or
3609 zeroes creates a valid bitmask. To check any repeated bitmask,
3610 try using 16 bits from the other 32-bit half of val. */
3611
3612 for (i = 0; i < 64; i += 16, mask <<= 16)
3613 {
3614 val2 = val & ~mask;
3615 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3616 break;
3617 val2 = val | mask;
3618 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3619 break;
3620 val2 = val2 & ~mask;
3621 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
3622 if (val2 != val && aarch64_bitmask_imm (val2, mode))
3623 break;
3624 }
3625 if (i != 64)
3626 {
3627 if (generate)
3628 {
3629 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
3630 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3631 GEN_INT ((val >> i) & 0xffff)));
3632 }
3633 return 2;
3634 }
3635 }
3636
3637 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
3638 are emitted by the initial mov. If one_match > zero_match, skip set bits,
3639 otherwise skip zero bits. */
3640
3641 num_insns = 1;
3642 mask = 0xffff;
3643 val2 = one_match > zero_match ? ~val : val;
3644 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
3645
3646 if (generate)
3647 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
3648 ? (val | ~(mask << i))
3649 : (val & (mask << i)))));
3650 for (i += 16; i < 64; i += 16)
3651 {
3652 if ((val2 & (mask << i)) == 0)
3653 continue;
3654 if (generate)
3655 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
3656 GEN_INT ((val >> i) & 0xffff)));
3657 num_insns++;
3658 }
3659
3660 return num_insns;
3661 }
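
/* Worked example (constant invented for this comment): the DImode value
   0x0000cafe00001234 is not a single MOV immediate, but its low 32 bits
   are, and bits [63:48] are zero, so the function returns 2 and, with
   GENERATE set, emits a MOV of 0x1234 followed by a MOVK inserting
   0xcafe into bits [47:32].  */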
3662
3663 /* Return whether imm is a 128-bit immediate which is simple enough to
3664 expand inline. */
3665 bool
3666 aarch64_mov128_immediate (rtx imm)
3667 {
3668 if (GET_CODE (imm) == CONST_INT)
3669 return true;
3670
3671 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
3672
3673 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
3674 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
3675
3676 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
3677 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
3678 }
3679
3680
3681 /* Return the number of temporary registers that aarch64_add_offset_1
3682 would need to add OFFSET to a register. */
3683
3684 static unsigned int
3685 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
3686 {
3687 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
3688 }
3689
3690 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
3691 a non-polynomial OFFSET. MODE is the mode of the addition.
3692 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3693 be set and CFA adjustments added to the generated instructions.
3694
3695 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3696 temporary if register allocation is already complete. This temporary
3697 register may overlap DEST but must not overlap SRC. If TEMP1 is known
3698 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
3699 the immediate again.
3700
3701 Since this function may be used to adjust the stack pointer, we must
3702 ensure that it cannot cause transient stack deallocation (for example
3703 by first incrementing SP and then decrementing when adjusting by a
3704 large immediate). */
3705
3706 static void
3707 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
3708 rtx src, HOST_WIDE_INT offset, rtx temp1,
3709 bool frame_related_p, bool emit_move_imm)
3710 {
3711 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3712 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3713
3714 HOST_WIDE_INT moffset = abs_hwi (offset);
3715 rtx_insn *insn;
3716
3717 if (!moffset)
3718 {
3719 if (!rtx_equal_p (dest, src))
3720 {
3721 insn = emit_insn (gen_rtx_SET (dest, src));
3722 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3723 }
3724 return;
3725 }
3726
3727 /* Single instruction adjustment. */
3728 if (aarch64_uimm12_shift (moffset))
3729 {
3730 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
3731 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3732 return;
3733 }
3734
3735 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
3736 and either:
3737
3738 a) the offset cannot be loaded by a 16-bit move or
3739 b) there is no spare register into which we can move it. */
3740 if (moffset < 0x1000000
3741 && ((!temp1 && !can_create_pseudo_p ())
3742 || !aarch64_move_imm (moffset, mode)))
3743 {
3744 HOST_WIDE_INT low_off = moffset & 0xfff;
3745
3746 low_off = offset < 0 ? -low_off : low_off;
3747 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
3748 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3749 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
3750 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3751 return;
3752 }
3753
3754 /* Emit a move immediate if required and an addition/subtraction. */
3755 if (emit_move_imm)
3756 {
3757 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
3758 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
3759 }
3760 insn = emit_insn (offset < 0
3761 ? gen_sub3_insn (dest, src, temp1)
3762 : gen_add3_insn (dest, src, temp1));
3763 if (frame_related_p)
3764 {
3765 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3766 rtx adj = plus_constant (mode, src, offset);
3767 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
3768 }
3769 }
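
/* Sketch of the two-addition path above (offset chosen for illustration):
   adding 0x123456 with no spare temporary splits into an ADD of #0x456
   followed by an ADD of #0x123000, both of which fit the (optionally
   shifted) 12-bit immediate form, and neither step moves the register
   past its final value.  */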
3770
3771 /* Return the number of temporary registers that aarch64_add_offset
3772 would need to move OFFSET into a register or add OFFSET to a register;
3773 ADD_P is true if we want the latter rather than the former. */
3774
3775 static unsigned int
3776 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
3777 {
3778 /* This follows the same structure as aarch64_add_offset. */
3779 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
3780 return 0;
3781
3782 unsigned int count = 0;
3783 HOST_WIDE_INT factor = offset.coeffs[1];
3784 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3785 poly_int64 poly_offset (factor, factor);
3786 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3787 /* Need one register for the ADDVL/ADDPL result. */
3788 count += 1;
3789 else if (factor != 0)
3790 {
3791 factor = abs (factor);
3792 if (factor > 16 * (factor & -factor))
3793 /* Need one register for the CNT result and one for the multiplication
3794 factor. If necessary, the second temporary can be reused for the
3795 constant part of the offset. */
3796 return 2;
3797 /* Need one register for the CNT result (which might then
3798 be shifted). */
3799 count += 1;
3800 }
3801 return count + aarch64_add_offset_1_temporaries (constant);
3802 }
3803
3804 /* If X can be represented as a poly_int64, return the number
3805 of temporaries that are required to add it to a register.
3806 Return -1 otherwise. */
3807
3808 int
3809 aarch64_add_offset_temporaries (rtx x)
3810 {
3811 poly_int64 offset;
3812 if (!poly_int_rtx_p (x, &offset))
3813 return -1;
3814 return aarch64_offset_temporaries (true, offset);
3815 }
3816
3817 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
3818 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
3819 be set and CFA adjustments added to the generated instructions.
3820
3821 TEMP1, if nonnull, is a register of mode MODE that can be used as a
3822 temporary if register allocation is already complete. This temporary
3823 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
3824 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
3825 false to avoid emitting the immediate again.
3826
3827 TEMP2, if nonnull, is a second temporary register that doesn't
3828 overlap either DEST or SRC.
3829
3830 Since this function may be used to adjust the stack pointer, we must
3831 ensure that it cannot cause transient stack deallocation (for example
3832 by first incrementing SP and then decrementing when adjusting by a
3833 large immediate). */
3834
3835 static void
3836 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3837 poly_int64 offset, rtx temp1, rtx temp2,
3838 bool frame_related_p, bool emit_move_imm = true)
3839 {
3840 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
3841 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
3842 gcc_assert (temp1 == NULL_RTX
3843 || !frame_related_p
3844 || !reg_overlap_mentioned_p (temp1, dest));
3845 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
3846
3847 /* Try using ADDVL or ADDPL to add the whole value. */
3848 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
3849 {
3850 rtx offset_rtx = gen_int_mode (offset, mode);
3851 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3852 RTX_FRAME_RELATED_P (insn) = frame_related_p;
3853 return;
3854 }
3855
3856 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
3857 SVE vector register, over and above the minimum size of 128 bits.
3858 This is equivalent to half the value returned by CNTD with a
3859 vector shape of ALL. */
3860 HOST_WIDE_INT factor = offset.coeffs[1];
3861 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
3862
3863 /* Try using ADDVL or ADDPL to add the VG-based part. */
3864 poly_int64 poly_offset (factor, factor);
3865 if (src != const0_rtx
3866 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
3867 {
3868 rtx offset_rtx = gen_int_mode (poly_offset, mode);
3869 if (frame_related_p)
3870 {
3871 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
3872 RTX_FRAME_RELATED_P (insn) = true;
3873 src = dest;
3874 }
3875 else
3876 {
3877 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
3878 src = aarch64_force_temporary (mode, temp1, addr);
3879 temp1 = temp2;
3880 temp2 = NULL_RTX;
3881 }
3882 }
3883 /* Otherwise use a CNT-based sequence. */
3884 else if (factor != 0)
3885 {
3886 /* Use a subtraction if we have a negative factor. */
3887 rtx_code code = PLUS;
3888 if (factor < 0)
3889 {
3890 factor = -factor;
3891 code = MINUS;
3892 }
3893
3894 /* Calculate CNTD * FACTOR / 2. First try to fold the division
3895 into the multiplication. */
3896 rtx val;
3897 int shift = 0;
3898 if (factor & 1)
3899 /* Use a right shift by 1. */
3900 shift = -1;
3901 else
3902 factor /= 2;
3903 HOST_WIDE_INT low_bit = factor & -factor;
3904 if (factor <= 16 * low_bit)
3905 {
3906 if (factor > 16 * 8)
3907 {
3908 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3909 the value with the minimum multiplier and shift it into
3910 position. */
3911 int extra_shift = exact_log2 (low_bit);
3912 shift += extra_shift;
3913 factor >>= extra_shift;
3914 }
3915 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3916 }
3917 else
3918 {
3919 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
3920 directly, since that should increase the chances of being
3921 able to use a shift and add sequence. If LOW_BIT itself
3922 is out of range, just use CNTD. */
3923 if (low_bit <= 16 * 8)
3924 factor /= low_bit;
3925 else
3926 low_bit = 1;
3927
3928 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
3929 val = aarch64_force_temporary (mode, temp1, val);
3930
3931 if (can_create_pseudo_p ())
3932 {
3933 rtx coeff1 = gen_int_mode (factor, mode);
3934 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
3935 }
3936 else
3937 {
3938 /* Go back to using a negative multiplication factor if we have
3939 no register from which to subtract. */
3940 if (code == MINUS && src == const0_rtx)
3941 {
3942 factor = -factor;
3943 code = PLUS;
3944 }
3945 rtx coeff1 = gen_int_mode (factor, mode);
3946 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3947 val = gen_rtx_MULT (mode, val, coeff1);
3948 }
3949 }
3950
3951 if (shift > 0)
3952 {
3953 /* Multiply by 1 << SHIFT. */
3954 val = aarch64_force_temporary (mode, temp1, val);
3955 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3956 }
3957 else if (shift == -1)
3958 {
3959 /* Divide by 2. */
3960 val = aarch64_force_temporary (mode, temp1, val);
3961 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3962 }
3963
3964 /* Calculate SRC +/- CNTD * FACTOR / 2. */
3965 if (src != const0_rtx)
3966 {
3967 val = aarch64_force_temporary (mode, temp1, val);
3968 val = gen_rtx_fmt_ee (code, mode, src, val);
3969 }
3970 else if (code == MINUS)
3971 {
3972 val = aarch64_force_temporary (mode, temp1, val);
3973 val = gen_rtx_NEG (mode, val);
3974 }
3975
3976 if (constant == 0 || frame_related_p)
3977 {
3978 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3979 if (frame_related_p)
3980 {
3981 RTX_FRAME_RELATED_P (insn) = true;
3982 add_reg_note (insn, REG_CFA_ADJUST_CFA,
3983 gen_rtx_SET (dest, plus_constant (Pmode, src,
3984 poly_offset)));
3985 }
3986 src = dest;
3987 if (constant == 0)
3988 return;
3989 }
3990 else
3991 {
3992 src = aarch64_force_temporary (mode, temp1, val);
3993 temp1 = temp2;
3994 temp2 = NULL_RTX;
3995 }
3996
3997 emit_move_imm = true;
3998 }
3999
4000 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4001 frame_related_p, emit_move_imm);
4002 }
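
/* Worked example (offset chosen for illustration, and assuming SRC is a
   register rather than const0_rtx): adding one full SVE vector, i.e.
   poly_int64 (16, 16), has FACTOR == 16 and CONSTANT == 0, so the whole
   adjustment is caught by the first ADDVL/ADDPL test and becomes a
   single ADDVL with immediate #1.  */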
4003
4004 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4005 than a poly_int64. */
4006
4007 void
4008 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4009 rtx offset_rtx, rtx temp1, rtx temp2)
4010 {
4011 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4012 temp1, temp2, false);
4013 }
4014
4015 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4016 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4017 if TEMP1 already contains abs (DELTA). */
4018
4019 static inline void
4020 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4021 {
4022 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4023 temp1, temp2, true, emit_move_imm);
4024 }
4025
4026 /* Subtract DELTA from the stack pointer, marking the instructions
4027 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4028 if nonnull. */
4029
4030 static inline void
4031 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4032 bool emit_move_imm = true)
4033 {
4034 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4035 temp1, temp2, frame_related_p, emit_move_imm);
4036 }
4037
4038 /* Set DEST to (vec_series BASE STEP). */
4039
4040 static void
4041 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4042 {
4043 machine_mode mode = GET_MODE (dest);
4044 scalar_mode inner = GET_MODE_INNER (mode);
4045
4046 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4047 if (!aarch64_sve_index_immediate_p (base))
4048 base = force_reg (inner, base);
4049 if (!aarch64_sve_index_immediate_p (step))
4050 step = force_reg (inner, step);
4051
4052 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4053 }
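
/* As an illustration (VNx4SI destination and register number assumed for
   this comment): a base of 0 and a step of 1 both pass the immediate
   check, so the VEC_SERIES is emitted directly and should match an
   "index z<n>.s, #0, #1" pattern; out-of-range values are first forced
   into scalar registers.  */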
4054
4055 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4056 register of mode MODE. Use TARGET for the result if it's nonnull
4057 and convenient.
4058
4059 The two vector modes must have the same element mode. The behavior
4060 is to duplicate architectural lane N of SRC into architectural lanes
4061 N + I * STEP of the result. On big-endian targets, architectural
4062 lane 0 of an Advanced SIMD vector is the last element of the vector
4063 in memory layout, so for big-endian targets this operation has the
4064 effect of reversing SRC before duplicating it. Callers need to
4065 account for this. */
4066
4067 rtx
4068 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4069 {
4070 machine_mode src_mode = GET_MODE (src);
4071 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4072 insn_code icode = (BYTES_BIG_ENDIAN
4073 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4074 : code_for_aarch64_vec_duplicate_vq_le (mode));
4075
4076 unsigned int i = 0;
4077 expand_operand ops[3];
4078 create_output_operand (&ops[i++], target, mode);
4079 create_output_operand (&ops[i++], src, src_mode);
4080 if (BYTES_BIG_ENDIAN)
4081 {
4082 /* Create a PARALLEL describing the reversal of SRC. */
4083 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4084 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4085 nelts_per_vq - 1, -1);
4086 create_fixed_operand (&ops[i++], sel);
4087 }
4088 expand_insn (icode, i, ops);
4089 return ops[0].value;
4090 }
4091
4092 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4093 the memory image into DEST. Return true on success. */
4094
4095 static bool
4096 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4097 {
4098 src = force_const_mem (GET_MODE (src), src);
4099 if (!src)
4100 return false;
4101
4102 /* Make sure that the address is legitimate. */
4103 if (!aarch64_sve_ld1rq_operand_p (src))
4104 {
4105 rtx addr = force_reg (Pmode, XEXP (src, 0));
4106 src = replace_equiv_address (src, addr);
4107 }
4108
4109 machine_mode mode = GET_MODE (dest);
4110 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4111 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4112 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4113 return true;
4114 }
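
/* A minimal sketch of the expected output (register numbers and exact
   assembly spelling are illustrative): for a VNx4SI destination this
   emits something like "ld1rqw z<n>.s, p<m>/z, [x<k>]", broadcasting the
   128-bit memory image into every quadword of the SVE register.  */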
4115
4116 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4117 SVE data mode and isn't a legitimate constant. Use TARGET for the
4118 result if convenient.
4119
4120 The returned register can have whatever mode seems most natural
4121 given the contents of SRC. */
4122
4123 static rtx
4124 aarch64_expand_sve_const_vector (rtx target, rtx src)
4125 {
4126 machine_mode mode = GET_MODE (src);
4127 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4128 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4129 scalar_mode elt_mode = GET_MODE_INNER (mode);
4130 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4131 unsigned int container_bits = aarch64_sve_container_bits (mode);
4132 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4133
4134 if (nelts_per_pattern == 1
4135 && encoded_bits <= 128
4136 && container_bits != elt_bits)
4137 {
4138 /* We have a partial vector mode and a constant whose full-vector
4139 equivalent would occupy a repeating 128-bit sequence. Build that
4140 full-vector equivalent instead, so that we have the option of
4141 using LD1RQ and Advanced SIMD operations. */
4142 unsigned int repeat = container_bits / elt_bits;
4143 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4144 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4145 for (unsigned int i = 0; i < npatterns; ++i)
4146 for (unsigned int j = 0; j < repeat; ++j)
4147 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4148 target = aarch64_target_reg (target, full_mode);
4149 return aarch64_expand_sve_const_vector (target, builder.build ());
4150 }
4151
4152 if (nelts_per_pattern == 1 && encoded_bits == 128)
4153 {
4154 /* The constant is a duplicated quadword but can't be narrowed
4155 beyond a quadword. Get the memory image of the first quadword
4156 as a 128-bit vector and try using LD1RQ to load it from memory.
4157
4158 The effect for both endiannesses is to load memory lane N into
4159 architectural lanes N + I * STEP of the result. On big-endian
4160 targets, the layout of the 128-bit vector in an Advanced SIMD
4161 register would be different from its layout in an SVE register,
4162 but this 128-bit vector is a memory value only. */
4163 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4164 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4165 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4166 return target;
4167 }
4168
4169 if (nelts_per_pattern == 1 && encoded_bits < 128)
4170 {
4171 /* The vector is a repeating sequence of 64 bits or fewer.
4172 See if we can load them using an Advanced SIMD move and then
4173 duplicate it to fill a vector. This is better than using a GPR
4174 move because it keeps everything in the same register file. */
4175 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4176 rtx_vector_builder builder (vq_mode, npatterns, 1);
4177 for (unsigned int i = 0; i < npatterns; ++i)
4178 {
4179 /* We want memory lane N to go into architectural lane N,
4180 so reverse for big-endian targets. The DUP .Q pattern
4181 has a compensating reverse built-in. */
4182 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4183 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4184 }
4185 rtx vq_src = builder.build ();
4186 if (aarch64_simd_valid_immediate (vq_src, NULL))
4187 {
4188 vq_src = force_reg (vq_mode, vq_src);
4189 return aarch64_expand_sve_dupq (target, mode, vq_src);
4190 }
4191
4192 /* Get an integer representation of the repeating part of Advanced
4193 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4194 which for big-endian targets is lane-swapped wrt a normal
4195 Advanced SIMD vector. This means that for both endiannesses,
4196 memory lane N of SVE vector SRC corresponds to architectural
4197 lane N of a register holding VQ_SRC. This in turn means that
4198 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4199 as a single 128-bit value) and thus that memory lane 0 of SRC is
4200 in the lsb of the integer. Duplicating the integer therefore
4201 ensures that memory lane N of SRC goes into architectural lane
4202 N + I * INDEX of the SVE register. */
4203 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4204 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4205 if (elt_value)
4206 {
4207 /* Pretend that we had a vector of INT_MODE to start with. */
4208 elt_mode = int_mode;
4209 mode = aarch64_full_sve_mode (int_mode).require ();
4210
4211 /* If the integer can be moved into a general register by a
4212 single instruction, do that and duplicate the result. */
4213 if (CONST_INT_P (elt_value)
4214 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4215 {
4216 elt_value = force_reg (elt_mode, elt_value);
4217 return expand_vector_broadcast (mode, elt_value);
4218 }
4219 }
4220 else if (npatterns == 1)
4221 /* We're duplicating a single value, but can't do better than
4222 force it to memory and load from there. This handles things
4223 like symbolic constants. */
4224 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4225
4226 if (elt_value)
4227 {
4228 /* Load the element from memory if we can, otherwise move it into
4229 a register and use a DUP. */
4230 rtx op = force_const_mem (elt_mode, elt_value);
4231 if (!op)
4232 op = force_reg (elt_mode, elt_value);
4233 return expand_vector_broadcast (mode, op);
4234 }
4235 }
4236
4237 /* Try using INDEX. */
4238 rtx base, step;
4239 if (const_vec_series_p (src, &base, &step))
4240 {
4241 aarch64_expand_vec_series (target, base, step);
4242 return target;
4243 }
4244
4245 /* From here on, it's better to force the whole constant to memory
4246 if we can. */
4247 if (GET_MODE_NUNITS (mode).is_constant ())
4248 return NULL_RTX;
4249
4250 /* Expand each pattern individually. */
4251 gcc_assert (npatterns > 1);
4252 rtx_vector_builder builder;
4253 auto_vec<rtx, 16> vectors (npatterns);
4254 for (unsigned int i = 0; i < npatterns; ++i)
4255 {
4256 builder.new_vector (mode, 1, nelts_per_pattern);
4257 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4258 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4259 vectors.quick_push (force_reg (mode, builder.build ()));
4260 }
4261
4262 /* Use permutes to interleave the separate vectors. */
4263 while (npatterns > 1)
4264 {
4265 npatterns /= 2;
4266 for (unsigned int i = 0; i < npatterns; ++i)
4267 {
4268 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4269 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4270 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4271 vectors[i] = tmp;
4272 }
4273 }
4274 gcc_assert (vectors[0] == target);
4275 return target;
4276 }
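
/* Example of the final interleaving step (element names invented for this
   comment): with NPATTERNS == 2, the per-pattern vectors { a0, a1, ... }
   and { b0, b1, ... } are loaded separately and a single ZIP1 produces
   { a0, b0, a1, b1, ... }, which is the element order of the original
   constant.  */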
4277
4278 /* Use WHILE to set a predicate register of mode MODE in which the first
4279 VL bits are set and the rest are clear. Use TARGET for the register
4280 if it's nonnull and convenient. */
4281
4282 static rtx
4283 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4284 unsigned int vl)
4285 {
4286 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4287 target = aarch64_target_reg (target, mode);
4288 emit_insn (gen_while (UNSPEC_WHILE_LO, DImode, mode,
4289 target, const0_rtx, limit));
4290 return target;
4291 }
4292
4293 static rtx
4294 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4295
4296 /* BUILDER is a constant predicate in which the index of every set bit
4297 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4298 by inverting every element at a multiple of ELT_SIZE and EORing the
4299 result with an ELT_SIZE PTRUE.
4300
4301 Return a register that contains the constant on success, otherwise
4302 return null. Use TARGET as the register if it is nonnull and
4303 convenient. */
4304
4305 static rtx
4306 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4307 unsigned int elt_size)
4308 {
4309 /* Invert every element at a multiple of ELT_SIZE, keeping the
4310 other bits zero. */
4311 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4312 builder.nelts_per_pattern ());
4313 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4314 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4315 inv_builder.quick_push (const1_rtx);
4316 else
4317 inv_builder.quick_push (const0_rtx);
4318 inv_builder.finalize ();
4319
4320 /* See if we can load the constant cheaply. */
4321 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4322 if (!inv)
4323 return NULL_RTX;
4324
4325 /* EOR the result with an ELT_SIZE PTRUE. */
4326 rtx mask = aarch64_ptrue_all (elt_size);
4327 mask = force_reg (VNx16BImode, mask);
4328 target = aarch64_target_reg (target, VNx16BImode);
4329 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4330 return target;
4331 }
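
/* Concrete illustration (predicate contents invented for this comment):
   the .B predicate { 0, 1, 1, 1, ... } matches no PTRUE pattern, but its
   inverse { 1, 0, 0, 0, ... } is a simple PTRUE VL1, so the constant can
   be built as that PTRUE EORed with a PTRUE .B ALL mask.  */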
4332
4333 /* BUILDER is a constant predicate in which the index of every set bit
4334 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4335 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4336 register on success, otherwise return null. Use TARGET as the register
4337 if nonnull and convenient. */
4338
4339 static rtx
4340 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4341 unsigned int elt_size,
4342 unsigned int permute_size)
4343 {
4344 /* We're going to split the constant into two new constants A and B,
4345 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4346 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4347
4348 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4349 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4350
4351 where _ indicates elements that will be discarded by the permute.
4352
4353 First calculate the ELT_SIZEs for A and B. */
4354 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4355 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4356 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4357 if (INTVAL (builder.elt (i)) != 0)
4358 {
4359 if (i & permute_size)
4360 b_elt_size |= i - permute_size;
4361 else
4362 a_elt_size |= i;
4363 }
4364 a_elt_size &= -a_elt_size;
4365 b_elt_size &= -b_elt_size;
4366
4367 /* Now construct the vectors themselves. */
4368 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4369 builder.nelts_per_pattern ());
4370 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4371 builder.nelts_per_pattern ());
4372 unsigned int nelts = builder.encoded_nelts ();
4373 for (unsigned int i = 0; i < nelts; ++i)
4374 if (i & (elt_size - 1))
4375 {
4376 a_builder.quick_push (const0_rtx);
4377 b_builder.quick_push (const0_rtx);
4378 }
4379 else if ((i & permute_size) == 0)
4380 {
4381 /* The A and B elements are significant. */
4382 a_builder.quick_push (builder.elt (i));
4383 b_builder.quick_push (builder.elt (i + permute_size));
4384 }
4385 else
4386 {
4387 /* The A and B elements are going to be discarded, so pick whatever
4388 is likely to give a nice constant. We are targeting element
4389 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4390 with the aim of each being a sequence of ones followed by
4391 a sequence of zeros. So:
4392
4393 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4394 duplicate the last X_ELT_SIZE element, to extend the
4395 current sequence of ones or zeros.
4396
4397 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4398 zero, so that the constant really does have X_ELT_SIZE and
4399 not a smaller size. */
4400 if (a_elt_size > permute_size)
4401 a_builder.quick_push (const0_rtx);
4402 else
4403 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4404 if (b_elt_size > permute_size)
4405 b_builder.quick_push (const0_rtx);
4406 else
4407 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4408 }
4409 a_builder.finalize ();
4410 b_builder.finalize ();
4411
4412 /* Try loading A into a register. */
4413 rtx_insn *last = get_last_insn ();
4414 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4415 if (!a)
4416 return NULL_RTX;
4417
4418 /* Try loading B into a register. */
4419 rtx b = a;
4420 if (a_builder != b_builder)
4421 {
4422 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4423 if (!b)
4424 {
4425 delete_insns_since (last);
4426 return NULL_RTX;
4427 }
4428 }
4429
4430 /* Emit the TRN1 itself. */
4431 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4432 target = aarch64_target_reg (target, mode);
4433 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4434 gen_lowpart (mode, a),
4435 gen_lowpart (mode, b)));
4436 return target;
4437 }
4438
4439 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4440 constant in BUILDER into an SVE predicate register. Return the register
4441 on success, otherwise return null. Use TARGET for the register if
4442 nonnull and convenient.
4443
4444 ALLOW_RECURSE_P is true if we can use methods that would call this
4445 function recursively. */
4446
4447 static rtx
4448 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4449 bool allow_recurse_p)
4450 {
4451 if (builder.encoded_nelts () == 1)
4452 /* A PFALSE or a PTRUE .B ALL. */
4453 return aarch64_emit_set_immediate (target, builder);
4454
4455 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4456 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4457 {
4458 /* If we can load the constant using PTRUE, use it as-is. */
4459 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4460 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4461 return aarch64_emit_set_immediate (target, builder);
4462
4463 /* Otherwise use WHILE to set the first VL bits. */
4464 return aarch64_sve_move_pred_via_while (target, mode, vl);
4465 }
4466
4467 if (!allow_recurse_p)
4468 return NULL_RTX;
4469
4470 /* Try inverting the vector in element size ELT_SIZE and then EORing
4471 the result with an ELT_SIZE PTRUE. */
4472 if (INTVAL (builder.elt (0)) == 0)
4473 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4474 elt_size))
4475 return res;
4476
4477 /* Try using TRN1 to permute two simpler constants. */
4478 for (unsigned int i = elt_size; i <= 8; i *= 2)
4479 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4480 elt_size, i))
4481 return res;
4482
4483 return NULL_RTX;
4484 }
4485
4486 /* Return an SVE predicate register that contains the VNx16BImode
4487 constant in BUILDER, without going through the move expanders.
4488
4489 The returned register can have whatever mode seems most natural
4490 given the contents of BUILDER. Use TARGET for the result if
4491 convenient. */
4492
4493 static rtx
4494 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4495 {
4496 /* Try loading the constant using pure predicate operations. */
4497 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
4498 return res;
4499
4500 /* Try forcing the constant to memory. */
4501 if (builder.full_nelts ().is_constant ())
4502 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
4503 {
4504 target = aarch64_target_reg (target, VNx16BImode);
4505 emit_move_insn (target, mem);
4506 return target;
4507 }
4508
4509 /* The last resort is to load the constant as an integer and then
4510 compare it against zero. Use -1 for set bits in order to increase
4511 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
4512 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
4513 builder.nelts_per_pattern ());
4514 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4515 int_builder.quick_push (INTVAL (builder.elt (i))
4516 ? constm1_rtx : const0_rtx);
4517 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
4518 int_builder.build ());
4519 }
4520
4521 /* Set DEST to immediate IMM. */
4522
4523 void
4524 aarch64_expand_mov_immediate (rtx dest, rtx imm)
4525 {
4526 machine_mode mode = GET_MODE (dest);
4527
4528 /* Check on what type of symbol it is. */
4529 scalar_int_mode int_mode;
4530 if ((GET_CODE (imm) == SYMBOL_REF
4531 || GET_CODE (imm) == LABEL_REF
4532 || GET_CODE (imm) == CONST
4533 || GET_CODE (imm) == CONST_POLY_INT)
4534 && is_a <scalar_int_mode> (mode, &int_mode))
4535 {
4536 rtx mem;
4537 poly_int64 offset;
4538 HOST_WIDE_INT const_offset;
4539 enum aarch64_symbol_type sty;
4540
4541 /* If we have (const (plus symbol offset)), separate out the offset
4542 before we start classifying the symbol. */
4543 rtx base = strip_offset (imm, &offset);
4544
4545 /* We must always add an offset involving VL separately, rather than
4546 folding it into the relocation. */
4547 if (!offset.is_constant (&const_offset))
4548 {
4549 if (!TARGET_SVE)
4550 {
4551 aarch64_report_sve_required ();
4552 return;
4553 }
4554 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
4555 emit_insn (gen_rtx_SET (dest, imm));
4556 else
4557 {
4558 /* Do arithmetic on 32-bit values if the result is smaller
4559 than that. */
4560 if (partial_subreg_p (int_mode, SImode))
4561 {
4562 /* It is invalid to do symbol calculations in modes
4563 narrower than SImode. */
4564 gcc_assert (base == const0_rtx);
4565 dest = gen_lowpart (SImode, dest);
4566 int_mode = SImode;
4567 }
4568 if (base != const0_rtx)
4569 {
4570 base = aarch64_force_temporary (int_mode, dest, base);
4571 aarch64_add_offset (int_mode, dest, base, offset,
4572 NULL_RTX, NULL_RTX, false);
4573 }
4574 else
4575 aarch64_add_offset (int_mode, dest, base, offset,
4576 dest, NULL_RTX, false);
4577 }
4578 return;
4579 }
4580
4581 sty = aarch64_classify_symbol (base, const_offset);
4582 switch (sty)
4583 {
4584 case SYMBOL_FORCE_TO_MEM:
4585 if (const_offset != 0
4586 && targetm.cannot_force_const_mem (int_mode, imm))
4587 {
4588 gcc_assert (can_create_pseudo_p ());
4589 base = aarch64_force_temporary (int_mode, dest, base);
4590 aarch64_add_offset (int_mode, dest, base, const_offset,
4591 NULL_RTX, NULL_RTX, false);
4592 return;
4593 }
4594
4595 mem = force_const_mem (ptr_mode, imm);
4596 gcc_assert (mem);
4597
4598 /* If we aren't generating PC relative literals, then
4599 we need to expand the literal pool access carefully.
4600 This is something that needs to be done in a number
4601 of places, so could well live as a separate function. */
4602 if (!aarch64_pcrelative_literal_loads)
4603 {
4604 gcc_assert (can_create_pseudo_p ());
4605 base = gen_reg_rtx (ptr_mode);
4606 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
4607 if (ptr_mode != Pmode)
4608 base = convert_memory_address (Pmode, base);
4609 mem = gen_rtx_MEM (ptr_mode, base);
4610 }
4611
4612 if (int_mode != ptr_mode)
4613 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
4614
4615 emit_insn (gen_rtx_SET (dest, mem));
4616
4617 return;
4618
4619 case SYMBOL_SMALL_TLSGD:
4620 case SYMBOL_SMALL_TLSDESC:
4621 case SYMBOL_SMALL_TLSIE:
4622 case SYMBOL_SMALL_GOT_28K:
4623 case SYMBOL_SMALL_GOT_4G:
4624 case SYMBOL_TINY_GOT:
4625 case SYMBOL_TINY_TLSIE:
4626 if (const_offset != 0)
4627 {
4628 gcc_assert (can_create_pseudo_p ());
4629 base = aarch64_force_temporary (int_mode, dest, base);
4630 aarch64_add_offset (int_mode, dest, base, const_offset,
4631 NULL_RTX, NULL_RTX, false);
4632 return;
4633 }
4634 /* FALLTHRU */
4635
4636 case SYMBOL_SMALL_ABSOLUTE:
4637 case SYMBOL_TINY_ABSOLUTE:
4638 case SYMBOL_TLSLE12:
4639 case SYMBOL_TLSLE24:
4640 case SYMBOL_TLSLE32:
4641 case SYMBOL_TLSLE48:
4642 aarch64_load_symref_appropriately (dest, imm, sty);
4643 return;
4644
4645 default:
4646 gcc_unreachable ();
4647 }
4648 }
4649
4650 if (!CONST_INT_P (imm))
4651 {
4652 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
4653 {
4654 /* Only the low bit of each .H, .S and .D element is defined,
4655 so we can set the upper bits to whatever we like. If the
4656 predicate is all-true in MODE, prefer to set all the undefined
4657 bits as well, so that we can share a single .B predicate for
4658 all modes. */
4659 if (imm == CONSTM1_RTX (mode))
4660 imm = CONSTM1_RTX (VNx16BImode);
4661
4662 /* All methods for constructing predicate modes wider than VNx16BI
4663 will set the upper bits of each element to zero. Expose this
4664 by moving such constants as a VNx16BI, so that all bits are
4665 significant and so that constants for different modes can be
4666 shared. The wider constant will still be available as a
4667 REG_EQUAL note. */
4668 rtx_vector_builder builder;
4669 if (aarch64_get_sve_pred_bits (builder, imm))
4670 {
4671 rtx res = aarch64_expand_sve_const_pred (dest, builder);
4672 if (dest != res)
4673 emit_move_insn (dest, gen_lowpart (mode, res));
4674 return;
4675 }
4676 }
4677
4678 if (GET_CODE (imm) == HIGH
4679 || aarch64_simd_valid_immediate (imm, NULL))
4680 {
4681 emit_insn (gen_rtx_SET (dest, imm));
4682 return;
4683 }
4684
4685 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
4686 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
4687 {
4688 if (dest != res)
4689 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
4690 return;
4691 }
4692
4693 rtx mem = force_const_mem (mode, imm);
4694 gcc_assert (mem);
4695 emit_move_insn (dest, mem);
4696 return;
4697 }
4698
4699 aarch64_internal_mov_immediate (dest, imm, true,
4700 as_a <scalar_int_mode> (mode));
4701 }
4702
4703 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
4704 that is known to contain PTRUE. */
4705
4706 void
4707 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
4708 {
4709 expand_operand ops[3];
4710 machine_mode mode = GET_MODE (dest);
4711 create_output_operand (&ops[0], dest, mode);
4712 create_input_operand (&ops[1], pred, GET_MODE (pred));
4713 create_input_operand (&ops[2], src, mode);
4714 temporary_volatile_ok v (true);
4715 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
4716 }
4717
4718 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
4719 operand is in memory. In this case we need to use the predicated LD1
4720 and ST1 instead of LDR and STR, both for correctness on big-endian
4721 targets and because LD1 and ST1 support a wider range of addressing modes.
4722 PRED_MODE is the mode of the predicate.
4723
4724 See the comment at the head of aarch64-sve.md for details about the
4725 big-endian handling. */
4726
4727 void
4728 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
4729 {
4730 machine_mode mode = GET_MODE (dest);
4731 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4732 if (!register_operand (src, mode)
4733 && !register_operand (dest, mode))
4734 {
4735 rtx tmp = gen_reg_rtx (mode);
4736 if (MEM_P (src))
4737 aarch64_emit_sve_pred_move (tmp, ptrue, src);
4738 else
4739 emit_move_insn (tmp, src);
4740 src = tmp;
4741 }
4742 aarch64_emit_sve_pred_move (dest, ptrue, src);
4743 }
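
/* Hedged example of the memory-to-memory case (registers and addresses
   are illustrative): copying one VNx2DI object to another becomes a
   predicated load such as "ld1d z<n>.d, p<m>/z, [...]" into a fresh
   temporary, followed by a matching "st1d z<n>.d, p<m>, [...]".  */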
4744
4745 /* Called only on big-endian targets. See whether an SVE vector move
4746 from SRC to DEST is effectively a REV[BHW] instruction, because at
4747 least one operand is a subreg of an SVE vector that has wider or
4748 narrower elements. Return true and emit the instruction if so.
4749
4750 For example:
4751
4752 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
4753
4754 represents a VIEW_CONVERT between the following vectors, viewed
4755 in memory order:
4756
4757 R2: { [0].high, [0].low, [1].high, [1].low, ... }
4758 R1: { [0], [1], [2], [3], ... }
4759
4760 The high part of lane X in R2 should therefore correspond to lane X*2
4761 of R1, but the register representations are:
4762
4763 msb lsb
4764 R2: ...... [1].high [1].low [0].high [0].low
4765 R1: ...... [3] [2] [1] [0]
4766
4767 where the low part of lane X in R2 corresponds to lane X*2 in R1.
4768 We therefore need a reverse operation to swap the high and low values
4769 around.
4770
4771 This is purely an optimization. Without it we would spill the
4772 subreg operand to the stack in one mode and reload it in the
4773 other mode, which has the same effect as the REV. */
4774
4775 bool
4776 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
4777 {
4778 gcc_assert (BYTES_BIG_ENDIAN);
4779 if (GET_CODE (dest) == SUBREG)
4780 dest = SUBREG_REG (dest);
4781 if (GET_CODE (src) == SUBREG)
4782 src = SUBREG_REG (src);
4783
4784 /* The optimization handles two single SVE REGs with different element
4785 sizes. */
4786 if (!REG_P (dest)
4787 || !REG_P (src)
4788 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
4789 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
4790 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
4791 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
4792 return false;
4793
4794 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
4795 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
4796 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
4797 UNSPEC_REV_SUBREG);
4798 emit_insn (gen_rtx_SET (dest, unspec));
4799 return true;
4800 }
4801
4802 /* Return a copy of X with mode MODE, without changing its other
4803 attributes. Unlike gen_lowpart, this doesn't care whether the
4804 mode change is valid. */
4805
4806 rtx
4807 aarch64_replace_reg_mode (rtx x, machine_mode mode)
4808 {
4809 if (GET_MODE (x) == mode)
4810 return x;
4811
4812 x = shallow_copy_rtx (x);
4813 set_mode_and_regno (x, mode, REGNO (x));
4814 return x;
4815 }
4816
4817 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
4818 stored in wider integer containers. */
4819
4820 static unsigned int
4821 aarch64_sve_rev_unspec (machine_mode mode)
4822 {
4823 switch (GET_MODE_UNIT_SIZE (mode))
4824 {
4825 case 1: return UNSPEC_REVB;
4826 case 2: return UNSPEC_REVH;
4827 case 4: return UNSPEC_REVW;
4828 }
4829 gcc_unreachable ();
4830 }
4831
4832 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
4833 operands. */
4834
4835 void
4836 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
4837 {
4838 /* Decide which REV operation we need. The mode with wider elements
4839 determines the mode of the operands and the mode with the narrower
4840 elements determines the reverse width. */
4841 machine_mode mode_with_wider_elts = GET_MODE (dest);
4842 machine_mode mode_with_narrower_elts = GET_MODE (src);
4843 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
4844 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
4845 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
4846
4847 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
4848 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
4849
4850 /* Get the operands in the appropriate modes and emit the instruction. */
4851 ptrue = gen_lowpart (pred_mode, ptrue);
4852 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
4853 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
4854 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
4855 dest, ptrue, src));
4856 }
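
/* For the example in the comment before aarch64_maybe_expand_sve_subreg_move
   (a VNx8HI view of a VNx16QI register), the narrower elements are bytes,
   so the split form is a predicated byte reversal within halfword
   containers, e.g. "revb z<d>.h, p<g>/m, z<s>.h" (register numbers
   illustrative).  */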
4857
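/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */
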
4858 static bool
4859 aarch64_function_ok_for_sibcall (tree, tree exp)
4860 {
4861 if (crtl->abi->id () != expr_callee_abi (exp).id ())
4862 return false;
4863
4864 return true;
4865 }
4866
4867 /* Implement TARGET_PASS_BY_REFERENCE. */
4868
4869 static bool
4870 aarch64_pass_by_reference (cumulative_args_t pcum_v,
4871 const function_arg_info &arg)
4872 {
4873 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4874 HOST_WIDE_INT size;
4875 machine_mode dummymode;
4876 int nregs;
4877
4878 unsigned int num_zr, num_pr;
4879 if (arg.type && aarch64_sve_argument_p (arg.type, &num_zr, &num_pr))
4880 {
4881 if (pcum && !pcum->silent_p && !TARGET_SVE)
4882 /* We can't gracefully recover at this point, so make this a
4883 fatal error. */
4884 fatal_error (input_location, "arguments of type %qT require"
4885 " the SVE ISA extension", arg.type);
4886
4887 /* Variadic SVE types are passed by reference. Normal non-variadic
4888 arguments are too if we've run out of registers. */
4889 return (!arg.named
4890 || pcum->aapcs_nvrn + num_zr > NUM_FP_ARG_REGS
4891 || pcum->aapcs_nprn + num_pr > NUM_PR_ARG_REGS);
4892 }
4893
4894 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
4895 if (arg.mode == BLKmode && arg.type)
4896 size = int_size_in_bytes (arg.type);
4897 else
4898 /* No frontends can create types with variable-sized modes, so we
4899 shouldn't be asked to pass or return them. */
4900 size = GET_MODE_SIZE (arg.mode).to_constant ();
4901
4902 /* Aggregates are passed by reference based on their size. */
4903 if (arg.aggregate_type_p ())
4904 size = int_size_in_bytes (arg.type);
4905
4906 /* Variable-sized arguments are always passed by reference. */
4907 if (size < 0)
4908 return true;
4909
4910 /* Can this be a candidate to be passed in fp/simd register(s)? */
4911 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
4912 &dummymode, &nregs,
4913 NULL))
4914 return false;
4915
4916 /* Arguments that are variable-sized or larger than 2 registers are
4917 passed by reference unless they are a homogeneous floating-point
4918 aggregate. */
4919 return size > 2 * UNITS_PER_WORD;
4920 }
4921
4922 /* Return TRUE if VALTYPE is padded to its least significant bits. */
4923 static bool
4924 aarch64_return_in_msb (const_tree valtype)
4925 {
4926 machine_mode dummy_mode;
4927 int dummy_int;
4928
4929 /* Never happens in little-endian mode. */
4930 if (!BYTES_BIG_ENDIAN)
4931 return false;
4932
4933 /* Only composite types smaller than or equal to 16 bytes can
4934 be potentially returned in registers. */
4935 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
4936 || int_size_in_bytes (valtype) <= 0
4937 || int_size_in_bytes (valtype) > 16)
4938 return false;
4939
4940 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
4941 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
4942 is always passed/returned in the least significant bits of fp/simd
4943 register(s). */
4944 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
4945 &dummy_mode, &dummy_int, NULL))
4946 return false;
4947
4948 return true;
4949 }
4950
4951 /* Subroutine of aarch64_function_value. MODE is the mode of the argument
4952 after promotion, and after partial SVE types have been replaced by
4953 their integer equivalents. */
4954 static rtx
4955 aarch64_function_value_1 (const_tree type, machine_mode mode)
4956 {
4957 unsigned int num_zr, num_pr;
4958 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
4959 {
4960 /* Don't raise an error here if we're called when SVE is disabled,
4961 since this is really just a query function. Other code must
4962 do that where appropriate. */
4963 mode = TYPE_MODE_RAW (type);
4964 gcc_assert (VECTOR_MODE_P (mode)
4965 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
4966
4967 if (num_zr > 0 && num_pr == 0)
4968 return gen_rtx_REG (mode, V0_REGNUM);
4969
4970 if (num_zr == 0 && num_pr == 1)
4971 return gen_rtx_REG (mode, P0_REGNUM);
4972
4973 gcc_unreachable ();
4974 }
4975
4976 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
4977 returned in memory, not by value. */
4978 gcc_assert (!aarch64_sve_mode_p (mode));
4979
4980 if (aarch64_return_in_msb (type))
4981 {
4982 HOST_WIDE_INT size = int_size_in_bytes (type);
4983
4984 if (size % UNITS_PER_WORD != 0)
4985 {
4986 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
4987 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
4988 }
4989 }
4990
4991 int count;
4992 machine_mode ag_mode;
4993 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
4994 &ag_mode, &count, NULL))
4995 {
4996 if (!aarch64_composite_type_p (type, mode))
4997 {
4998 gcc_assert (count == 1 && mode == ag_mode);
4999 return gen_rtx_REG (mode, V0_REGNUM);
5000 }
5001 else
5002 {
5003 int i;
5004 rtx par;
5005
5006 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5007 for (i = 0; i < count; i++)
5008 {
5009 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5010 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5011 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5012 XVECEXP (par, 0, i) = tmp;
5013 }
5014 return par;
5015 }
5016 }
5017 else
5018 return gen_rtx_REG (mode, R0_REGNUM);
5019 }
5020
5021 /* Implement TARGET_FUNCTION_VALUE.
5022 Define how to find the value returned by a function. */
5023
5024 static rtx
5025 aarch64_function_value (const_tree type, const_tree func,
5026 bool outgoing ATTRIBUTE_UNUSED)
5027 {
5028 machine_mode mode;
5029 int unsignedp;
5030
5031 mode = TYPE_MODE (type);
5032 if (INTEGRAL_TYPE_P (type))
5033 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5034
5035 /* Vector types can acquire a partial SVE mode using things like
5036 __attribute__((vector_size(N))), and this is potentially useful.
5037 However, the choice of mode doesn't affect the type's ABI identity,
5038 so we should treat the types as though they had the associated
5039 integer mode, just like they did before SVE was introduced.
5040
5041 We know that the vector must be 128 bits or smaller, otherwise we'd
5042 have returned it in memory instead. */
5043 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5044 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5045 {
5046 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5047 rtx reg = aarch64_function_value_1 (type, int_mode);
5048 /* Vector types are never returned in the MSB and are never split. */
5049 gcc_assert (REG_P (reg) && GET_MODE (reg) == int_mode);
5050 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5051 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, pair));
5052 }
5053
5054 return aarch64_function_value_1 (type, mode);
5055 }
5056
5057 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5058 Return true if REGNO is the number of a hard register in which the values
5059 of called function may come back. */
5060
5061 static bool
5062 aarch64_function_value_regno_p (const unsigned int regno)
5063 {
5064 /* Maximum of 16 bytes can be returned in the general registers. Examples
5065 of 16-byte return values are: 128-bit integers and 16-byte small
5066 structures (excluding homogeneous floating-point aggregates). */
5067 if (regno == R0_REGNUM || regno == R1_REGNUM)
5068 return true;
5069
5070 /* Up to four fp/simd registers can return a function value, e.g. a
5071 homogeneous floating-point aggregate having four members. */
5072 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5073 return TARGET_FLOAT;
5074
5075 return false;
5076 }
5077
5078 /* Implement TARGET_RETURN_IN_MEMORY.
5079
5080 If the type T of the result of a function is such that
5081 void func (T arg)
5082 would require that arg be passed as a value in a register (or set of
5083 registers) according to the parameter passing rules, then the result
5084 is returned in the same registers as would be used for such an
5085 argument. */
5086
5087 static bool
5088 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5089 {
5090 HOST_WIDE_INT size;
5091 machine_mode ag_mode;
5092 int count;
5093
5094 if (!AGGREGATE_TYPE_P (type)
5095 && TREE_CODE (type) != COMPLEX_TYPE
5096 && TREE_CODE (type) != VECTOR_TYPE)
5097 /* Simple scalar types are always returned in registers. */
5098 return false;
5099
5100 unsigned int num_zr, num_pr;
5101 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
5102 {
5103 /* All SVE types we support fit in registers. For example, it isn't
5104 yet possible to define an aggregate of 9+ SVE vectors or 5+ SVE
5105 predicates. */
5106 gcc_assert (num_zr <= NUM_FP_ARG_REGS && num_pr <= NUM_PR_ARG_REGS);
5107 return false;
5108 }
5109
5110 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
5111 type,
5112 &ag_mode,
5113 &count,
5114 NULL))
5115 return false;
5116
5117 /* Types larger than 2 registers are returned in memory. */
5118 size = int_size_in_bytes (type);
5119 return (size < 0 || size > 2 * UNITS_PER_WORD);
5120 }
5121
5122 static bool
5123 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5124 const_tree type, int *nregs)
5125 {
5126 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5127 return aarch64_vfp_is_call_or_return_candidate (mode,
5128 type,
5129 &pcum->aapcs_vfp_rmode,
5130 nregs,
5131 NULL);
5132 }
5133
5134 /* Given MODE and TYPE of a function argument, return the alignment in
5135 bits. The idea is to suppress any stronger alignment requested by
5136 the user and opt for the natural alignment (specified in AAPCS64 \S
5137 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5138 calculated in versions of GCC prior to GCC-9. This is a helper
5139 function for local use only. */
5140
5141 static unsigned int
5142 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5143 bool *abi_break)
5144 {
5145 *abi_break = false;
5146 if (!type)
5147 return GET_MODE_ALIGNMENT (mode);
5148
5149 if (integer_zerop (TYPE_SIZE (type)))
5150 return 0;
5151
5152 gcc_assert (TYPE_MODE (type) == mode);
5153
5154 if (!AGGREGATE_TYPE_P (type))
5155 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5156
5157 if (TREE_CODE (type) == ARRAY_TYPE)
5158 return TYPE_ALIGN (TREE_TYPE (type));
5159
5160 unsigned int alignment = 0;
5161 unsigned int bitfield_alignment = 0;
5162 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5163 if (TREE_CODE (field) == FIELD_DECL)
5164 {
5165 alignment = std::max (alignment, DECL_ALIGN (field));
5166 if (DECL_BIT_FIELD_TYPE (field))
5167 bitfield_alignment
5168 = std::max (bitfield_alignment,
5169 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5170 }
5171
5172 if (bitfield_alignment > alignment)
5173 {
5174 *abi_break = true;
5175 return bitfield_alignment;
5176 }
5177
5178 return alignment;
5179 }
5180
5181 /* Layout a function argument according to the AAPCS64 rules. The rule
5182 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5183 mode that was originally given to us by the target hook, whereas the
5184 mode in ARG might be the result of replacing partial SVE modes with
5185 the equivalent integer mode. */
5186
5187 static void
5188 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg,
5189 machine_mode orig_mode)
5190 {
5191 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5192 tree type = arg.type;
5193 machine_mode mode = arg.mode;
5194 int ncrn, nvrn, nregs;
5195 bool allocate_ncrn, allocate_nvrn;
5196 HOST_WIDE_INT size;
5197 bool abi_break;
5198
5199 /* We need to do this once per argument. */
5200 if (pcum->aapcs_arg_processed)
5201 return;
5202
5203 /* Vector types can acquire a partial SVE mode using things like
5204 __attribute__((vector_size(N))), and this is potentially useful.
5205 However, the choice of mode doesn't affect the type's ABI identity,
5206 so we should treat the types as though they had the associated
5207 integer mode, just like they did before SVE was introduced.
5208
5209 We know that the vector must be 128 bits or smaller, otherwise we'd
5210 have passed it by reference instead. */
5211 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5212 if ((vec_flags & VEC_ANY_SVE) && (vec_flags & VEC_PARTIAL))
5213 {
5214 function_arg_info tmp_arg = arg;
5215 tmp_arg.mode = int_mode_for_mode (mode).require ();
5216 aarch64_layout_arg (pcum_v, tmp_arg, orig_mode);
5217 if (rtx reg = pcum->aapcs_reg)
5218 {
5219 gcc_assert (REG_P (reg) && GET_MODE (reg) == tmp_arg.mode);
5220 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5221 pcum->aapcs_reg = gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5222 }
5223 return;
5224 }
5225
5226 pcum->aapcs_arg_processed = true;
5227
5228 unsigned int num_zr, num_pr;
5229 if (type && aarch64_sve_argument_p (type, &num_zr, &num_pr))
5230 {
5231 /* The PCS says that it is invalid to pass an SVE value to an
5232 unprototyped function. There is no ABI-defined location we
5233 can return in this case, so we have no real choice but to raise
5234 an error immediately, even though this is only a query function. */
5235 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5236 {
5237 gcc_assert (!pcum->silent_p);
5238 error ("SVE type %qT cannot be passed to an unprototyped function",
5239 arg.type);
5240 /* Avoid repeating the message, and avoid tripping the assert
5241 below. */
5242 pcum->pcs_variant = ARM_PCS_SVE;
5243 }
5244
5245 /* We would have converted the argument into pass-by-reference
5246 form if it didn't fit in registers. */
5247 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + num_zr;
5248 pcum->aapcs_nextnprn = pcum->aapcs_nprn + num_pr;
5249 gcc_assert (arg.named
5250 && pcum->pcs_variant == ARM_PCS_SVE
5251 && aarch64_sve_mode_p (mode)
5252 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5253 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5254
5255 if (num_zr > 0 && num_pr == 0)
5256 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + pcum->aapcs_nvrn);
5257 else if (num_zr == 0 && num_pr == 1)
5258 pcum->aapcs_reg = gen_rtx_REG (mode, P0_REGNUM + pcum->aapcs_nprn);
5259 else
5260 gcc_unreachable ();
5261 return;
5262 }
5263
5264 /* Generic vectors that map to SVE modes with -msve-vector-bits=N are
5265 passed by reference, not by value. */
5266 gcc_assert (!aarch64_sve_mode_p (mode));
5267
5268 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
5269 if (type)
5270 size = int_size_in_bytes (type);
5271 else
5272 /* No frontends can create types with variable-sized modes, so we
5273 shouldn't be asked to pass or return them. */
5274 size = GET_MODE_SIZE (mode).to_constant ();
5275 size = ROUND_UP (size, UNITS_PER_WORD);
5276
5277 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5278 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5279 mode,
5280 type,
5281 &nregs);
5282
5283 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
5284 The following code thus handles passing by SIMD/FP registers first. */
5285
5286 nvrn = pcum->aapcs_nvrn;
5287
5288 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
5289 and homogeneous short-vector aggregates (HVA). */
5290 if (allocate_nvrn)
5291 {
5292 if (!pcum->silent_p && !TARGET_FLOAT)
5293 aarch64_err_no_fpadvsimd (mode);
5294
5295 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5296 {
5297 pcum->aapcs_nextnvrn = nvrn + nregs;
5298 if (!aarch64_composite_type_p (type, mode))
5299 {
5300 gcc_assert (nregs == 1);
5301 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5302 }
5303 else
5304 {
5305 rtx par;
5306 int i;
5307 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5308 for (i = 0; i < nregs; i++)
5309 {
5310 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5311 V0_REGNUM + nvrn + i);
5312 rtx offset = gen_int_mode
5313 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5314 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5315 XVECEXP (par, 0, i) = tmp;
5316 }
5317 pcum->aapcs_reg = par;
5318 }
5319 return;
5320 }
5321 else
5322 {
5323 /* C.3 NSRN is set to 8. */
5324 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5325 goto on_stack;
5326 }
5327 }
5328
5329 ncrn = pcum->aapcs_ncrn;
5330 nregs = size / UNITS_PER_WORD;
5331
5332 /* C6 - C9, though the sign and zero extension semantics are
5333 handled elsewhere. This is the case where the argument fits
5334 entirely in general registers. */
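      /* Worked example (an illustration added here, not part of the
	 original comment): under rule C.8 below, a 16-byte struct whose
	 natural alignment is 16 bytes and which arrives when NGRN is odd
	 (say 1) is not split across x1/x2; NGRN is first rounded up to 2
	 and the struct is then passed in x2/x3.  */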
5335 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5336 {
5337 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5338
5339 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
5340 rounded up to the next even number. */
5341 if (nregs == 2
5342 && ncrn % 2
5343 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5344 comparison is there because for > 16 * BITS_PER_UNIT
5345 alignment nregs should be > 2 and therefore it should be
5346 passed by reference rather than value. */
5347 && (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5348 == 16 * BITS_PER_UNIT))
5349 {
5350 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5351 inform (input_location, "parameter passing for argument of type "
5352 "%qT changed in GCC 9.1", type);
5353 ++ncrn;
5354 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5355 }
5356
5357 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5358 A reg is still generated for it, but the caller should be smart
5359 enough not to use it. */
5360 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
5361 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5362 else
5363 {
5364 rtx par;
5365 int i;
5366
5367 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5368 for (i = 0; i < nregs; i++)
5369 {
5370 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
5371 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5372 GEN_INT (i * UNITS_PER_WORD));
5373 XVECEXP (par, 0, i) = tmp;
5374 }
5375 pcum->aapcs_reg = par;
5376 }
5377
5378 pcum->aapcs_nextncrn = ncrn + nregs;
5379 return;
5380 }
5381
5382 /* C.11 */
5383 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5384
5385 /* The argument is passed on the stack; record the needed number of words for
5386 this argument and align the total size if necessary. */
5387 on_stack:
5388 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5389
5390 if (aarch64_function_arg_alignment (orig_mode, type, &abi_break)
5391 == 16 * BITS_PER_UNIT)
5392 {
5393 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5394 if (pcum->aapcs_stack_size != new_size)
5395 {
5396 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5397 inform (input_location, "parameter passing for argument of type "
5398 "%qT changed in GCC 9.1", type);
5399 pcum->aapcs_stack_size = new_size;
5400 }
5401 }
5402 return;
5403 }
5404
5405 /* Implement TARGET_FUNCTION_ARG. */
5406
5407 static rtx
5408 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5409 {
5410 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5411 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5412 || pcum->pcs_variant == ARM_PCS_SIMD
5413 || pcum->pcs_variant == ARM_PCS_SVE);
5414
5415 if (arg.end_marker_p ())
5416 return gen_int_mode (pcum->pcs_variant, DImode);
5417
5418 aarch64_layout_arg (pcum_v, arg, arg.mode);
5419 return pcum->aapcs_reg;
5420 }
5421
5422 void
5423 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5424 const_tree fntype,
5425 rtx libname ATTRIBUTE_UNUSED,
5426 const_tree fndecl ATTRIBUTE_UNUSED,
5427 unsigned n_named ATTRIBUTE_UNUSED,
5428 bool silent_p)
5429 {
5430 pcum->aapcs_ncrn = 0;
5431 pcum->aapcs_nvrn = 0;
5432 pcum->aapcs_nprn = 0;
5433 pcum->aapcs_nextncrn = 0;
5434 pcum->aapcs_nextnvrn = 0;
5435 pcum->aapcs_nextnprn = 0;
5436 if (fntype)
5437 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5438 else
5439 pcum->pcs_variant = ARM_PCS_AAPCS64;
5440 pcum->aapcs_reg = NULL_RTX;
5441 pcum->aapcs_arg_processed = false;
5442 pcum->aapcs_stack_words = 0;
5443 pcum->aapcs_stack_size = 0;
5444 pcum->silent_p = silent_p;
5445
5446 if (!silent_p
5447 && !TARGET_FLOAT
5448 && fndecl && TREE_PUBLIC (fndecl)
5449 && fntype && fntype != error_mark_node)
5450 {
5451 const_tree type = TREE_TYPE (fntype);
5452 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5453 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5454 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5455 &mode, &nregs, NULL))
5456 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5457 }
5458
5459 if (!silent_p
5460 && !TARGET_SVE
5461 && pcum->pcs_variant == ARM_PCS_SVE)
5462 {
5463 /* We can't gracefully recover at this point, so make this a
5464 fatal error. */
5465 if (fndecl)
5466 fatal_error (input_location, "%qE requires the SVE ISA extension",
5467 fndecl);
5468 else
5469 fatal_error (input_location, "calls to functions of type %qT require"
5470 " the SVE ISA extension", fntype);
5471 }
5472 }
5473
5474 static void
5475 aarch64_function_arg_advance (cumulative_args_t pcum_v,
5476 const function_arg_info &arg)
5477 {
5478 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5479 if (pcum->pcs_variant == ARM_PCS_AAPCS64
5480 || pcum->pcs_variant == ARM_PCS_SIMD
5481 || pcum->pcs_variant == ARM_PCS_SVE)
5482 {
5483 aarch64_layout_arg (pcum_v, arg, arg.mode);
5484 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
5485 != (pcum->aapcs_stack_words != 0));
5486 pcum->aapcs_arg_processed = false;
5487 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
5488 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
5489 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
5490 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
5491 pcum->aapcs_stack_words = 0;
5492 pcum->aapcs_reg = NULL_RTX;
5493 }
5494 }
5495
5496 bool
5497 aarch64_function_arg_regno_p (unsigned regno)
5498 {
5499 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
5500 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
5501 }
5502
5503 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
5504 PARM_BOUNDARY bits of alignment, but will be given anything up
5505 to STACK_BOUNDARY bits if the type requires it. This makes sure
5506 that both before and after the layout of each argument, the Next
5507 Stacked Argument Address (NSAA) will have a minimum alignment of
5508 8 bytes. */
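/* For instance (illustrative figures, assuming the usual AArch64 values
   of PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128): a plain 'int'
   argument still gets a 64-bit slot, a 16-byte-aligned aggregate gets
   128 bits, and no argument is ever given more than 128 bits.  */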
5509
5510 static unsigned int
5511 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
5512 {
5513 bool abi_break;
5514 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
5515 &abi_break);
5516 if (abi_break && warn_psabi)
5517 inform (input_location, "parameter passing for argument of type "
5518 "%qT changed in GCC 9.1", type);
5519
5520 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
5521 }
5522
5523 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
5524
5525 static fixed_size_mode
5526 aarch64_get_reg_raw_mode (int regno)
5527 {
5528 if (TARGET_SVE && FP_REGNUM_P (regno))
5529 /* Don't use the SVE part of the register for __builtin_apply and
5530 __builtin_return. The SVE registers aren't used by the normal PCS,
5531 so using them there would be a waste of time. The PCS extensions
5532 for SVE types are fundamentally incompatible with the
5533 __builtin_return/__builtin_apply interface. */
5534 return as_a <fixed_size_mode> (V16QImode);
5535 return default_get_reg_raw_mode (regno);
5536 }
5537
5538 /* Implement TARGET_FUNCTION_ARG_PADDING.
5539
5540 Small aggregate types are placed in the lowest memory address.
5541
5542 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
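/* For example (illustrative): on a big-endian target a 'char' argument
   passed in memory ends up in the highest-addressed byte of its stack
   slot (PAD_DOWNWARD), whereas a 3-byte struct keeps its data at the
   lowest addresses of the slot and is padded above it (PAD_UPWARD).  */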
5543
5544 static pad_direction
5545 aarch64_function_arg_padding (machine_mode mode, const_tree type)
5546 {
5547 /* On little-endian targets, the least significant byte of every stack
5548 argument is passed at the lowest byte address of the stack slot. */
5549 if (!BYTES_BIG_ENDIAN)
5550 return PAD_UPWARD;
5551
5552 /* Otherwise, integral, floating-point and pointer types are padded downward:
5553 the least significant byte of a stack argument is passed at the highest
5554 byte address of the stack slot. */
5555 if (type
5556 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
5557 || POINTER_TYPE_P (type))
5558 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
5559 return PAD_DOWNWARD;
5560
5561 /* Everything else padded upward, i.e. data in first byte of stack slot. */
5562 return PAD_UPWARD;
5563 }
5564
5565 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
5566
5567 It specifies padding for the last (may also be the only)
5568 element of a block move between registers and memory.  Assuming
5569 the block is in memory, padding upward means that the last element
5570 is padded after its most significant byte, while with downward
5571 padding the last element is padded on its least significant byte
5572 side.
5573
5574 Small aggregates and small complex types are always padded
5575 upwards.
5576
5577 We don't need to worry about homogeneous floating-point or
5578 short-vector aggregates; their move is not affected by the
5579 padding direction determined here. Regardless of endianness,
5580 each element of such an aggregate is put in the least
5581 significant bits of a fp/simd register.
5582
5583 Return !BYTES_BIG_ENDIAN if the least significant byte of the
5584 register has useful data, and return the opposite if the most
5585 significant byte does. */
5586
5587 bool
5588 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
5589 bool first ATTRIBUTE_UNUSED)
5590 {
5591
5592 /* Small composite types are always padded upward. */
5593 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
5594 {
5595 HOST_WIDE_INT size;
5596 if (type)
5597 size = int_size_in_bytes (type);
5598 else
5599 /* No frontends can create types with variable-sized modes, so we
5600 shouldn't be asked to pass or return them. */
5601 size = GET_MODE_SIZE (mode).to_constant ();
5602 if (size < 2 * UNITS_PER_WORD)
5603 return true;
5604 }
5605
5606 /* Otherwise, use the default padding. */
5607 return !BYTES_BIG_ENDIAN;
5608 }
5609
5610 static scalar_int_mode
5611 aarch64_libgcc_cmp_return_mode (void)
5612 {
5613 return SImode;
5614 }
5615
5616 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
5617
5618 /* We use the 12-bit shifted immediate arithmetic instructions so values
5619 must be a multiple of (1 << 12), i.e. 4096. */
5620 #define ARITH_FACTOR 4096
5621
5622 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
5623 #error Cannot use simple address calculation for stack probing
5624 #endif
5625
5626 /* The pair of scratch registers used for stack probing. */
5627 #define PROBE_STACK_FIRST_REG R9_REGNUM
5628 #define PROBE_STACK_SECOND_REG R10_REGNUM
5629
5630 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
5631 inclusive. These are offsets from the current stack pointer. */
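/* A rough sketch of the simplest case below (illustrative only): for a
   constant SIZE <= PROBE_INTERVAL, with BASE = ROUND_UP (SIZE,
   ARITH_FACTOR), the emitted code is approximately

	sub	x9, sp, #(FIRST + BASE)
	str	xzr, [x9, #(BASE - SIZE)]

   i.e. a single probe at the lowest address of the range.  */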
5632
5633 static void
5634 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
5635 {
5636 HOST_WIDE_INT size;
5637 if (!poly_size.is_constant (&size))
5638 {
5639 sorry ("stack probes for SVE frames");
5640 return;
5641 }
5642
5643 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
5644
5645 /* See the same assertion on PROBE_INTERVAL above. */
5646 gcc_assert ((first % ARITH_FACTOR) == 0);
5647
5648 /* See if we have a constant small number of probes to generate. If so,
5649 that's the easy case. */
5650 if (size <= PROBE_INTERVAL)
5651 {
5652 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
5653
5654 emit_set_insn (reg1,
5655 plus_constant (Pmode,
5656 stack_pointer_rtx, -(first + base)));
5657 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
5658 }
5659
5660 /* The run-time loop is made up of 8 insns in the generic case while the
5661 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
5662 else if (size <= 4 * PROBE_INTERVAL)
5663 {
5664 HOST_WIDE_INT i, rem;
5665
5666 emit_set_insn (reg1,
5667 plus_constant (Pmode,
5668 stack_pointer_rtx,
5669 -(first + PROBE_INTERVAL)));
5670 emit_stack_probe (reg1);
5671
5672 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
5673 it exceeds SIZE. If only two probes are needed, this will not
5674 generate any code. Then probe at FIRST + SIZE. */
5675 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
5676 {
5677 emit_set_insn (reg1,
5678 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
5679 emit_stack_probe (reg1);
5680 }
5681
5682 rem = size - (i - PROBE_INTERVAL);
5683 if (rem > 256)
5684 {
5685 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5686
5687 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
5688 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
5689 }
5690 else
5691 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
5692 }
5693
5694 /* Otherwise, do the same as above, but in a loop. Note that we must be
5695 extra careful with variables wrapping around because we might be at
5696 the very top (or the very bottom) of the address space and we have
5697 to be able to handle this case properly; in particular, we use an
5698 equality test for the loop condition. */
5699 else
5700 {
5701 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
5702
5703 /* Step 1: round SIZE to the previous multiple of the interval. */
5704
5705 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
5706
5707
5708 /* Step 2: compute initial and final value of the loop counter. */
5709
5710 /* TEST_ADDR = SP + FIRST. */
5711 emit_set_insn (reg1,
5712 plus_constant (Pmode, stack_pointer_rtx, -first));
5713
5714 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
5715 HOST_WIDE_INT adjustment = - (first + rounded_size);
5716 if (! aarch64_uimm12_shift (adjustment))
5717 {
5718 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
5719 true, Pmode);
5720 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
5721 }
5722 else
5723 emit_set_insn (reg2,
5724 plus_constant (Pmode, stack_pointer_rtx, adjustment));
5725
5726 /* Step 3: the loop
5727
5728 do
5729 {
5730 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
5731 probe at TEST_ADDR
5732 }
5733 while (TEST_ADDR != LAST_ADDR)
5734
5735 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
5736 until it is equal to ROUNDED_SIZE. */
5737
5738 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
5739
5740
5741 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
5742 that SIZE is equal to ROUNDED_SIZE. */
5743
5744 if (size != rounded_size)
5745 {
5746 HOST_WIDE_INT rem = size - rounded_size;
5747
5748 if (rem > 256)
5749 {
5750 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
5751
5752 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
5753 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
5754 }
5755 else
5756 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
5757 }
5758 }
5759
5760 /* Make sure nothing is scheduled before we are done. */
5761 emit_insn (gen_blockage ());
5762 }
5763
5764 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
5765 absolute addresses. */
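/* A rough sketch of the emitted loop (illustrative only; x9 and x10
   stand for REG1 and REG2, the interval is PROBE_INTERVAL or the guard
   size under -fstack-clash-protection, and the probe offset is 0 or
   STACK_CLASH_CALLER_GUARD respectively):

	.LPSRL0:
		sub	x9, x9, <interval>
		str	xzr, [x9, <offset>]
		cmp	x9, x10
		b.ne	.LPSRL0  */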
5766
5767 const char *
5768 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
5769 {
5770 static int labelno = 0;
5771 char loop_lab[32];
5772 rtx xops[2];
5773
5774 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
5775
5776 /* Loop. */
5777 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
5778
5779 HOST_WIDE_INT stack_clash_probe_interval
5780 = 1 << param_stack_clash_protection_guard_size;
5781
5782 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
5783 xops[0] = reg1;
5784 HOST_WIDE_INT interval;
5785 if (flag_stack_clash_protection)
5786 interval = stack_clash_probe_interval;
5787 else
5788 interval = PROBE_INTERVAL;
5789
5790 gcc_assert (aarch64_uimm12_shift (interval));
5791 xops[1] = GEN_INT (interval);
5792
5793 output_asm_insn ("sub\t%0, %0, %1", xops);
5794
5795 /* If doing stack clash protection then we probe up by the ABI specified
5796 amount. We do this because we're dropping full pages at a time in the
5797 loop. But if we're doing non-stack clash probing, probe at SP 0. */
5798 if (flag_stack_clash_protection)
5799 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
5800 else
5801 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
5802
5803 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
5804 by this amount for each iteration. */
5805 output_asm_insn ("str\txzr, [%0, %1]", xops);
5806
5807 /* Test if TEST_ADDR == LAST_ADDR. */
5808 xops[1] = reg2;
5809 output_asm_insn ("cmp\t%0, %1", xops);
5810
5811 /* Branch. */
5812 fputs ("\tb.ne\t", asm_out_file);
5813 assemble_name_raw (asm_out_file, loop_lab);
5814 fputc ('\n', asm_out_file);
5815
5816 return "";
5817 }
5818
5819 /* Emit the probe loop for doing stack clash probes and stack adjustments for
5820 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
5821 of GUARD_SIZE.  Each probe is emitted at most MIN_PROBE_THRESHOLD
5822 bytes from the current BASE, and probes are at intervals of at most
5823 MIN_PROBE_THRESHOLD bytes.  By the end of this function
5824 BASE = BASE - ADJUSTMENT. */
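/* A rough sketch of the output (illustrative only; BASE, ADJUSTMENT and
   RESIDUAL_PROBE_GUARD stand for the actual operands):

	.SVLPSPL0:
		cmp	ADJUSTMENT, RESIDUAL_PROBE_GUARD
		b.lt	.SVLPEND0
		sub	BASE, BASE, RESIDUAL_PROBE_GUARD
		str	xzr, [BASE, 0]
		sub	ADJUSTMENT, ADJUSTMENT, RESIDUAL_PROBE_GUARD
		b	.SVLPSPL0
	.SVLPEND0:
		sub	BASE, BASE, ADJUSTMENT  */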
5825
5826 const char *
5827 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
5828 rtx min_probe_threshold, rtx guard_size)
5829 {
5830 /* This function is not allowed to use any instruction generation function
5831 like gen_ and friends. If you do you'll likely ICE during CFG validation,
5832 so instead emit the code you want using output_asm_insn. */
5833 gcc_assert (flag_stack_clash_protection);
5834 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
5835 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
5836
5837 /* The minimum required allocation before the residual requires probing. */
5838 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
5839
5840 /* Clamp the value down to the nearest value that can be used with a cmp. */
5841 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
5842 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
5843
5844 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
5845 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
5846
5847 static int labelno = 0;
5848 char loop_start_lab[32];
5849 char loop_end_lab[32];
5850 rtx xops[2];
5851
5852 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
5853 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
5854
5855 /* Emit loop start label. */
5856 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
5857
5858 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
5859 xops[0] = adjustment;
5860 xops[1] = probe_offset_value_rtx;
5861 output_asm_insn ("cmp\t%0, %1", xops);
5862
5863 /* Branch to end if not enough adjustment to probe. */
5864 fputs ("\tb.lt\t", asm_out_file);
5865 assemble_name_raw (asm_out_file, loop_end_lab);
5866 fputc ('\n', asm_out_file);
5867
5868 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
5869 xops[0] = base;
5870 xops[1] = probe_offset_value_rtx;
5871 output_asm_insn ("sub\t%0, %0, %1", xops);
5872
5873 /* Probe at BASE. */
5874 xops[1] = const0_rtx;
5875 output_asm_insn ("str\txzr, [%0, %1]", xops);
5876
5877 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
5878 xops[0] = adjustment;
5879 xops[1] = probe_offset_value_rtx;
5880 output_asm_insn ("sub\t%0, %0, %1", xops);
5881
5882 /* Branch to start if still more bytes to allocate. */
5883 fputs ("\tb\t", asm_out_file);
5884 assemble_name_raw (asm_out_file, loop_start_lab);
5885 fputc ('\n', asm_out_file);
5886
5887 /* Nothing left to probe; emit the loop exit label. */
5888 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
5889
5890 /* BASE = BASE - ADJUSTMENT. */
5891 xops[0] = base;
5892 xops[1] = adjustment;
5893 output_asm_insn ("sub\t%0, %0, %1", xops);
5894 return "";
5895 }
5896
5897 /* Determine whether a frame chain needs to be generated. */
5898 static bool
5899 aarch64_needs_frame_chain (void)
5900 {
5901 /* Force a frame chain for EH returns so the return address is at FP+8. */
5902 if (frame_pointer_needed || crtl->calls_eh_return)
5903 return true;
5904
5905 /* A leaf function cannot have calls or write LR. */
5906 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
5907
5908 /* Don't use a frame chain in leaf functions if leaf frame pointers
5909 are disabled. */
5910 if (flag_omit_leaf_frame_pointer && is_leaf)
5911 return false;
5912
5913 return aarch64_use_frame_pointer;
5914 }
5915
5916 /* Mark the registers that need to be saved by the callee and calculate
5917 the size of the callee-saved registers area and frame record (both FP
5918 and LR may be omitted). */
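/* In rough outline, the slots are assigned bottom-up in this order (a
   summary of the code below rather than an ABI statement): predicate
   saves, then any SVE vector saves (together the "below hard frame
   pointer" area), then the FP/LR frame record when a frame chain is
   needed, then the remaining general registers, and finally the
   FP/SIMD saves, with alignment padding inserted between groups as
   required.  */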
5919 static void
5920 aarch64_layout_frame (void)
5921 {
5922 poly_int64 offset = 0;
5923 int regno, last_fp_reg = INVALID_REGNUM;
5924 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
5925 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
5926 bool frame_related_fp_reg_p = false;
5927 aarch64_frame &frame = cfun->machine->frame;
5928
5929 frame.emit_frame_chain = aarch64_needs_frame_chain ();
5930
5931 /* Adjust the outgoing arguments size if required. Keep it in sync with what
5932 the mid-end is doing. */
5933 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
5934
5935 #define SLOT_NOT_REQUIRED (-2)
5936 #define SLOT_REQUIRED (-1)
5937
5938 frame.wb_candidate1 = INVALID_REGNUM;
5939 frame.wb_candidate2 = INVALID_REGNUM;
5940 frame.spare_pred_reg = INVALID_REGNUM;
5941
5942 /* First mark all the registers that really need to be saved... */
5943 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5944 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
5945
5946 /* ... that includes the eh data registers (if needed)... */
5947 if (crtl->calls_eh_return)
5948 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
5949 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
5950
5951 /* ... and any callee saved register that dataflow says is live. */
5952 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
5953 if (df_regs_ever_live_p (regno)
5954 && !fixed_regs[regno]
5955 && (regno == R30_REGNUM
5956 || !crtl->abi->clobbers_full_reg_p (regno)))
5957 frame.reg_offset[regno] = SLOT_REQUIRED;
5958
5959 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
5960 if (df_regs_ever_live_p (regno)
5961 && !fixed_regs[regno]
5962 && !crtl->abi->clobbers_full_reg_p (regno))
5963 {
5964 frame.reg_offset[regno] = SLOT_REQUIRED;
5965 last_fp_reg = regno;
5966 if (aarch64_emit_cfi_for_reg_p (regno))
5967 frame_related_fp_reg_p = true;
5968 }
5969
5970 /* Big-endian SVE frames need a spare predicate register in order
5971 to save Z8-Z15. Decide which register they should use. Prefer
5972 an unused argument register if possible, so that we don't force P4
5973 to be saved unnecessarily. */
5974 if (frame_related_fp_reg_p
5975 && crtl->abi->id () == ARM_PCS_SVE
5976 && BYTES_BIG_ENDIAN)
5977 {
5978 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5979 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
5980 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
5981 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
5982 break;
5983 gcc_assert (regno <= P7_REGNUM);
5984 frame.spare_pred_reg = regno;
5985 df_set_regs_ever_live (regno, true);
5986 }
5987
5988 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
5989 if (df_regs_ever_live_p (regno)
5990 && !fixed_regs[regno]
5991 && !crtl->abi->clobbers_full_reg_p (regno))
5992 frame.reg_offset[regno] = SLOT_REQUIRED;
5993
5994 /* With stack-clash, LR must be saved in non-leaf functions. */
5995 gcc_assert (crtl->is_leaf
5996 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
5997
5998 /* Now assign stack slots for the registers. Start with the predicate
5999 registers, since predicate LDR and STR have a relatively small
6000 offset range. These saves happen below the hard frame pointer. */
6001 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6002 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6003 {
6004 frame.reg_offset[regno] = offset;
6005 offset += BYTES_PER_SVE_PRED;
6006 }
6007
6008 /* We save a maximum of 8 predicate registers, and since vector
6009 registers are 8 times the size of a predicate register, all the
6010 saved predicates fit within a single vector. Doing this also
6011 rounds the offset to a 128-bit boundary. */
6012 if (maybe_ne (offset, 0))
6013 {
6014 gcc_assert (known_le (offset, vector_save_size));
6015 offset = vector_save_size;
6016 }
6017
6018 /* If we need to save any SVE vector registers, add them next. */
6019 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6020 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6021 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6022 {
6023 frame.reg_offset[regno] = offset;
6024 offset += vector_save_size;
6025 }
6026
6027 /* OFFSET is now the offset of the hard frame pointer from the bottom
6028 of the callee save area. */
6029 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6030 frame.below_hard_fp_saved_regs_size = offset;
6031 if (frame.emit_frame_chain)
6032 {
6033 /* FP and LR are placed in the linkage record. */
6034 frame.reg_offset[R29_REGNUM] = offset;
6035 frame.wb_candidate1 = R29_REGNUM;
6036 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6037 frame.wb_candidate2 = R30_REGNUM;
6038 offset += 2 * UNITS_PER_WORD;
6039 }
6040
6041 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6042 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6043 {
6044 frame.reg_offset[regno] = offset;
6045 if (frame.wb_candidate1 == INVALID_REGNUM)
6046 frame.wb_candidate1 = regno;
6047 else if (frame.wb_candidate2 == INVALID_REGNUM)
6048 frame.wb_candidate2 = regno;
6049 offset += UNITS_PER_WORD;
6050 }
6051
6052 poly_int64 max_int_offset = offset;
6053 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6054 bool has_align_gap = maybe_ne (offset, max_int_offset);
6055
6056 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6057 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6058 {
6059 /* If there is an alignment gap between integer and fp callee-saves,
6060 allocate the last fp register to it if possible. */
6061 if (regno == last_fp_reg
6062 && has_align_gap
6063 && known_eq (vector_save_size, 8)
6064 && multiple_p (offset, 16))
6065 {
6066 frame.reg_offset[regno] = max_int_offset;
6067 break;
6068 }
6069
6070 frame.reg_offset[regno] = offset;
6071 if (frame.wb_candidate1 == INVALID_REGNUM)
6072 frame.wb_candidate1 = regno;
6073 else if (frame.wb_candidate2 == INVALID_REGNUM
6074 && frame.wb_candidate1 >= V0_REGNUM)
6075 frame.wb_candidate2 = regno;
6076 offset += vector_save_size;
6077 }
6078
6079 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6080
6081 frame.saved_regs_size = offset;
6082
6083 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6084
6085 poly_int64 above_outgoing_args
6086 = aligned_upper_bound (varargs_and_saved_regs_size
6087 + get_frame_size (),
6088 STACK_BOUNDARY / BITS_PER_UNIT);
6089
6090 frame.hard_fp_offset
6091 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6092
6093 /* Both these values are already aligned. */
6094 gcc_assert (multiple_p (crtl->outgoing_args_size,
6095 STACK_BOUNDARY / BITS_PER_UNIT));
6096 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6097
6098 frame.locals_offset = frame.saved_varargs_size;
6099
6100 frame.initial_adjust = 0;
6101 frame.final_adjust = 0;
6102 frame.callee_adjust = 0;
6103 frame.sve_callee_adjust = 0;
6104 frame.callee_offset = 0;
6105
6106 HOST_WIDE_INT max_push_offset = 0;
6107 if (frame.wb_candidate2 != INVALID_REGNUM)
6108 max_push_offset = 512;
6109 else if (frame.wb_candidate1 != INVALID_REGNUM)
6110 max_push_offset = 256;
6111
6112 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6113 HOST_WIDE_INT const_saved_regs_size;
6114 if (frame.frame_size.is_constant (&const_size)
6115 && const_size < max_push_offset
6116 && known_eq (frame.hard_fp_offset, const_size))
6117 {
6118 /* Simple, small frame with no outgoing arguments:
6119
6120 stp reg1, reg2, [sp, -frame_size]!
6121 stp reg3, reg4, [sp, 16] */
6122 frame.callee_adjust = const_size;
6123 }
6124 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6125 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6126 && const_outgoing_args_size + const_saved_regs_size < 512
6127 /* We could handle this case even with outgoing args, provided
6128 that the number of args left us with valid offsets for all
6129 predicate and vector save slots. It's such a rare case that
6130 it hardly seems worth the effort though. */
6131 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6132 && !(cfun->calls_alloca
6133 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6134 && const_fp_offset < max_push_offset))
6135 {
6136 /* Frame with small outgoing arguments:
6137
6138 sub sp, sp, frame_size
6139 stp reg1, reg2, [sp, outgoing_args_size]
6140 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6141 frame.initial_adjust = frame.frame_size;
6142 frame.callee_offset = const_outgoing_args_size;
6143 }
6144 else if (saves_below_hard_fp_p
6145 && known_eq (frame.saved_regs_size,
6146 frame.below_hard_fp_saved_regs_size))
6147 {
6148 /* Frame in which all saves are SVE saves:
6149
6150 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6151 save SVE registers relative to SP
6152 sub sp, sp, outgoing_args_size */
6153 frame.initial_adjust = (frame.hard_fp_offset
6154 + frame.below_hard_fp_saved_regs_size);
6155 frame.final_adjust = crtl->outgoing_args_size;
6156 }
6157 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6158 && const_fp_offset < max_push_offset)
6159 {
6160 /* Frame with large outgoing arguments or SVE saves, but with
6161 a small local area:
6162
6163 stp reg1, reg2, [sp, -hard_fp_offset]!
6164 stp reg3, reg4, [sp, 16]
6165 [sub sp, sp, below_hard_fp_saved_regs_size]
6166 [save SVE registers relative to SP]
6167 sub sp, sp, outgoing_args_size */
6168 frame.callee_adjust = const_fp_offset;
6169 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6170 frame.final_adjust = crtl->outgoing_args_size;
6171 }
6172 else
6173 {
6174 /* Frame with large local area and outgoing arguments or SVE saves,
6175 using frame pointer:
6176
6177 sub sp, sp, hard_fp_offset
6178 stp x29, x30, [sp, 0]
6179 add x29, sp, 0
6180 stp reg3, reg4, [sp, 16]
6181 [sub sp, sp, below_hard_fp_saved_regs_size]
6182 [save SVE registers relative to SP]
6183 sub sp, sp, outgoing_args_size */
6184 frame.initial_adjust = frame.hard_fp_offset;
6185 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6186 frame.final_adjust = crtl->outgoing_args_size;
6187 }
6188
6189 /* Make sure the individual adjustments add up to the full frame size. */
6190 gcc_assert (known_eq (frame.initial_adjust
6191 + frame.callee_adjust
6192 + frame.sve_callee_adjust
6193 + frame.final_adjust, frame.frame_size));
6194
6195 frame.laid_out = true;
6196 }
6197
6198 /* Return true if the register REGNO is saved on entry to
6199 the current function. */
6200
6201 static bool
6202 aarch64_register_saved_on_entry (int regno)
6203 {
6204 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6205 }
6206
6207 /* Return the next register, from REGNO up to LIMIT, that the callee
6208 needs to save. */
6209
6210 static unsigned
6211 aarch64_next_callee_save (unsigned regno, unsigned limit)
6212 {
6213 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6214 regno++;
6215 return regno;
6216 }
6217
6218 /* Push the register number REGNO of mode MODE to the stack with write-back
6219 adjusting the stack by ADJUSTMENT. */
6220
6221 static void
6222 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6223 HOST_WIDE_INT adjustment)
6224 {
6225 rtx base_rtx = stack_pointer_rtx;
6226 rtx insn, reg, mem;
6227
6228 reg = gen_rtx_REG (mode, regno);
6229 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6230 plus_constant (Pmode, base_rtx, -adjustment));
6231 mem = gen_frame_mem (mode, mem);
6232
6233 insn = emit_move_insn (mem, reg);
6234 RTX_FRAME_RELATED_P (insn) = 1;
6235 }
6236
6237 /* Generate and return an instruction to store the pair of registers
6238 REG and REG2 of mode MODE to location BASE with write-back adjusting
6239 the stack location BASE by ADJUSTMENT. */
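/* For the DImode case this ultimately becomes a pre-indexed store pair,
   e.g. (illustrative) "stp x29, x30, [sp, #-ADJUSTMENT]!" when BASE is
   the stack pointer.  */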
6240
6241 static rtx
6242 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6243 HOST_WIDE_INT adjustment)
6244 {
6245 switch (mode)
6246 {
6247 case E_DImode:
6248 return gen_storewb_pairdi_di (base, base, reg, reg2,
6249 GEN_INT (-adjustment),
6250 GEN_INT (UNITS_PER_WORD - adjustment));
6251 case E_DFmode:
6252 return gen_storewb_pairdf_di (base, base, reg, reg2,
6253 GEN_INT (-adjustment),
6254 GEN_INT (UNITS_PER_WORD - adjustment));
6255 case E_TFmode:
6256 return gen_storewb_pairtf_di (base, base, reg, reg2,
6257 GEN_INT (-adjustment),
6258 GEN_INT (UNITS_PER_VREG - adjustment));
6259 default:
6260 gcc_unreachable ();
6261 }
6262 }
6263
6264 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6265 stack pointer by ADJUSTMENT. */
6266
6267 static void
6268 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6269 {
6270 rtx_insn *insn;
6271 machine_mode mode = aarch64_reg_save_mode (regno1);
6272
6273 if (regno2 == INVALID_REGNUM)
6274 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6275
6276 rtx reg1 = gen_rtx_REG (mode, regno1);
6277 rtx reg2 = gen_rtx_REG (mode, regno2);
6278
6279 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6280 reg2, adjustment));
6281 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6283 RTX_FRAME_RELATED_P (insn) = 1;
6284 }
6285
6286 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
6287 adjusting it by ADJUSTMENT afterwards. */
6288
6289 static rtx
6290 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6291 HOST_WIDE_INT adjustment)
6292 {
6293 switch (mode)
6294 {
6295 case E_DImode:
6296 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6297 GEN_INT (UNITS_PER_WORD));
6298 case E_DFmode:
6299 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6300 GEN_INT (UNITS_PER_WORD));
6301 case E_TFmode:
6302 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6303 GEN_INT (UNITS_PER_VREG));
6304 default:
6305 gcc_unreachable ();
6306 }
6307 }
6308
6309 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6310 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6311 into CFI_OPS. */
6312
6313 static void
6314 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6315 rtx *cfi_ops)
6316 {
6317 machine_mode mode = aarch64_reg_save_mode (regno1);
6318 rtx reg1 = gen_rtx_REG (mode, regno1);
6319
6320 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6321
6322 if (regno2 == INVALID_REGNUM)
6323 {
6324 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6325 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6326 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6327 }
6328 else
6329 {
6330 rtx reg2 = gen_rtx_REG (mode, regno2);
6331 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6332 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6333 reg2, adjustment));
6334 }
6335 }
6336
6337 /* Generate and return a store pair instruction of mode MODE to store
6338 register REG1 to MEM1 and register REG2 to MEM2. */
6339
6340 static rtx
6341 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6342 rtx reg2)
6343 {
6344 switch (mode)
6345 {
6346 case E_DImode:
6347 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6348
6349 case E_DFmode:
6350 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6351
6352 case E_TFmode:
6353 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6354
6355 default:
6356 gcc_unreachable ();
6357 }
6358 }
6359
6360 /* Generate and return a load pair instruction of mode MODE to load register
6361 REG1 from MEM1 and register REG2 from MEM2. */
6362
6363 static rtx
6364 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6365 rtx mem2)
6366 {
6367 switch (mode)
6368 {
6369 case E_DImode:
6370 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6371
6372 case E_DFmode:
6373 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6374
6375 case E_TFmode:
6376 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6377
6378 default:
6379 gcc_unreachable ();
6380 }
6381 }
6382
6383 /* Return TRUE if return address signing should be enabled for the current
6384 function, otherwise return FALSE. */
6385
6386 bool
6387 aarch64_return_address_signing_enabled (void)
6388 {
6389 /* This function should only be called after the frame is laid out. */
6390 gcc_assert (cfun->machine->frame.laid_out);
6391
6392 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6393 if its LR is pushed onto the stack. */
6394 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6395 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6396 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6397 }
6398
6399 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6400 bool
6401 aarch64_bti_enabled (void)
6402 {
6403 return (aarch64_enable_bti == 1);
6404 }
6405
6406 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6407 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6408 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6409
6410 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6411 or LD1D address
6412
6413 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
6414 if the variable isn't already nonnull
6415
6416 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6417 Handle this case using a temporary base register that is suitable for
6418 all offsets in that range. Use ANCHOR_REG as this base register if it
6419 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
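/* For example (illustrative): ST1D/LD1D accept a signed immediate of
   -8..7 vector-length multiples, so an offset of 12 * GET_MODE_SIZE (MODE)
   is handled by pointing ANCHOR_REG at BASE + 16 * GET_MODE_SIZE (MODE)
   and addressing -4 vectors from there.  */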
6420
6421 static inline void
6422 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
6423 rtx &anchor_reg, poly_int64 &offset,
6424 rtx &ptrue)
6425 {
6426 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
6427 {
6428 /* This is the maximum valid offset of the anchor from the base.
6429 Lower values would be valid too. */
6430 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
6431 if (!anchor_reg)
6432 {
6433 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6434 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6435 gen_int_mode (anchor_offset, Pmode)));
6436 }
6437 base_rtx = anchor_reg;
6438 offset -= anchor_offset;
6439 }
6440 if (!ptrue)
6441 {
6442 int pred_reg = cfun->machine->frame.spare_pred_reg;
6443 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
6444 CONSTM1_RTX (VNx16BImode));
6445 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
6446 }
6447 }
6448
6449 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
6450 is saved at BASE + OFFSET. */
6451
6452 static void
6453 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
6454 rtx base, poly_int64 offset)
6455 {
6456 rtx mem = gen_frame_mem (GET_MODE (reg),
6457 plus_constant (Pmode, base, offset));
6458 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
6459 }
6460
6461 /* Emit code to save the callee-saved registers from register number START
6462 to LIMIT to the stack at the location starting at offset START_OFFSET,
6463 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
6464 is true if the hard frame pointer has been set up. */
6465
6466 static void
6467 aarch64_save_callee_saves (poly_int64 start_offset,
6468 unsigned start, unsigned limit, bool skip_wb,
6469 bool hard_fp_valid_p)
6470 {
6471 rtx_insn *insn;
6472 unsigned regno;
6473 unsigned regno2;
6474 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6475
6476 for (regno = aarch64_next_callee_save (start, limit);
6477 regno <= limit;
6478 regno = aarch64_next_callee_save (regno + 1, limit))
6479 {
6480 rtx reg, mem;
6481 poly_int64 offset;
6482 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6483
6484 if (skip_wb
6485 && (regno == cfun->machine->frame.wb_candidate1
6486 || regno == cfun->machine->frame.wb_candidate2))
6487 continue;
6488
6489 if (cfun->machine->reg_is_wrapped_separately[regno])
6490 continue;
6491
6492 machine_mode mode = aarch64_reg_save_mode (regno);
6493 reg = gen_rtx_REG (mode, regno);
6494 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6495 rtx base_rtx = stack_pointer_rtx;
6496 poly_int64 sp_offset = offset;
6497
6498 HOST_WIDE_INT const_offset;
6499 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6500 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6501 offset, ptrue);
6502 else if (GP_REGNUM_P (regno)
6503 && (!offset.is_constant (&const_offset) || const_offset >= 512))
6504 {
6505 gcc_assert (known_eq (start_offset, 0));
6506 poly_int64 fp_offset
6507 = cfun->machine->frame.below_hard_fp_saved_regs_size;
6508 if (hard_fp_valid_p)
6509 base_rtx = hard_frame_pointer_rtx;
6510 else
6511 {
6512 if (!anchor_reg)
6513 {
6514 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
6515 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
6516 gen_int_mode (fp_offset, Pmode)));
6517 }
6518 base_rtx = anchor_reg;
6519 }
6520 offset -= fp_offset;
6521 }
6522 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6523 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
6524
6525 if (!aarch64_sve_mode_p (mode)
6526 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6527 && !cfun->machine->reg_is_wrapped_separately[regno2]
6528 && known_eq (GET_MODE_SIZE (mode),
6529 cfun->machine->frame.reg_offset[regno2]
6530 - cfun->machine->frame.reg_offset[regno]))
6531 {
6532 rtx reg2 = gen_rtx_REG (mode, regno2);
6533 rtx mem2;
6534
6535 offset += GET_MODE_SIZE (mode);
6536 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6537 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
6538 reg2));
6539
6540 /* The first part of a frame-related parallel insn is
6541 always assumed to be relevant to the frame
6542 calculations; subsequent parts are only
6543 frame-related if explicitly marked. */
6544 if (aarch64_emit_cfi_for_reg_p (regno2))
6545 {
6546 if (need_cfa_note_p)
6547 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
6548 sp_offset + GET_MODE_SIZE (mode));
6549 else
6550 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6551 }
6552
6553 regno = regno2;
6554 }
6555 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6556 {
6557 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
6558 need_cfa_note_p = true;
6559 }
6560 else if (aarch64_sve_mode_p (mode))
6561 insn = emit_insn (gen_rtx_SET (mem, reg));
6562 else
6563 insn = emit_move_insn (mem, reg);
6564
6565 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6566 if (frame_related_p && need_cfa_note_p)
6567 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
6568 }
6569 }
6570
6571 /* Emit code to restore the callee registers from register number START
6572 up to and including LIMIT. Restore from the stack offset START_OFFSET,
6573 skipping any write-back candidates if SKIP_WB is true. Write the
6574 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
6575
6576 static void
6577 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
6578 unsigned limit, bool skip_wb, rtx *cfi_ops)
6579 {
6580 unsigned regno;
6581 unsigned regno2;
6582 poly_int64 offset;
6583 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
6584
6585 for (regno = aarch64_next_callee_save (start, limit);
6586 regno <= limit;
6587 regno = aarch64_next_callee_save (regno + 1, limit))
6588 {
6589 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6590 if (cfun->machine->reg_is_wrapped_separately[regno])
6591 continue;
6592
6593 rtx reg, mem;
6594
6595 if (skip_wb
6596 && (regno == cfun->machine->frame.wb_candidate1
6597 || regno == cfun->machine->frame.wb_candidate2))
6598 continue;
6599
6600 machine_mode mode = aarch64_reg_save_mode (regno);
6601 reg = gen_rtx_REG (mode, regno);
6602 offset = start_offset + cfun->machine->frame.reg_offset[regno];
6603 rtx base_rtx = stack_pointer_rtx;
6604 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6605 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
6606 offset, ptrue);
6607 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6608
6609 if (!aarch64_sve_mode_p (mode)
6610 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
6611 && !cfun->machine->reg_is_wrapped_separately[regno2]
6612 && known_eq (GET_MODE_SIZE (mode),
6613 cfun->machine->frame.reg_offset[regno2]
6614 - cfun->machine->frame.reg_offset[regno]))
6615 {
6616 rtx reg2 = gen_rtx_REG (mode, regno2);
6617 rtx mem2;
6618
6619 offset += GET_MODE_SIZE (mode);
6620 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
6621 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6622
6623 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6624 regno = regno2;
6625 }
6626 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6627 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
6628 else if (aarch64_sve_mode_p (mode))
6629 emit_insn (gen_rtx_SET (reg, mem));
6630 else
6631 emit_move_insn (reg, mem);
6632 if (frame_related_p)
6633 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
6634 }
6635 }
6636
6637 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
6638 of MODE. */
6639
6640 static inline bool
6641 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6642 {
6643 HOST_WIDE_INT multiple;
6644 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6645 && IN_RANGE (multiple, -8, 7));
6646 }
6647
6648 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
6649 of MODE. */
6650
6651 static inline bool
6652 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6653 {
6654 HOST_WIDE_INT multiple;
6655 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6656 && IN_RANGE (multiple, 0, 63));
6657 }
6658
6659 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
6660 of MODE. */
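/* For instance, with DImode this accepts multiples of 8 in the range
   [-512, 504], which is the immediate range of an LDP/STP of X
   registers.  */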
6661
6662 bool
6663 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6664 {
6665 HOST_WIDE_INT multiple;
6666 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6667 && IN_RANGE (multiple, -64, 63));
6668 }
6669
6670 /* Return true if OFFSET is a signed 9-bit value. */
6671
6672 bool
6673 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
6674 poly_int64 offset)
6675 {
6676 HOST_WIDE_INT const_offset;
6677 return (offset.is_constant (&const_offset)
6678 && IN_RANGE (const_offset, -256, 255));
6679 }
6680
6681 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
6682 of MODE. */
6683
6684 static inline bool
6685 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
6686 {
6687 HOST_WIDE_INT multiple;
6688 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6689 && IN_RANGE (multiple, -256, 255));
6690 }
6691
6692 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
6693 of MODE. */
6694
6695 static inline bool
6696 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
6697 {
6698 HOST_WIDE_INT multiple;
6699 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
6700 && IN_RANGE (multiple, 0, 4095));
6701 }
6702
6703 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
6704
6705 static sbitmap
6706 aarch64_get_separate_components (void)
6707 {
6708 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6709 bitmap_clear (components);
6710
6711 /* The registers we need saved to the frame. */
6712 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6713 if (aarch64_register_saved_on_entry (regno))
6714 {
6715 /* Punt on saves and restores that use ST1D and LD1D. We could
6716 try to be smarter, but it would involve making sure that the
6717 spare predicate register itself is safe to use at the save
6718 and restore points. Also, when a frame pointer is being used,
6719 the slots are often out of reach of ST1D and LD1D anyway. */
6720 machine_mode mode = aarch64_reg_save_mode (regno);
6721 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
6722 continue;
6723
6724 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6725
6726 /* If the register is saved in the first SVE save slot, we use
6727 it as a stack probe for -fstack-clash-protection. */
6728 if (flag_stack_clash_protection
6729 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
6730 && known_eq (offset, 0))
6731 continue;
6732
6733 /* Get the offset relative to the register we'll use. */
6734 if (frame_pointer_needed)
6735 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6736 else
6737 offset += crtl->outgoing_args_size;
6738
6739 /* Check that we can access the stack slot of the register with one
6740 direct load with no adjustments needed. */
6741 if (aarch64_sve_mode_p (mode)
6742 ? offset_9bit_signed_scaled_p (mode, offset)
6743 : offset_12bit_unsigned_scaled_p (mode, offset))
6744 bitmap_set_bit (components, regno);
6745 }
6746
6747 /* Don't mess with the hard frame pointer. */
6748 if (frame_pointer_needed)
6749 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
6750
6751 /* If the spare predicate register used by big-endian SVE code
6752 is call-preserved, it must be saved in the main prologue
6753 before any saves that use it. */
6754 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
6755 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
6756
6757 unsigned reg1 = cfun->machine->frame.wb_candidate1;
6758 unsigned reg2 = cfun->machine->frame.wb_candidate2;
6759 /* If registers have been chosen to be stored/restored with
6760 writeback don't interfere with them to avoid having to output explicit
6761 stack adjustment instructions. */
6762 if (reg2 != INVALID_REGNUM)
6763 bitmap_clear_bit (components, reg2);
6764 if (reg1 != INVALID_REGNUM)
6765 bitmap_clear_bit (components, reg1);
6766
6767 bitmap_clear_bit (components, LR_REGNUM);
6768 bitmap_clear_bit (components, SP_REGNUM);
6769
6770 return components;
6771 }
6772
6773 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
6774
6775 static sbitmap
6776 aarch64_components_for_bb (basic_block bb)
6777 {
6778 bitmap in = DF_LIVE_IN (bb);
6779 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
6780 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
6781
6782 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
6783 bitmap_clear (components);
6784
6785 /* Clobbered registers don't generate values in any meaningful sense,
6786 since nothing after the clobber can rely on their value. And we can't
6787 say that partially-clobbered registers are unconditionally killed,
6788 because whether they're killed or not depends on the mode of the
6789 value they're holding. Thus partially call-clobbered registers
6790 appear in neither the kill set nor the gen set.
6791
6792 Check manually for any calls that clobber more of a register than the
6793 current function can. */
6794 function_abi_aggregator callee_abis;
6795 rtx_insn *insn;
6796 FOR_BB_INSNS (bb, insn)
6797 if (CALL_P (insn))
6798 callee_abis.note_callee_abi (insn_callee_abi (insn));
6799 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
6800
6801 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
6802 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6803 if (!fixed_regs[regno]
6804 && !crtl->abi->clobbers_full_reg_p (regno)
6805 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
6806 || bitmap_bit_p (in, regno)
6807 || bitmap_bit_p (gen, regno)
6808 || bitmap_bit_p (kill, regno)))
6809 {
6810 bitmap_set_bit (components, regno);
6811
6812 /* If there is a callee-save at an adjacent offset, add it too
6813 to increase the use of LDP/STP. */
6814 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6815 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
6816
6817 if (regno2 <= LAST_SAVED_REGNUM)
6818 {
6819 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6820 if (regno < regno2
6821 ? known_eq (offset + 8, offset2)
6822 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
6823 bitmap_set_bit (components, regno2);
6824 }
6825 }
6826
6827 return components;
6828 }
6829
6830 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
6831 Nothing to do for aarch64. */
6832
6833 static void
6834 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
6835 {
6836 }
6837
6838 /* Return the next set bit in BMP from START onwards. Return the total number
6839 of bits in BMP if no set bit is found at or after START. */
6840
6841 static unsigned int
6842 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
6843 {
6844 unsigned int nbits = SBITMAP_SIZE (bmp);
6845 if (start == nbits)
6846 return start;
6847
6848 gcc_assert (start < nbits);
6849 for (unsigned int i = start; i < nbits; i++)
6850 if (bitmap_bit_p (bmp, i))
6851 return i;
6852
6853 return nbits;
6854 }
6855
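/* As a small illustration of the helper above: if bits 2 and 5 are the
only set bits in BMP, then aarch64_get_next_set_bit (bmp, 3) returns 5,
while aarch64_get_next_set_bit (bmp, 6) returns SBITMAP_SIZE (bmp). */
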
6856 /* Do the work for aarch64_emit_prologue_components and
6857 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
6858 to save/restore; PROLOGUE_P indicates whether to emit the prologue sequence
6859 for these components or the epilogue sequence. That is, it determines
6860 whether we should emit stores or loads and what kind of CFA notes to attach
6861 to the insns. Otherwise the logic for the two sequences is very
6862 similar. */
6863
6864 static void
6865 aarch64_process_components (sbitmap components, bool prologue_p)
6866 {
6867 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
6868 ? HARD_FRAME_POINTER_REGNUM
6869 : STACK_POINTER_REGNUM);
6870
6871 unsigned last_regno = SBITMAP_SIZE (components);
6872 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
6873 rtx_insn *insn = NULL;
6874
6875 while (regno != last_regno)
6876 {
6877 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
6878 machine_mode mode = aarch64_reg_save_mode (regno);
6879
6880 rtx reg = gen_rtx_REG (mode, regno);
6881 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
6882 if (frame_pointer_needed)
6883 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6884 else
6885 offset += crtl->outgoing_args_size;
6886
6887 rtx addr = plus_constant (Pmode, ptr_reg, offset);
6888 rtx mem = gen_frame_mem (mode, addr);
6889
6890 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
6891 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
6892 /* No more registers to handle after REGNO.
6893 Emit a single save/restore and exit. */
6894 if (regno2 == last_regno)
6895 {
6896 insn = emit_insn (set);
6897 if (frame_related_p)
6898 {
6899 RTX_FRAME_RELATED_P (insn) = 1;
6900 if (prologue_p)
6901 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6902 else
6903 add_reg_note (insn, REG_CFA_RESTORE, reg);
6904 }
6905 break;
6906 }
6907
6908 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
6909 /* The next register is not of the same class or its offset is not
6910 mergeable with the current one into a pair. */
6911 if (aarch64_sve_mode_p (mode)
6912 || !satisfies_constraint_Ump (mem)
6913 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
6914 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
6915 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
6916 GET_MODE_SIZE (mode)))
6917 {
6918 insn = emit_insn (set);
6919 if (frame_related_p)
6920 {
6921 RTX_FRAME_RELATED_P (insn) = 1;
6922 if (prologue_p)
6923 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
6924 else
6925 add_reg_note (insn, REG_CFA_RESTORE, reg);
6926 }
6927
6928 regno = regno2;
6929 continue;
6930 }
6931
6932 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
6933
6934 /* REGNO2 can be saved/restored in a pair with REGNO. */
6935 rtx reg2 = gen_rtx_REG (mode, regno2);
6936 if (frame_pointer_needed)
6937 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
6938 else
6939 offset2 += crtl->outgoing_args_size;
6940 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
6941 rtx mem2 = gen_frame_mem (mode, addr2);
6942 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
6943 : gen_rtx_SET (reg2, mem2);
6944
6945 if (prologue_p)
6946 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
6947 else
6948 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
6949
6950 if (frame_related_p || frame_related2_p)
6951 {
6952 RTX_FRAME_RELATED_P (insn) = 1;
6953 if (prologue_p)
6954 {
6955 if (frame_related_p)
6956 add_reg_note (insn, REG_CFA_OFFSET, set);
6957 if (frame_related2_p)
6958 add_reg_note (insn, REG_CFA_OFFSET, set2);
6959 }
6960 else
6961 {
6962 if (frame_related_p)
6963 add_reg_note (insn, REG_CFA_RESTORE, reg);
6964 if (frame_related2_p)
6965 add_reg_note (insn, REG_CFA_RESTORE, reg2);
6966 }
6967 }
6968
6969 regno = aarch64_get_next_set_bit (components, regno2 + 1);
6970 }
6971 }
6972
6973 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
6974
6975 static void
6976 aarch64_emit_prologue_components (sbitmap components)
6977 {
6978 aarch64_process_components (components, true);
6979 }
6980
6981 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
6982
6983 static void
6984 aarch64_emit_epilogue_components (sbitmap components)
6985 {
6986 aarch64_process_components (components, false);
6987 }
6988
6989 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
6990
6991 static void
6992 aarch64_set_handled_components (sbitmap components)
6993 {
6994 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6995 if (bitmap_bit_p (components, regno))
6996 cfun->machine->reg_is_wrapped_separately[regno] = true;
6997 }
6998
6999 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
7000 determine the probe offset for alloca. */
7001
7002 static HOST_WIDE_INT
7003 aarch64_stack_clash_protection_alloca_probe_range (void)
7004 {
7005 return STACK_CLASH_CALLER_GUARD;
7006 }
7007
7008
7009 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7010 registers. If POLY_SIZE is not large enough to require a probe this function
7011 will only adjust the stack. When allocating the stack space
7012 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
7013 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7014 arguments. If we are, we ensure that any allocation larger than the ABI
7015 defined buffer needs a probe, so that the invariant of having a 1KB buffer is
7016 maintained.
7017
7018 We emit barriers after each stack adjustment to prevent optimizations from
7019 breaking the invariant that we never drop the stack more than a page. This
7020 invariant is needed to make it easier to correctly handle asynchronous
7021 events: if we were to allow the stack to be dropped by more than a page
7022 and then emit several probes, and a signal were taken somewhere in between,
7023 the signal handler would not know the state of the stack and could make no
7024 assumptions about which pages have been probed. */
7025
7026 static void
7027 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7028 poly_int64 poly_size,
7029 bool frame_related_p,
7030 bool final_adjustment_p)
7031 {
7032 HOST_WIDE_INT guard_size
7033 = 1 << param_stack_clash_protection_guard_size;
7034 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7035 HOST_WIDE_INT min_probe_threshold
7036 = (final_adjustment_p
7037 ? guard_used_by_caller
7038 : guard_size - guard_used_by_caller);
7039 /* When doing the final adjustment for the outgoing arguments, take into
7040 account any unprobed space there is above the current SP. There are
7041 two cases:
7042
7043 - When saving SVE registers below the hard frame pointer, we force
7044 the lowest save to take place in the prologue before doing the final
7045 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7046 This acts as a probe at SP, so there is no unprobed space.
7047
7048 - When there are no SVE register saves, we use the store of the link
7049 register as a probe. We can't assume that LR was saved at position 0
7050 though, so treat any space below it as unprobed. */
7051 if (final_adjustment_p
7052 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7053 {
7054 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7055 if (known_ge (lr_offset, 0))
7056 min_probe_threshold -= lr_offset.to_constant ();
7057 else
7058 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7059 }
7060
7061 poly_int64 frame_size = cfun->machine->frame.frame_size;
7062
7063 /* We should always have a positive probe threshold. */
7064 gcc_assert (min_probe_threshold > 0);
7065
7066 if (flag_stack_clash_protection && !final_adjustment_p)
7067 {
7068 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7069 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7070 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7071
7072 if (known_eq (frame_size, 0))
7073 {
7074 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7075 }
7076 else if (known_lt (initial_adjust + sve_callee_adjust,
7077 guard_size - guard_used_by_caller)
7078 && known_lt (final_adjust, guard_used_by_caller))
7079 {
7080 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7081 }
7082 }
7083
7084 /* If SIZE is not large enough to require probing, just adjust the stack and
7085 exit. */
7086 if (known_lt (poly_size, min_probe_threshold)
7087 || !flag_stack_clash_protection)
7088 {
7089 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7090 return;
7091 }
7092
7093 HOST_WIDE_INT size;
7094 /* Handle the SVE non-constant case first. */
7095 if (!poly_size.is_constant (&size))
7096 {
7097 if (dump_file)
7098 {
7099 fprintf (dump_file, "Stack clash SVE prologue: ");
7100 print_dec (poly_size, dump_file);
7101 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7102 }
7103
7104 /* First calculate the amount of bytes we're actually spilling. */
7105 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7106 poly_size, temp1, temp2, false, true);
7107
7108 rtx_insn *insn = get_last_insn ();
7109
7110 if (frame_related_p)
7111 {
7112 /* This is done to provide unwinding information for the stack
7113 adjustments we're about to do; however, to prevent the optimizers
7114 from removing the R11 move and leaving the CFA note (which would be
7115 very wrong), we tie the old and new stack pointers together.
7116 The tie will expand to nothing but the optimizers will not touch
7117 the instruction. */
7118 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7119 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7120 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7121
7122 /* We want the CFA independent of the stack pointer for the
7123 duration of the loop. */
7124 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7125 RTX_FRAME_RELATED_P (insn) = 1;
7126 }
7127
7128 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7129 rtx guard_const = gen_int_mode (guard_size, Pmode);
7130
7131 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7132 stack_pointer_rtx, temp1,
7133 probe_const, guard_const));
7134
7135 /* Now reset the CFA register if needed. */
7136 if (frame_related_p)
7137 {
7138 add_reg_note (insn, REG_CFA_DEF_CFA,
7139 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7140 gen_int_mode (poly_size, Pmode)));
7141 RTX_FRAME_RELATED_P (insn) = 1;
7142 }
7143
7144 return;
7145 }
7146
7147 if (dump_file)
7148 fprintf (dump_file,
7149 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7150 " bytes, probing will be required.\n", size);
7151
7152 /* Round size down to a multiple of guard_size, and calculate the
7153 residual as the difference between the original size and the rounded
7154 size. */
7155 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7156 HOST_WIDE_INT residual = size - rounded_size;
7157
7158 /* We can handle a small number of allocations/probes inline. Otherwise
7159 punt to a loop. */
7160 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7161 {
7162 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7163 {
7164 aarch64_sub_sp (NULL, temp2, guard_size, true);
7165 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7166 guard_used_by_caller));
7167 emit_insn (gen_blockage ());
7168 }
7169 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7170 }
7171 else
7172 {
7173 /* Compute the ending address. */
7174 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7175 temp1, NULL, false, true);
7176 rtx_insn *insn = get_last_insn ();
7177
7178 /* For the initial allocation, we don't have a frame pointer
7179 set up, so we always need CFI notes. If we're doing the
7180 final allocation, then we may have a frame pointer, in which
7181 case it is the CFA, otherwise we need CFI notes.
7182
7183 We can determine which allocation we are doing by looking at
7184 the value of FRAME_RELATED_P since the final allocations are not
7185 frame related. */
7186 if (frame_related_p)
7187 {
7188 /* We want the CFA independent of the stack pointer for the
7189 duration of the loop. */
7190 add_reg_note (insn, REG_CFA_DEF_CFA,
7191 plus_constant (Pmode, temp1, rounded_size));
7192 RTX_FRAME_RELATED_P (insn) = 1;
7193 }
7194
7195 /* This allocates and probes the stack. Note that this re-uses some of
7196 the existing Ada stack protection code. However, we are guaranteed not
7197 to enter the non-loop or residual branches of that code.
7198
7199 The non-loop part won't be entered because if our allocation amount
7200 doesn't require a loop, the case above would handle it.
7201
7202 The residual branch won't be entered because TEMP1 is a multiple of
7203 the allocation size. The residual will always be 0. As such, the only
7204 part we are actually using from that code is the loop setup. The
7205 actual probing is done in aarch64_output_probe_stack_range. */
7206 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7207 stack_pointer_rtx, temp1));
7208
7209 /* Now reset the CFA register if needed. */
7210 if (frame_related_p)
7211 {
7212 add_reg_note (insn, REG_CFA_DEF_CFA,
7213 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7214 RTX_FRAME_RELATED_P (insn) = 1;
7215 }
7216
7217 emit_insn (gen_blockage ());
7218 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7219 }
7220
7221 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7222 be probed. This maintains the requirement that each page is probed at
7223 least once. For initial probing we probe only if the allocation is
7224 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7225 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7226 GUARD_SIZE. This ensures that for any allocation that is large enough to
7227 trigger a probe here, we'll have at least one, and if an allocation is not
7228 large enough for this code to emit anything for it, the page would have been
7229 probed by the saving of FP/LR, either by this function or any callees. If
7230 we don't have any callees then we won't have more stack adjustments and so
7231 are still safe. */
7232 if (residual)
7233 {
7234 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7235 /* If we're doing final adjustments, and we've done any full page
7236 allocations then any residual needs to be probed. */
7237 if (final_adjustment_p && rounded_size != 0)
7238 min_probe_threshold = 0;
7239 /* If doing a small final adjustment, we always probe at offset 0.
7240 This is done to avoid issues when LR is not at position 0 or when
7241 the final adjustment is smaller than the probing offset. */
7242 else if (final_adjustment_p && rounded_size == 0)
7243 residual_probe_offset = 0;
7244
7245 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7246 if (residual >= min_probe_threshold)
7247 {
7248 if (dump_file)
7249 fprintf (dump_file,
7250 "Stack clash AArch64 prologue residuals: "
7251 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7252 "\n", residual);
7253
7254 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7255 residual_probe_offset));
7256 emit_insn (gen_blockage ());
7257 }
7258 }
7259 }
7260
7261 /* Return 1 if the register is used by the epilogue. We need to say the
7262 return register is used, but only after epilogue generation is complete.
7263 Note that in the case of sibcalls, the values "used by the epilogue" are
7264 considered live at the start of the called function.
7265
7266 For SIMD functions we need to return 1 for FP registers that are saved and
7267 restored by a function but are not zero in call_used_regs. If we do not do
7268 this, optimizations may remove the restore of the register. */
7269
7270 int
7271 aarch64_epilogue_uses (int regno)
7272 {
7273 if (epilogue_completed)
7274 {
7275 if (regno == LR_REGNUM)
7276 return 1;
7277 }
7278 return 0;
7279 }
7280
7281 /* AArch64 stack frames generated by this compiler look like:
7282
7283 +-------------------------------+
7284 | |
7285 | incoming stack arguments |
7286 | |
7287 +-------------------------------+
7288 | | <-- incoming stack pointer (aligned)
7289 | callee-allocated save area |
7290 | for register varargs |
7291 | |
7292 +-------------------------------+
7293 | local variables | <-- frame_pointer_rtx
7294 | |
7295 +-------------------------------+
7296 | padding | \
7297 +-------------------------------+ |
7298 | callee-saved registers | | frame.saved_regs_size
7299 +-------------------------------+ |
7300 | LR' | |
7301 +-------------------------------+ |
7302 | FP' | |
7303 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7304 | SVE vector registers | | \
7305 +-------------------------------+ | | below_hard_fp_saved_regs_size
7306 | SVE predicate registers | / /
7307 +-------------------------------+
7308 | dynamic allocation |
7309 +-------------------------------+
7310 | padding |
7311 +-------------------------------+
7312 | outgoing stack arguments | <-- arg_pointer
7313 | |
7314 +-------------------------------+
7315 | | <-- stack_pointer_rtx (aligned)
7316
7317 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7318 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7319 unchanged.
7320
7321 By default for stack-clash we assume the guard is at least 64KB, but this
7322 value is configurable to either 4KB or 64KB. We also force the guard size to
7323 be the same as the probing interval and both values are kept in sync.
7324
7325 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7326 on the guard size) of stack space without probing.
7327
7328 When probing is needed, we emit a probe at the start of the prologue
7329 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7330
7331 We have to track how much space has been allocated and the only stores
7332 to the stack we track as implicit probes are the FP/LR stores.
7333
7334 For outgoing arguments we probe if the size is larger than 1KB, such that
7335 the ABI specified buffer is maintained for the next callee.
7336
7337 The following registers are reserved during frame layout and should not be
7338 used for any other purpose:
7339
7340 - r11: Used by stack clash protection when SVE is enabled, and also
7341 as an anchor register when saving and restoring registers
7342 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7343 - r14 and r15: Used for speculation tracking.
7344 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7345 - r30(LR), r29(FP): Used by standard frame layout.
7346
7347 These registers must be avoided in frame layout related code unless the
7348 explicit intention is to interact with one of the features listed above. */
7349
7350 /* Generate the prologue instructions for entry into a function.
7351 Establish the stack frame by decreasing the stack pointer with a
7352 properly calculated size and, if necessary, create a frame record
7353 filled with the values of LR and previous frame pointer. The
7354 current FP is also set up if it is in use. */
7355
7356 void
7357 aarch64_expand_prologue (void)
7358 {
7359 poly_int64 frame_size = cfun->machine->frame.frame_size;
7360 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7361 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7362 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7363 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7364 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7365 poly_int64 below_hard_fp_saved_regs_size
7366 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7367 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7368 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7369 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7370 rtx_insn *insn;
7371
7372 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7373 {
7374 /* Fold the SVE allocation into the initial allocation.
7375 We don't do this in aarch64_layout_arg to avoid pessimizing
7376 the epilogue code. */
7377 initial_adjust += sve_callee_adjust;
7378 sve_callee_adjust = 0;
7379 }
7380
7381 /* Sign return address for functions. */
7382 if (aarch64_return_address_signing_enabled ())
7383 {
7384 switch (aarch64_ra_sign_key)
7385 {
7386 case AARCH64_KEY_A:
7387 insn = emit_insn (gen_paciasp ());
7388 break;
7389 case AARCH64_KEY_B:
7390 insn = emit_insn (gen_pacibsp ());
7391 break;
7392 default:
7393 gcc_unreachable ();
7394 }
7395 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7396 RTX_FRAME_RELATED_P (insn) = 1;
7397 }
7398
7399 if (flag_stack_usage_info)
7400 current_function_static_stack_size = constant_lower_bound (frame_size);
7401
7402 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7403 {
7404 if (crtl->is_leaf && !cfun->calls_alloca)
7405 {
7406 if (maybe_gt (frame_size, PROBE_INTERVAL)
7407 && maybe_gt (frame_size, get_stack_check_protect ()))
7408 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7409 (frame_size
7410 - get_stack_check_protect ()));
7411 }
7412 else if (maybe_gt (frame_size, 0))
7413 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7414 }
7415
7416 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7417 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7418
7419 /* In theory we should never have both an initial adjustment
7420 and a callee save adjustment. Verify that is the case since the
7421 code below does not handle it for -fstack-clash-protection. */
7422 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
7423
7424 /* Will only probe if the initial adjustment is larger than the guard
7425 less the amount of the guard reserved for use by the caller's
7426 outgoing args. */
7427 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
7428 true, false);
7429
7430 if (callee_adjust != 0)
7431 aarch64_push_regs (reg1, reg2, callee_adjust);
7432
7433 /* The offset of the frame chain record (if any) from the current SP. */
7434 poly_int64 chain_offset = (initial_adjust + callee_adjust
7435 - cfun->machine->frame.hard_fp_offset);
7436 gcc_assert (known_ge (chain_offset, 0));
7437
7438 /* The offset of the bottom of the save area from the current SP. */
7439 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
7440
7441 if (emit_frame_chain)
7442 {
7443 if (callee_adjust == 0)
7444 {
7445 reg1 = R29_REGNUM;
7446 reg2 = R30_REGNUM;
7447 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
7448 false, false);
7449 }
7450 else
7451 gcc_assert (known_eq (chain_offset, 0));
7452 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
7453 stack_pointer_rtx, chain_offset,
7454 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
7455 if (frame_pointer_needed && !frame_size.is_constant ())
7456 {
7457 /* Variable-sized frames need to describe the save slot
7458 address using DW_CFA_expression rather than DW_CFA_offset.
7459 This means that, without taking further action, the
7460 locations of the registers that we've already saved would
7461 remain based on the stack pointer even after we redefine
7462 the CFA based on the frame pointer. We therefore need new
7463 DW_CFA_expressions to re-express the save slots with addresses
7464 based on the frame pointer. */
7465 rtx_insn *insn = get_last_insn ();
7466 gcc_assert (RTX_FRAME_RELATED_P (insn));
7467
7468 /* Add an explicit CFA definition if this was previously
7469 implicit. */
7470 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
7471 {
7472 rtx src = plus_constant (Pmode, stack_pointer_rtx,
7473 callee_offset);
7474 add_reg_note (insn, REG_CFA_ADJUST_CFA,
7475 gen_rtx_SET (hard_frame_pointer_rtx, src));
7476 }
7477
7478 /* Change the save slot expressions for the registers that
7479 we've already saved. */
7480 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
7481 hard_frame_pointer_rtx, UNITS_PER_WORD);
7482 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
7483 hard_frame_pointer_rtx, 0);
7484 }
7485 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
7486 }
7487
7488 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
7489 callee_adjust != 0 || emit_frame_chain,
7490 emit_frame_chain);
7491 if (maybe_ne (sve_callee_adjust, 0))
7492 {
7493 gcc_assert (!flag_stack_clash_protection
7494 || known_eq (initial_adjust, 0));
7495 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
7496 sve_callee_adjust,
7497 !frame_pointer_needed, false);
7498 saved_regs_offset += sve_callee_adjust;
7499 }
7500 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
7501 false, emit_frame_chain);
7502 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
7503 callee_adjust != 0 || emit_frame_chain,
7504 emit_frame_chain);
7505
7506 /* We may need to probe the final adjustment if it is larger than the guard
7507 that is assumed by the callee. */
7508 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
7509 !frame_pointer_needed, true);
7510 }
7511
7512 /* Return TRUE if we can use a simple_return insn.
7513
7514 This function checks whether the callee-saved stack is empty, which
7515 means no restore actions are needed. The pro_and_epilogue pass will use
7516 this to check whether the shrink-wrapping optimization is feasible. */
7517
7518 bool
7519 aarch64_use_return_insn_p (void)
7520 {
7521 if (!reload_completed)
7522 return false;
7523
7524 if (crtl->profile)
7525 return false;
7526
7527 return known_eq (cfun->machine->frame.frame_size, 0);
7528 }
7529
7530 /* Generate the epilogue instructions for returning from a function.
7531 This is almost exactly the reverse of the prolog sequence, except
7532 that we need to insert barriers to avoid scheduling loads that read
7533 from a deallocated stack, and we optimize the unwind records by
7534 emitting them all together if possible. */
7535 void
7536 aarch64_expand_epilogue (bool for_sibcall)
7537 {
7538 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7539 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7540 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7541 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7542 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7543 poly_int64 below_hard_fp_saved_regs_size
7544 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7545 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7546 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7547 rtx cfi_ops = NULL;
7548 rtx_insn *insn;
7549 /* A stack clash protection prologue may not have left EP0_REGNUM or
7550 EP1_REGNUM in a usable state. The same is true for allocations
7551 with an SVE component, since we then need both temporary registers
7552 for each allocation. For stack clash we are in a usable state if
7553 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
7554 HOST_WIDE_INT guard_size
7555 = 1 << param_stack_clash_protection_guard_size;
7556 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7557
7558 /* We can re-use the registers when:
7559
7560 (a) the deallocation amount is the same as the corresponding
7561 allocation amount (which is false if we combine the initial
7562 and SVE callee save allocations in the prologue); and
7563
7564 (b) the allocation amount doesn't need a probe (which is false
7565 if the amount is guard_size - guard_used_by_caller or greater).
7566
7567 In such situations the register should remain live with the correct
7568 value. */
7569 bool can_inherit_p = (initial_adjust.is_constant ()
7570 && final_adjust.is_constant ()
7571 && (!flag_stack_clash_protection
7572 || (known_lt (initial_adjust,
7573 guard_size - guard_used_by_caller)
7574 && known_eq (sve_callee_adjust, 0))));
7575
7576 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
7577 bool need_barrier_p
7578 = maybe_ne (get_frame_size ()
7579 + cfun->machine->frame.saved_varargs_size, 0);
7580
7581 /* Emit a barrier to prevent loads from a deallocated stack. */
7582 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
7583 || cfun->calls_alloca
7584 || crtl->calls_eh_return)
7585 {
7586 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7587 need_barrier_p = false;
7588 }
7589
7590 /* Restore the stack pointer from the frame pointer if it may not
7591 be the same as the stack pointer. */
7592 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7593 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7594 if (frame_pointer_needed
7595 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
7596 /* If writeback is used when restoring callee-saves, the CFA
7597 is restored on the instruction doing the writeback. */
7598 aarch64_add_offset (Pmode, stack_pointer_rtx,
7599 hard_frame_pointer_rtx,
7600 -callee_offset - below_hard_fp_saved_regs_size,
7601 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
7602 else
7603 /* The case where we need to re-use the register here is very rare, so
7604 avoid the complicated condition and just always emit a move if the
7605 immediate doesn't fit. */
7606 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
7607
7608 /* Restore the vector registers before the predicate registers,
7609 so that we can use P4 as a temporary for big-endian SVE frames. */
7610 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
7611 callee_adjust != 0, &cfi_ops);
7612 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
7613 false, &cfi_ops);
7614 if (maybe_ne (sve_callee_adjust, 0))
7615 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
7616 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
7617 R0_REGNUM, R30_REGNUM,
7618 callee_adjust != 0, &cfi_ops);
7619
7620 if (need_barrier_p)
7621 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
7622
7623 if (callee_adjust != 0)
7624 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
7625
7626 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
7627 {
7628 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
7629 insn = get_last_insn ();
7630 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
7631 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
7632 RTX_FRAME_RELATED_P (insn) = 1;
7633 cfi_ops = NULL;
7634 }
7635
7636 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
7637 restrict the emit_move optimization to leaf functions. */
7638 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
7639 (!can_inherit_p || !crtl->is_leaf
7640 || df_regs_ever_live_p (EP0_REGNUM)));
7641
7642 if (cfi_ops)
7643 {
7644 /* Emit delayed restores and reset the CFA to be SP. */
7645 insn = get_last_insn ();
7646 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
7647 REG_NOTES (insn) = cfi_ops;
7648 RTX_FRAME_RELATED_P (insn) = 1;
7649 }
7650
7651 /* We prefer to emit the combined return/authenticate instruction RETAA;
7652 however, there are three cases in which we must instead emit an explicit
7653 authentication instruction.
7654
7655 1) Sibcalls don't return in a normal way, so if we're about to call one
7656 we must authenticate.
7657
7658 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
7659 generating code for !TARGET_ARMV8_3 we can't use it and must
7660 explicitly authenticate.
7661
7662 3) On an eh_return path we make extra stack adjustments to update the
7663 canonical frame address to be the exception handler's CFA. We want
7664 to authenticate using the CFA of the function which calls eh_return.
7665 */
7666 if (aarch64_return_address_signing_enabled ()
7667 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
7668 {
7669 switch (aarch64_ra_sign_key)
7670 {
7671 case AARCH64_KEY_A:
7672 insn = emit_insn (gen_autiasp ());
7673 break;
7674 case AARCH64_KEY_B:
7675 insn = emit_insn (gen_autibsp ());
7676 break;
7677 default:
7678 gcc_unreachable ();
7679 }
7680 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7681 RTX_FRAME_RELATED_P (insn) = 1;
7682 }
7683
7684 /* Stack adjustment for exception handler. */
7685 if (crtl->calls_eh_return && !for_sibcall)
7686 {
7687 /* We need to unwind the stack by the offset computed by
7688 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
7689 to be SP; letting the CFA move during this adjustment
7690 is just as correct as retaining the CFA from the body
7691 of the function. Therefore, do nothing special. */
7692 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
7693 }
7694
7695 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
7696 if (!for_sibcall)
7697 emit_jump_insn (ret_rtx);
7698 }
7699
7700 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
7701 normally or return to a previous frame after unwinding.
7702
7703 An EH return uses a single shared return sequence. The epilogue is
7704 exactly like a normal epilogue except that it has an extra input
7705 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
7706 that must be applied after the frame has been destroyed. An extra label
7707 is inserted before the epilogue which initializes this register to zero,
7708 and this is the entry point for a normal return.
7709
7710 An actual EH return updates the return address, initializes the stack
7711 adjustment and jumps directly into the epilogue (bypassing the zeroing
7712 of the adjustment). Since the return address is typically saved on the
7713 stack when a function makes a call, the saved LR must be updated outside
7714 the epilogue.
7715
7716 This poses problems as the store is generated well before the epilogue,
7717 so the offset of LR is not known yet. Also optimizations will remove the
7718 store as it appears dead, even after the epilogue is generated (as the
7719 base or offset for loading LR is different in many cases).
7720
7721 To avoid these problems this implementation forces the frame pointer
7722 in eh_return functions so that the location of LR is fixed and known early.
7723 It also marks the store volatile, so no optimization is permitted to
7724 remove the store. */
7725 rtx
7726 aarch64_eh_return_handler_rtx (void)
7727 {
7728 rtx tmp = gen_frame_mem (Pmode,
7729 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
7730
7731 /* Mark the store volatile, so no optimization is permitted to remove it. */
7732 MEM_VOLATILE_P (tmp) = true;
7733 return tmp;
7734 }
7735
7736 /* Output code to add DELTA to the first argument, and then jump
7737 to FUNCTION. Used for C++ multiple inheritance. */
7738 static void
7739 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7740 HOST_WIDE_INT delta,
7741 HOST_WIDE_INT vcall_offset,
7742 tree function)
7743 {
7744 /* The this pointer is always in x0. Note that this differs from
7745 Arm where the this pointer may be bumped to r1 if r0 is required
7746 to return a pointer to an aggregate. On AArch64 a result value
7747 pointer will be in x8. */
7748 int this_regno = R0_REGNUM;
7749 rtx this_rtx, temp0, temp1, addr, funexp;
7750 rtx_insn *insn;
7751 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
7752
7753 if (aarch64_bti_enabled ())
7754 emit_insn (gen_bti_c());
7755
7756 reload_completed = 1;
7757 emit_note (NOTE_INSN_PROLOGUE_END);
7758
7759 this_rtx = gen_rtx_REG (Pmode, this_regno);
7760 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
7761 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
7762
7763 if (vcall_offset == 0)
7764 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
7765 else
7766 {
7767 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
7768
7769 addr = this_rtx;
7770 if (delta != 0)
7771 {
7772 if (delta >= -256 && delta < 256)
7773 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
7774 plus_constant (Pmode, this_rtx, delta));
7775 else
7776 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
7777 temp1, temp0, false);
7778 }
7779
7780 if (Pmode == ptr_mode)
7781 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
7782 else
7783 aarch64_emit_move (temp0,
7784 gen_rtx_ZERO_EXTEND (Pmode,
7785 gen_rtx_MEM (ptr_mode, addr)));
7786
7787 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
7788 addr = plus_constant (Pmode, temp0, vcall_offset);
7789 else
7790 {
7791 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
7792 Pmode);
7793 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
7794 }
7795
7796 if (Pmode == ptr_mode)
7797 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
7798 else
7799 aarch64_emit_move (temp1,
7800 gen_rtx_SIGN_EXTEND (Pmode,
7801 gen_rtx_MEM (ptr_mode, addr)));
7802
7803 emit_insn (gen_add2_insn (this_rtx, temp1));
7804 }
7805
7806 /* Generate a tail call to the target function. */
7807 if (!TREE_USED (function))
7808 {
7809 assemble_external (function);
7810 TREE_USED (function) = 1;
7811 }
7812 funexp = XEXP (DECL_RTL (function), 0);
7813 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
7814 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
7815 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
7816 SIBLING_CALL_P (insn) = 1;
7817
7818 insn = get_insns ();
7819 shorten_branches (insn);
7820
7821 assemble_start_function (thunk, fnname);
7822 final_start_function (insn, file, 1);
7823 final (insn, file, 1);
7824 final_end_function ();
7825 assemble_end_function (thunk, fnname);
7826
7827 /* Stop pretending to be a post-reload pass. */
7828 reload_completed = 0;
7829 }
7830
7831 static bool
7832 aarch64_tls_referenced_p (rtx x)
7833 {
7834 if (!TARGET_HAVE_TLS)
7835 return false;
7836 subrtx_iterator::array_type array;
7837 FOR_EACH_SUBRTX (iter, array, x, ALL)
7838 {
7839 const_rtx x = *iter;
7840 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
7841 return true;
7842 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
7843 TLS offsets, not real symbol references. */
7844 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7845 iter.skip_subrtxes ();
7846 }
7847 return false;
7848 }
7849
7850
7851 /* Return true if val can be encoded as a 12-bit unsigned immediate with
7852 a left shift of 0 or 12 bits. */
7853 bool
7854 aarch64_uimm12_shift (HOST_WIDE_INT val)
7855 {
7856 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
7857 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
7858 );
7859 }
7860
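/* For instance, 0xabc (a 12-bit value, shift 0) and 0xabc000 (0xabc << 12)
both satisfy aarch64_uimm12_shift, whereas 0xabc001 does not, since its
set bits do not fit entirely within either 12-bit field. */
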
7861 /* Return the closest value to VAL that does not exceed it and can be used
7862 as a 12-bit unsigned immediate with a left shift of 0 or 12. */
7863 static HOST_WIDE_INT
7864 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
7865 {
7866 /* Check to see if the value fits in 24 bits, as that is the maximum we can
7867 handle correctly. */
7868 gcc_assert ((val & 0xffffff) == val);
7869
7870 if (((val & 0xfff) << 0) == val)
7871 return val;
7872
7873 return val & (0xfff << 12);
7874 }
7875
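/* For example, aarch64_clamp_to_uimm12_shift (0x123456) yields 0x123000:
the low 12 bits are dropped so that the result can be encoded as
0x123 << 12, and the result never exceeds the original value. */
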
7876 /* Return true if val is an immediate that can be loaded into a
7877 register by a MOVZ instruction. */
7878 static bool
7879 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
7880 {
7881 if (GET_MODE_SIZE (mode) > 4)
7882 {
7883 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
7884 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
7885 return 1;
7886 }
7887 else
7888 {
7889 /* Ignore sign extension. */
7890 val &= (HOST_WIDE_INT) 0xffffffff;
7891 }
7892 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
7893 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
7894 }
7895
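/* For example, in DImode 0xbeef0000 (0xbeef << 16) and 0xdead00000000
(0xdead << 32) can each be materialized by a single MOVZ, so
aarch64_movw_imm returns true for them, while 0xdeadbeef needs more than
one 16-bit chunk and so returns false. */
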
7896 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
7897 64-bit (DImode) integer. */
7898
7899 static unsigned HOST_WIDE_INT
7900 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
7901 {
7902 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
7903 while (size < 64)
7904 {
7905 val &= (HOST_WIDE_INT_1U << size) - 1;
7906 val |= val << size;
7907 size *= 2;
7908 }
7909 return val;
7910 }
7911
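/* For instance, replicating the SImode value 0x0000ff00 gives
0x0000ff000000ff00, and replicating the HImode value 0x00f0 gives
0x00f000f000f000f0; the result always fills all 64 bits. */
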
7912 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
7913
7914 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
7915 {
7916 0x0000000100000001ull,
7917 0x0001000100010001ull,
7918 0x0101010101010101ull,
7919 0x1111111111111111ull,
7920 0x5555555555555555ull,
7921 };
7922
7923
7924 /* Return true if val is a valid bitmask immediate. */
7925
7926 bool
7927 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
7928 {
7929 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
7930 int bits;
7931
7932 /* Check for a single sequence of one bits and return quickly if so.
7933 The special cases of all ones and all zeroes return false. */
7934 val = aarch64_replicate_bitmask_imm (val_in, mode);
7935 tmp = val + (val & -val);
7936
7937 if (tmp == (tmp & -tmp))
7938 return (val + 1) > 1;
7939
7940 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
7941 if (mode == SImode)
7942 val = (val << 32) | (val & 0xffffffff);
7943
7944 /* Invert if the immediate doesn't start with a zero bit - this means we
7945 only need to search for sequences of one bits. */
7946 if (val & 1)
7947 val = ~val;
7948
7949 /* Find the first set bit and set tmp to val with the first sequence of one
7950 bits removed. Return success if there is a single sequence of ones. */
7951 first_one = val & -val;
7952 tmp = val & (val + first_one);
7953
7954 if (tmp == 0)
7955 return true;
7956
7957 /* Find the next set bit and compute the difference in bit position. */
7958 next_one = tmp & -tmp;
7959 bits = clz_hwi (first_one) - clz_hwi (next_one);
7960 mask = val ^ tmp;
7961
7962 /* Check the bit position difference is a power of 2, and that the first
7963 sequence of one bits fits within 'bits' bits. */
7964 if ((mask >> bits) != 0 || bits != (bits & -bits))
7965 return false;
7966
7967 /* Check the sequence of one bits is repeated 64/bits times. */
7968 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
7969 }
7970
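/* For example, 0x0000ffff0000ffff (the 32-bit element 0x0000ffff repeated)
and 0x00000000003ffff0 (one contiguous run of ones) are valid bitmask
immediates, whereas 0, ~0 and a value such as 0x0000000000ff00f1 (whose
set bits form neither a single, possibly rotated, run of ones nor a
repeating power-of-two-sized element) are rejected. */
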
7971 /* Create a mask of ones covering the range from the lowest to the highest
7972 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
7973
7974 unsigned HOST_WIDE_INT
7975 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
7976 {
7977 int lowest_bit_set = ctz_hwi (val_in);
7978 int highest_bit_set = floor_log2 (val_in);
7979 gcc_assert (val_in != 0);
7980
7981 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
7982 (HOST_WIDE_INT_1U << lowest_bit_set));
7983 }
7984
7985 /* Create a constant in which all bits outside the range from the lowest to
7986 the highest bit set in VAL_IN are set to 1. */
7987
7988 unsigned HOST_WIDE_INT
7989 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
7990 {
7991 return val_in | ~aarch64_and_split_imm1 (val_in);
7992 }
7993
7994 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
7995
7996 bool
7997 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
7998 {
7999 scalar_int_mode int_mode;
8000 if (!is_a <scalar_int_mode> (mode, &int_mode))
8001 return false;
8002
8003 if (aarch64_bitmask_imm (val_in, int_mode))
8004 return false;
8005
8006 if (aarch64_move_imm (val_in, int_mode))
8007 return false;
8008
8009 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8010
8011 return aarch64_bitmask_imm (imm2, int_mode);
8012 }
8013
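/* A worked example of the split implemented by the helpers above, in DImode:
for the AND mask 0x0000000000fff00f, aarch64_and_split_imm1 gives
0x0000000000ffffff (ones from the lowest to the highest set bit) and
aarch64_and_split_imm2 gives 0xfffffffffffff00f (the original bits plus
everything outside that range). Both halves are encodable as single AND
immediates and their intersection is the original mask, so
aarch64_and_bitmask_imm accepts 0x0000000000fff00f even though it is
neither a bitmask immediate nor a MOV immediate itself. */
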
8014 /* Return true if val is an immediate that can be loaded into a
8015 register in a single instruction. */
8016 bool
8017 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8018 {
8019 scalar_int_mode int_mode;
8020 if (!is_a <scalar_int_mode> (mode, &int_mode))
8021 return false;
8022
8023 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8024 return 1;
8025 return aarch64_bitmask_imm (val, int_mode);
8026 }
8027
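/* For instance, in DImode 0x0000000000005000 (a single MOVZ),
0xffffffffffff1234 (MOVN of 0xedcb) and 0x0000ffff0000ffff (a bitmask
immediate, typically materialized with ORR from XZR) can each be loaded
in one instruction, so aarch64_move_imm returns true for them. */
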
8028 static bool
8029 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8030 {
8031 rtx base, offset;
8032
8033 if (GET_CODE (x) == HIGH)
8034 return true;
8035
8036 /* There's no way to calculate VL-based values using relocations. */
8037 subrtx_iterator::array_type array;
8038 FOR_EACH_SUBRTX (iter, array, x, ALL)
8039 if (GET_CODE (*iter) == CONST_POLY_INT)
8040 return true;
8041
8042 split_const (x, &base, &offset);
8043 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8044 {
8045 if (aarch64_classify_symbol (base, INTVAL (offset))
8046 != SYMBOL_FORCE_TO_MEM)
8047 return true;
8048 else
8049 /* Avoid generating a 64-bit relocation in ILP32; leave it
8050 to aarch64_expand_mov_immediate to handle properly. */
8051 return mode != ptr_mode;
8052 }
8053
8054 return aarch64_tls_referenced_p (x);
8055 }
8056
8057 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8058 The expansion for a table switch is quite expensive due to the number
8059 of instructions, the table lookup and the hard-to-predict indirect jump.
8060 When optimizing for speed, and -O3 enabled, use the per-core tuning if
8061 set, otherwise use tables for > 16 cases as a tradeoff between size and
8062 performance. When optimizing for size, use the default setting. */
8063
8064 static unsigned int
8065 aarch64_case_values_threshold (void)
8066 {
8067 /* Use the specified limit for the number of cases before using jump
8068 tables at higher optimization levels. */
8069 if (optimize > 2
8070 && selected_cpu->tune->max_case_values != 0)
8071 return selected_cpu->tune->max_case_values;
8072 else
8073 return optimize_size ? default_case_values_threshold () : 17;
8074 }
8075
8076 /* Return true if register REGNO is a valid index register.
8077 STRICT_P is true if REG_OK_STRICT is in effect. */
8078
8079 bool
8080 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8081 {
8082 if (!HARD_REGISTER_NUM_P (regno))
8083 {
8084 if (!strict_p)
8085 return true;
8086
8087 if (!reg_renumber)
8088 return false;
8089
8090 regno = reg_renumber[regno];
8091 }
8092 return GP_REGNUM_P (regno);
8093 }
8094
8095 /* Return true if register REGNO is a valid base register.
8096 STRICT_P is true if REG_OK_STRICT is in effect. */
8097
8098 bool
8099 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8100 {
8101 if (!HARD_REGISTER_NUM_P (regno))
8102 {
8103 if (!strict_p)
8104 return true;
8105
8106 if (!reg_renumber)
8107 return false;
8108
8109 regno = reg_renumber[regno];
8110 }
8111
8112 /* The fake registers will be eliminated to either the stack or
8113 hard frame pointer, both of which are usually valid base registers.
8114 Reload deals with the cases where the eliminated form isn't valid. */
8115 return (GP_REGNUM_P (regno)
8116 || regno == SP_REGNUM
8117 || regno == FRAME_POINTER_REGNUM
8118 || regno == ARG_POINTER_REGNUM);
8119 }
8120
8121 /* Return true if X is a valid base register.
8122 STRICT_P is true if REG_OK_STRICT is in effect. */
8123
8124 static bool
8125 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8126 {
8127 if (!strict_p
8128 && GET_CODE (x) == SUBREG
8129 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8130 x = SUBREG_REG (x);
8131
8132 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8133 }
8134
8135 /* Return true if address offset is a valid index. If it is, fill in INFO
8136 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8137
8138 static bool
8139 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8140 machine_mode mode, bool strict_p)
8141 {
8142 enum aarch64_address_type type;
8143 rtx index;
8144 int shift;
8145
8146 /* (reg:P) */
8147 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8148 && GET_MODE (x) == Pmode)
8149 {
8150 type = ADDRESS_REG_REG;
8151 index = x;
8152 shift = 0;
8153 }
8154 /* (sign_extend:DI (reg:SI)) */
8155 else if ((GET_CODE (x) == SIGN_EXTEND
8156 || GET_CODE (x) == ZERO_EXTEND)
8157 && GET_MODE (x) == DImode
8158 && GET_MODE (XEXP (x, 0)) == SImode)
8159 {
8160 type = (GET_CODE (x) == SIGN_EXTEND)
8161 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8162 index = XEXP (x, 0);
8163 shift = 0;
8164 }
8165 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8166 else if (GET_CODE (x) == MULT
8167 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8168 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8169 && GET_MODE (XEXP (x, 0)) == DImode
8170 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8171 && CONST_INT_P (XEXP (x, 1)))
8172 {
8173 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8174 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8175 index = XEXP (XEXP (x, 0), 0);
8176 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8177 }
8178 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8179 else if (GET_CODE (x) == ASHIFT
8180 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8181 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8182 && GET_MODE (XEXP (x, 0)) == DImode
8183 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8184 && CONST_INT_P (XEXP (x, 1)))
8185 {
8186 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8187 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8188 index = XEXP (XEXP (x, 0), 0);
8189 shift = INTVAL (XEXP (x, 1));
8190 }
8191 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8192 else if ((GET_CODE (x) == SIGN_EXTRACT
8193 || GET_CODE (x) == ZERO_EXTRACT)
8194 && GET_MODE (x) == DImode
8195 && GET_CODE (XEXP (x, 0)) == MULT
8196 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8197 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8198 {
8199 type = (GET_CODE (x) == SIGN_EXTRACT)
8200 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8201 index = XEXP (XEXP (x, 0), 0);
8202 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8203 if (INTVAL (XEXP (x, 1)) != 32 + shift
8204 || INTVAL (XEXP (x, 2)) != 0)
8205 shift = -1;
8206 }
8207 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8208 (const_int 0xffffffff<<shift)) */
8209 else if (GET_CODE (x) == AND
8210 && GET_MODE (x) == DImode
8211 && GET_CODE (XEXP (x, 0)) == MULT
8212 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8213 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8214 && CONST_INT_P (XEXP (x, 1)))
8215 {
8216 type = ADDRESS_REG_UXTW;
8217 index = XEXP (XEXP (x, 0), 0);
8218 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8219 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8220 shift = -1;
8221 }
8222 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8223 else if ((GET_CODE (x) == SIGN_EXTRACT
8224 || GET_CODE (x) == ZERO_EXTRACT)
8225 && GET_MODE (x) == DImode
8226 && GET_CODE (XEXP (x, 0)) == ASHIFT
8227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8228 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8229 {
8230 type = (GET_CODE (x) == SIGN_EXTRACT)
8231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8232 index = XEXP (XEXP (x, 0), 0);
8233 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8234 if (INTVAL (XEXP (x, 1)) != 32 + shift
8235 || INTVAL (XEXP (x, 2)) != 0)
8236 shift = -1;
8237 }
8238 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8239 (const_int 0xffffffff<<shift)) */
8240 else if (GET_CODE (x) == AND
8241 && GET_MODE (x) == DImode
8242 && GET_CODE (XEXP (x, 0)) == ASHIFT
8243 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8244 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8245 && CONST_INT_P (XEXP (x, 1)))
8246 {
8247 type = ADDRESS_REG_UXTW;
8248 index = XEXP (XEXP (x, 0), 0);
8249 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8250 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8251 shift = -1;
8252 }
8253 /* (mult:P (reg:P) (const_int scale)) */
8254 else if (GET_CODE (x) == MULT
8255 && GET_MODE (x) == Pmode
8256 && GET_MODE (XEXP (x, 0)) == Pmode
8257 && CONST_INT_P (XEXP (x, 1)))
8258 {
8259 type = ADDRESS_REG_REG;
8260 index = XEXP (x, 0);
8261 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8262 }
8263 /* (ashift:P (reg:P) (const_int shift)) */
8264 else if (GET_CODE (x) == ASHIFT
8265 && GET_MODE (x) == Pmode
8266 && GET_MODE (XEXP (x, 0)) == Pmode
8267 && CONST_INT_P (XEXP (x, 1)))
8268 {
8269 type = ADDRESS_REG_REG;
8270 index = XEXP (x, 0);
8271 shift = INTVAL (XEXP (x, 1));
8272 }
8273 else
8274 return false;
8275
8276 if (!strict_p
8277 && GET_CODE (index) == SUBREG
8278 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8279 index = SUBREG_REG (index);
8280
8281 if (aarch64_sve_data_mode_p (mode))
8282 {
8283 if (type != ADDRESS_REG_REG
8284 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8285 return false;
8286 }
8287 else
8288 {
8289 if (shift != 0
8290 && !(IN_RANGE (shift, 1, 3)
8291 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8292 return false;
8293 }
8294
8295 if (REG_P (index)
8296 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8297 {
8298 info->type = type;
8299 info->offset = index;
8300 info->shift = shift;
8301 return true;
8302 }
8303
8304 return false;
8305 }
8306
8307 /* Return true if MODE is one of the modes for which we
8308 support LDP/STP operations. */
8309
8310 static bool
8311 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8312 {
8313 return mode == SImode || mode == DImode
8314 || mode == SFmode || mode == DFmode
8315 || (aarch64_vector_mode_supported_p (mode)
8316 && (known_eq (GET_MODE_SIZE (mode), 8)
8317 || (known_eq (GET_MODE_SIZE (mode), 16)
8318 && (aarch64_tune_params.extra_tuning_flags
8319 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8320 }
8321
8322 /* Return true if REGNO is a virtual pointer register, or an eliminable
8323 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8324 include stack_pointer or hard_frame_pointer. */
8325 static bool
8326 virt_or_elim_regno_p (unsigned regno)
8327 {
8328 return ((regno >= FIRST_VIRTUAL_REGISTER
8329 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8330 || regno == FRAME_POINTER_REGNUM
8331 || regno == ARG_POINTER_REGNUM);
8332 }
8333
8334 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8335 If it is, fill in INFO appropriately. STRICT_P is true if
8336 REG_OK_STRICT is in effect. */
8337
8338 bool
8339 aarch64_classify_address (struct aarch64_address_info *info,
8340 rtx x, machine_mode mode, bool strict_p,
8341 aarch64_addr_query_type type)
8342 {
8343 enum rtx_code code = GET_CODE (x);
8344 rtx op0, op1;
8345 poly_int64 offset;
8346
8347 HOST_WIDE_INT const_size;
8348
8349 /* Whether a vector mode is partial doesn't affect address legitimacy.
8350 Partial vectors like VNx8QImode allow the same indexed addressing
8351 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8352 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8353 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8354 vec_flags &= ~VEC_PARTIAL;
8355
8356 /* On BE, we use load/store pair for all large int mode load/stores.
8357 TI/TFmode may also use a load/store pair. */
8358 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8359 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8360 || type == ADDR_QUERY_LDP_STP_N
8361 || mode == TImode
8362 || mode == TFmode
8363 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8364
8365 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8366 corresponds to the actual size of the memory being loaded/stored and the
8367 mode used for the addressing calculation is half of that. */
8368 if (type == ADDR_QUERY_LDP_STP_N
8369 && known_eq (GET_MODE_SIZE (mode), 16))
8370 mode = DFmode;
8371
8372 bool allow_reg_index_p = (!load_store_pair_p
8373 && (known_lt (GET_MODE_SIZE (mode), 16)
8374 || vec_flags == VEC_ADVSIMD
8375 || vec_flags & VEC_SVE_DATA));
8376
8377 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8378 [Rn, #offset, MUL VL]. */
8379 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8380 && (code != REG && code != PLUS))
8381 return false;
8382
8383 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8384 REG addressing. */
8385 if (advsimd_struct_p
8386 && !BYTES_BIG_ENDIAN
8387 && (code != POST_INC && code != REG))
8388 return false;
8389
8390 gcc_checking_assert (GET_MODE (x) == VOIDmode
8391 || SCALAR_INT_MODE_P (GET_MODE (x)));
8392
8393 switch (code)
8394 {
8395 case REG:
8396 case SUBREG:
8397 info->type = ADDRESS_REG_IMM;
8398 info->base = x;
8399 info->offset = const0_rtx;
8400 info->const_offset = 0;
8401 return aarch64_base_register_rtx_p (x, strict_p);
8402
8403 case PLUS:
8404 op0 = XEXP (x, 0);
8405 op1 = XEXP (x, 1);
8406
8407 if (! strict_p
8408 && REG_P (op0)
8409 && virt_or_elim_regno_p (REGNO (op0))
8410 && poly_int_rtx_p (op1, &offset))
8411 {
8412 info->type = ADDRESS_REG_IMM;
8413 info->base = op0;
8414 info->offset = op1;
8415 info->const_offset = offset;
8416
8417 return true;
8418 }
8419
8420 if (maybe_ne (GET_MODE_SIZE (mode), 0)
8421 && aarch64_base_register_rtx_p (op0, strict_p)
8422 && poly_int_rtx_p (op1, &offset))
8423 {
8424 info->type = ADDRESS_REG_IMM;
8425 info->base = op0;
8426 info->offset = op1;
8427 info->const_offset = offset;
8428
8429 /* TImode and TFmode values are allowed in both pairs of X
8430 registers and individual Q registers. The available
8431 address modes are:
8432 X,X: 7-bit signed scaled offset
8433 Q: 9-bit signed offset
8434 We conservatively require an offset representable in both forms.
8435 When performing the check for pairs of X registers i.e. LDP/STP
8436 pass down DImode since that is the natural size of the LDP/STP
8437 instruction memory accesses. */
8438 if (mode == TImode || mode == TFmode)
8439 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
8440 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8441 || offset_12bit_unsigned_scaled_p (mode, offset)));
8442
8443 /* A 7-bit offset check because OImode will emit an ldp/stp
8444 instruction (only big endian will get here).
8445 For ldp/stp instructions, the offset is scaled for the size of a
8446 single element of the pair. */
8447 if (mode == OImode)
8448 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
8449
8450 /* Three 9/12-bit offset checks because CImode will emit three
8451 ldr/str instructions (only big endian will get here). */
8452 if (mode == CImode)
8453 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8454 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
8455 offset + 32)
8456 || offset_12bit_unsigned_scaled_p (V16QImode,
8457 offset + 32)));
8458
8459 /* Two 7-bit offset checks because XImode will emit two ldp/stp
8460 instructions (only big endian will get here). */
8461 if (mode == XImode)
8462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
8463 && aarch64_offset_7bit_signed_scaled_p (TImode,
8464 offset + 32));
8465
8466 /* Make "m" use the LD1 offset range for SVE data modes, so
8467 that pre-RTL optimizers like ivopts optimize for that range
8468 instead of the wider LDR/STR range. */
8469 if (vec_flags == VEC_SVE_DATA)
8470 return (type == ADDR_QUERY_M
8471 ? offset_4bit_signed_scaled_p (mode, offset)
8472 : offset_9bit_signed_scaled_p (mode, offset));
8473
8474 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
8475 {
8476 poly_int64 end_offset = (offset
8477 + GET_MODE_SIZE (mode)
8478 - BYTES_PER_SVE_VECTOR);
8479 return (type == ADDR_QUERY_M
8480 ? offset_4bit_signed_scaled_p (mode, offset)
8481 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
8482 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
8483 end_offset)));
8484 }
8485
8486 if (vec_flags == VEC_SVE_PRED)
8487 return offset_9bit_signed_scaled_p (mode, offset);
8488
8489 if (load_store_pair_p)
8490 return ((known_eq (GET_MODE_SIZE (mode), 4)
8491 || known_eq (GET_MODE_SIZE (mode), 8)
8492 || known_eq (GET_MODE_SIZE (mode), 16))
8493 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8494 else
8495 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
8496 || offset_12bit_unsigned_scaled_p (mode, offset));
8497 }
8498
8499 if (allow_reg_index_p)
8500 {
8501 /* Look for base + (scaled/extended) index register. */
8502 if (aarch64_base_register_rtx_p (op0, strict_p)
8503 && aarch64_classify_index (info, op1, mode, strict_p))
8504 {
8505 info->base = op0;
8506 return true;
8507 }
8508 if (aarch64_base_register_rtx_p (op1, strict_p)
8509 && aarch64_classify_index (info, op0, mode, strict_p))
8510 {
8511 info->base = op1;
8512 return true;
8513 }
8514 }
8515
8516 return false;
8517
8518 case POST_INC:
8519 case POST_DEC:
8520 case PRE_INC:
8521 case PRE_DEC:
8522 info->type = ADDRESS_REG_WB;
8523 info->base = XEXP (x, 0);
8524 info->offset = NULL_RTX;
8525 return aarch64_base_register_rtx_p (info->base, strict_p);
8526
8527 case POST_MODIFY:
8528 case PRE_MODIFY:
8529 info->type = ADDRESS_REG_WB;
8530 info->base = XEXP (x, 0);
8531 if (GET_CODE (XEXP (x, 1)) == PLUS
8532 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
8533 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
8534 && aarch64_base_register_rtx_p (info->base, strict_p))
8535 {
8536 info->offset = XEXP (XEXP (x, 1), 1);
8537 info->const_offset = offset;
8538
8539 /* TImode and TFmode values are allowed in both pairs of X
8540 registers and individual Q registers. The available
8541 address modes are:
8542 X,X: 7-bit signed scaled offset
8543 Q: 9-bit signed offset
8544 We conservatively require an offset representable in both forms.
8545 */
8546 if (mode == TImode || mode == TFmode)
8547 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
8548 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
8549
8550 if (load_store_pair_p)
8551 return ((known_eq (GET_MODE_SIZE (mode), 4)
8552 || known_eq (GET_MODE_SIZE (mode), 8)
8553 || known_eq (GET_MODE_SIZE (mode), 16))
8554 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
8555 else
8556 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
8557 }
8558 return false;
8559
8560 case CONST:
8561 case SYMBOL_REF:
8562 case LABEL_REF:
8563 /* Load literal: PC-relative constant pool entry. Only supported
8564 for SI mode or larger. */
8565 info->type = ADDRESS_SYMBOLIC;
8566
8567 if (!load_store_pair_p
8568 && GET_MODE_SIZE (mode).is_constant (&const_size)
8569 && const_size >= 4)
8570 {
8571 rtx sym, addend;
8572
8573 split_const (x, &sym, &addend);
8574 return ((GET_CODE (sym) == LABEL_REF
8575 || (GET_CODE (sym) == SYMBOL_REF
8576 && CONSTANT_POOL_ADDRESS_P (sym)
8577 && aarch64_pcrelative_literal_loads)));
8578 }
8579 return false;
8580
8581 case LO_SUM:
8582 info->type = ADDRESS_LO_SUM;
8583 info->base = XEXP (x, 0);
8584 info->offset = XEXP (x, 1);
8585 if (allow_reg_index_p
8586 && aarch64_base_register_rtx_p (info->base, strict_p))
8587 {
8588 rtx sym, offs;
8589 split_const (info->offset, &sym, &offs);
8590 if (GET_CODE (sym) == SYMBOL_REF
8591 && (aarch64_classify_symbol (sym, INTVAL (offs))
8592 == SYMBOL_SMALL_ABSOLUTE))
8593 {
8594 /* The symbol and offset must be aligned to the access size. */
8595 unsigned int align;
8596
8597 if (CONSTANT_POOL_ADDRESS_P (sym))
8598 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
8599 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
8600 {
8601 tree exp = SYMBOL_REF_DECL (sym);
8602 align = TYPE_ALIGN (TREE_TYPE (exp));
8603 align = aarch64_constant_alignment (exp, align);
8604 }
8605 else if (SYMBOL_REF_DECL (sym))
8606 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
8607 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
8608 && SYMBOL_REF_BLOCK (sym) != NULL)
8609 align = SYMBOL_REF_BLOCK (sym)->alignment;
8610 else
8611 align = BITS_PER_UNIT;
8612
8613 poly_int64 ref_size = GET_MODE_SIZE (mode);
8614 if (known_eq (ref_size, 0))
8615 ref_size = GET_MODE_SIZE (DImode);
8616
8617 return (multiple_p (INTVAL (offs), ref_size)
8618 && multiple_p (align / BITS_PER_UNIT, ref_size));
8619 }
8620 }
8621 return false;
8622
8623 default:
8624 return false;
8625 }
8626 }
8627
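/* Editorial illustration (not part of the original sources): for a DImode
   access the classifier above accepts, for example,
     (reg:DI x0)                                     -> ADDRESS_REG_IMM
     (plus:DI (reg:DI x0) (const_int 256))           -> ADDRESS_REG_IMM
     (plus:DI (reg:DI x0)
              (mult:DI (reg:DI x1) (const_int 8)))   -> ADDRESS_REG_REG, shift 3
     (post_inc:DI (reg:DI x0))                       -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "sym"))      -> ADDRESS_LO_SUM
   (the LO_SUM case assuming a suitably aligned small-model symbol).
   Register-index forms are rejected for LDP/STP queries because
   allow_reg_index_p is false there.  */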
8628 /* Return true if the address X is valid for a PRFM instruction.
8629 STRICT_P is true if we should do strict checking with
8630 aarch64_classify_address. */
8631
8632 bool
8633 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
8634 {
8635 struct aarch64_address_info addr;
8636
8637 /* PRFM accepts the same addresses as DImode... */
8638 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
8639 if (!res)
8640 return false;
8641
8642 /* ... except writeback forms. */
8643 return addr.type != ADDRESS_REG_WB;
8644 }
8645
8646 bool
8647 aarch64_symbolic_address_p (rtx x)
8648 {
8649 rtx offset;
8650
8651 split_const (x, &x, &offset);
8652 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
8653 }
8654
8655 /* Classify the base of symbolic expression X. */
8656
8657 enum aarch64_symbol_type
8658 aarch64_classify_symbolic_expression (rtx x)
8659 {
8660 rtx offset;
8661
8662 split_const (x, &x, &offset);
8663 return aarch64_classify_symbol (x, INTVAL (offset));
8664 }
8665
8666
8667 /* Return TRUE if X is a legitimate address for accessing memory in
8668 mode MODE. */
8669 static bool
8670 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
8671 {
8672 struct aarch64_address_info addr;
8673
8674 return aarch64_classify_address (&addr, x, mode, strict_p);
8675 }
8676
8677 /* Return TRUE if X is a legitimate address of type TYPE for accessing
8678 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
8679 bool
8680 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
8681 aarch64_addr_query_type type)
8682 {
8683 struct aarch64_address_info addr;
8684
8685 return aarch64_classify_address (&addr, x, mode, strict_p, type);
8686 }
8687
8688 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
8689
8690 static bool
8691 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
8692 poly_int64 orig_offset,
8693 machine_mode mode)
8694 {
8695 HOST_WIDE_INT size;
8696 if (GET_MODE_SIZE (mode).is_constant (&size))
8697 {
8698 HOST_WIDE_INT const_offset, second_offset;
8699
8700 /* A general SVE offset is A * VQ + B. Remove the A component from
8701 coefficient 0 in order to get the constant B. */
8702 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
8703
8704 /* Split an out-of-range address displacement into a base and
8705 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
8706 range otherwise, to increase opportunities for sharing the base
8707 address between accesses of different sizes. Unaligned accesses
8708 use the signed 9-bit range; TImode/TFmode use the intersection of
8709 the signed scaled 7-bit and signed 9-bit offset ranges. */
8710 if (mode == TImode || mode == TFmode)
8711 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
8712 else if ((const_offset & (size - 1)) != 0)
8713 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
8714 else
8715 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
8716
8717 if (second_offset == 0 || known_eq (orig_offset, second_offset))
8718 return false;
8719
8720 /* Split the offset into second_offset and the rest. */
8721 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8722 *offset2 = gen_int_mode (second_offset, Pmode);
8723 return true;
8724 }
8725 else
8726 {
8727 /* Get the mode we should use as the basis of the range. For structure
8728 modes this is the mode of one vector. */
8729 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8730 machine_mode step_mode
8731 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
8732
8733 /* Get the "mul vl" multiplier we'd like to use. */
8734 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
8735 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
8736 if (vec_flags & VEC_SVE_DATA)
8737 /* LDR supports a 9-bit range, but the move patterns for
8738 structure modes require all vectors to be in range of the
8739 same base. The simplest way of accommodating that while still
8740 promoting reuse of anchor points between different modes is
8741 to use an 8-bit range unconditionally. */
8742 vnum = ((vnum + 128) & 255) - 128;
8743 else
8744 /* Predicates are only handled singly, so we might as well use
8745 the full range. */
8746 vnum = ((vnum + 256) & 511) - 256;
8747 if (vnum == 0)
8748 return false;
8749
8750 /* Convert the "mul vl" multiplier into a byte offset. */
8751 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
8752 if (known_eq (second_offset, orig_offset))
8753 return false;
8754
8755 /* Split the offset into second_offset and the rest. */
8756 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
8757 *offset2 = gen_int_mode (second_offset, Pmode);
8758 return true;
8759 }
8760 }
8761
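/* Editorial worked example for the constant-size path above: for a DImode
   access with orig_offset 0x12340, the offset is a multiple of the access
   size and the access is at least 4 bytes wide, so
     second_offset = 0x12340 & 0x3ffc = 0x2340
     *offset1      = 0x10000   (folded into the base, shareable via CSE)
     *offset2      = 0x2340    (fits the unsigned scaled 12-bit LDR/STR range)
   assuming the usual LP64 Pmode.  */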
8762 /* Return the binary representation of floating-point constant VALUE in
8763 INTVAL. If the value cannot be converted, return false without setting
8764 INTVAL. The conversion is done in the mode of VALUE. */
8765 bool
8766 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
8767 {
8768
8769 /* We make a general exception for 0. */
8770 if (aarch64_float_const_zero_rtx_p (value))
8771 {
8772 *intval = 0;
8773 return true;
8774 }
8775
8776 scalar_float_mode mode;
8777 if (GET_CODE (value) != CONST_DOUBLE
8778 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
8779 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
8780 /* Only support up to DF mode. */
8781 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
8782 return false;
8783
8784 unsigned HOST_WIDE_INT ival = 0;
8785
8786 long res[2];
8787 real_to_target (res,
8788 CONST_DOUBLE_REAL_VALUE (value),
8789 REAL_MODE_FORMAT (mode));
8790
8791 if (mode == DFmode)
8792 {
8793 int order = BYTES_BIG_ENDIAN ? 1 : 0;
8794 ival = zext_hwi (res[order], 32);
8795 ival |= (zext_hwi (res[1 - order], 32) << 32);
8796 }
8797 else
8798 ival = zext_hwi (res[0], 32);
8799
8800 *intval = ival;
8801 return true;
8802 }
8803
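/* Editorial example: for the DFmode constant 1.0 the routine above returns
   the IEEE 754 bit pattern 0x3ff0000000000000, and for the SFmode constant
   1.0 it returns 0x3f800000.  The special case at the top means +0.0 always
   yields 0 without going through real_to_target.  */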
8804 /* Return TRUE if rtx X is an immediate constant that can be moved using a
8805 single MOV(+MOVK) followed by an FMOV. */
8806 bool
8807 aarch64_float_const_rtx_p (rtx x)
8808 {
8809 machine_mode mode = GET_MODE (x);
8810 if (mode == VOIDmode)
8811 return false;
8812
8813 /* Determine whether it's cheaper to write float constants as
8814 mov/movk pairs rather than ldr/adrp pairs. */
8815 unsigned HOST_WIDE_INT ival;
8816
8817 if (GET_CODE (x) == CONST_DOUBLE
8818 && SCALAR_FLOAT_MODE_P (mode)
8819 && aarch64_reinterpret_float_as_int (x, &ival))
8820 {
8821 scalar_int_mode imode = (mode == HFmode
8822 ? SImode
8823 : int_mode_for_mode (mode).require ());
8824 int num_instr = aarch64_internal_mov_immediate
8825 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8826 return num_instr < 3;
8827 }
8828
8829 return false;
8830 }
8831
8832 /* Return TRUE if rtx X is the immediate constant 0.0. */
8833 bool
8834 aarch64_float_const_zero_rtx_p (rtx x)
8835 {
8836 if (GET_MODE (x) == VOIDmode)
8837 return false;
8838
8839 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
8840 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
8841 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
8842 }
8843
8844 /* Return TRUE if rtx X is an immediate constant that fits in a single
8845 MOVI immediate operation. */
8846 bool
8847 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
8848 {
8849 if (!TARGET_SIMD)
8850 return false;
8851
8852 machine_mode vmode;
8853 scalar_int_mode imode;
8854 unsigned HOST_WIDE_INT ival;
8855
8856 if (GET_CODE (x) == CONST_DOUBLE
8857 && SCALAR_FLOAT_MODE_P (mode))
8858 {
8859 if (!aarch64_reinterpret_float_as_int (x, &ival))
8860 return false;
8861
8862 /* We make a general exception for 0. */
8863 if (aarch64_float_const_zero_rtx_p (x))
8864 return true;
8865
8866 imode = int_mode_for_mode (mode).require ();
8867 }
8868 else if (GET_CODE (x) == CONST_INT
8869 && is_a <scalar_int_mode> (mode, &imode))
8870 ival = INTVAL (x);
8871 else
8872 return false;
8873
8874 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
8875 a 128-bit vector mode. */
8876 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
8877
8878 vmode = aarch64_simd_container_mode (imode, width);
8879 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
8880
8881 return aarch64_simd_valid_immediate (v_op, NULL);
8882 }
8883
8884
8885 /* Return the fixed registers used for condition codes. */
8886
8887 static bool
8888 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
8889 {
8890 *p1 = CC_REGNUM;
8891 *p2 = INVALID_REGNUM;
8892 return true;
8893 }
8894
8895 /* This function is used by the call expanders of the machine description.
8896 RESULT is the register in which the result is returned. It's NULL for
8897 "call" and "sibcall".
8898 MEM is the location of the function call.
8899 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
8900 SIBCALL indicates whether this is a normal call or a sibling call;
8901 a different pattern is generated accordingly. */
8902
8903 void
8904 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
8905 {
8906 rtx call, callee, tmp;
8907 rtvec vec;
8908 machine_mode mode;
8909
8910 gcc_assert (MEM_P (mem));
8911 callee = XEXP (mem, 0);
8912 mode = GET_MODE (callee);
8913 gcc_assert (mode == Pmode);
8914
8915 /* Decide if we should generate indirect calls by loading the
8916 address of the callee into a register before performing
8917 the branch-and-link. */
8918 if (SYMBOL_REF_P (callee)
8919 ? (aarch64_is_long_call_p (callee)
8920 || aarch64_is_noplt_call_p (callee))
8921 : !REG_P (callee))
8922 XEXP (mem, 0) = force_reg (mode, callee);
8923
8924 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
8925
8926 if (result != NULL_RTX)
8927 call = gen_rtx_SET (result, call);
8928
8929 if (sibcall)
8930 tmp = ret_rtx;
8931 else
8932 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
8933
8934 gcc_assert (CONST_INT_P (callee_abi));
8935 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
8936 UNSPEC_CALLEE_ABI);
8937
8938 vec = gen_rtvec (3, call, callee_abi, tmp);
8939 call = gen_rtx_PARALLEL (VOIDmode, vec);
8940
8941 aarch64_emit_call_insn (call);
8942 }
8943
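/* Editorial sketch (assuming LP64) of the RTL built above for a normal call
   that returns a value in x0, with the default PCS shown as (const_int 0):
     (parallel
       [(set (reg:DI x0)
             (call (mem (symbol_ref "callee")) (const_int 0)))
        (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI)
        (clobber (reg:DI x30))])
   A sibcall uses (return) in place of the LR clobber.  */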
8944 /* Emit call insn with PAT and do aarch64-specific handling. */
8945
8946 void
8947 aarch64_emit_call_insn (rtx pat)
8948 {
8949 rtx insn = emit_call_insn (pat);
8950
8951 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
8952 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
8953 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
8954 }
8955
8956 machine_mode
8957 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
8958 {
8959 machine_mode mode_x = GET_MODE (x);
8960 rtx_code code_x = GET_CODE (x);
8961
8962 /* All floating point compares return CCFP if it is an equality
8963 comparison, and CCFPE otherwise. */
8964 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
8965 {
8966 switch (code)
8967 {
8968 case EQ:
8969 case NE:
8970 case UNORDERED:
8971 case ORDERED:
8972 case UNLT:
8973 case UNLE:
8974 case UNGT:
8975 case UNGE:
8976 case UNEQ:
8977 return CCFPmode;
8978
8979 case LT:
8980 case LE:
8981 case GT:
8982 case GE:
8983 case LTGT:
8984 return CCFPEmode;
8985
8986 default:
8987 gcc_unreachable ();
8988 }
8989 }
8990
8991 /* Equality comparisons of short modes against zero can be performed
8992 using the TST instruction with the appropriate bitmask. */
8993 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
8994 && (code == EQ || code == NE)
8995 && (mode_x == HImode || mode_x == QImode))
8996 return CC_NZmode;
8997
8998 /* Similarly, comparisons of zero_extends from shorter modes can
8999 be performed using an ANDS with an immediate mask. */
9000 if (y == const0_rtx && code_x == ZERO_EXTEND
9001 && (mode_x == SImode || mode_x == DImode)
9002 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9003 && (code == EQ || code == NE))
9004 return CC_NZmode;
9005
9006 if ((mode_x == SImode || mode_x == DImode)
9007 && y == const0_rtx
9008 && (code == EQ || code == NE || code == LT || code == GE)
9009 && (code_x == PLUS || code_x == MINUS || code_x == AND
9010 || code_x == NEG
9011 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9012 && CONST_INT_P (XEXP (x, 2)))))
9013 return CC_NZmode;
9014
9015 /* A compare with a shifted operand. Because of canonicalization,
9016 the comparison will have to be swapped when we emit the assembly
9017 code. */
9018 if ((mode_x == SImode || mode_x == DImode)
9019 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9020 && (code_x == ASHIFT || code_x == ASHIFTRT
9021 || code_x == LSHIFTRT
9022 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9023 return CC_SWPmode;
9024
9025 /* Similarly for a negated operand, but we can only do this for
9026 equalities. */
9027 if ((mode_x == SImode || mode_x == DImode)
9028 && (REG_P (y) || GET_CODE (y) == SUBREG)
9029 && (code == EQ || code == NE)
9030 && code_x == NEG)
9031 return CC_Zmode;
9032
9033 /* A test for unsigned overflow from an addition. */
9034 if ((mode_x == DImode || mode_x == TImode)
9035 && (code == LTU || code == GEU)
9036 && code_x == PLUS
9037 && rtx_equal_p (XEXP (x, 0), y))
9038 return CC_Cmode;
9039
9040 /* A test for unsigned overflow from an add with carry. */
9041 if ((mode_x == DImode || mode_x == TImode)
9042 && (code == LTU || code == GEU)
9043 && code_x == PLUS
9044 && CONST_SCALAR_INT_P (y)
9045 && (rtx_mode_t (y, mode_x)
9046 == (wi::shwi (1, mode_x)
9047 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9048 return CC_ADCmode;
9049
9050 /* A test for signed overflow. */
9051 if ((mode_x == DImode || mode_x == TImode)
9052 && code == NE
9053 && code_x == PLUS
9054 && GET_CODE (y) == SIGN_EXTEND)
9055 return CC_Vmode;
9056
9057 /* For everything else, return CCmode. */
9058 return CCmode;
9059 }
9060
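/* Editorial example: comparing (plus:SI x y) against zero for EQ/NE selects
   CC_NZmode above, which lets the comparison be folded into the arithmetic
   instruction itself, e.g.
     adds    w0, w1, w2
     b.eq    .Ltarget
   rather than a separate add followed by cmp.  */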
9061 static int
9062 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9063
9064 int
9065 aarch64_get_condition_code (rtx x)
9066 {
9067 machine_mode mode = GET_MODE (XEXP (x, 0));
9068 enum rtx_code comp_code = GET_CODE (x);
9069
9070 if (GET_MODE_CLASS (mode) != MODE_CC)
9071 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9072 return aarch64_get_condition_code_1 (mode, comp_code);
9073 }
9074
9075 static int
9076 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9077 {
9078 switch (mode)
9079 {
9080 case E_CCFPmode:
9081 case E_CCFPEmode:
9082 switch (comp_code)
9083 {
9084 case GE: return AARCH64_GE;
9085 case GT: return AARCH64_GT;
9086 case LE: return AARCH64_LS;
9087 case LT: return AARCH64_MI;
9088 case NE: return AARCH64_NE;
9089 case EQ: return AARCH64_EQ;
9090 case ORDERED: return AARCH64_VC;
9091 case UNORDERED: return AARCH64_VS;
9092 case UNLT: return AARCH64_LT;
9093 case UNLE: return AARCH64_LE;
9094 case UNGT: return AARCH64_HI;
9095 case UNGE: return AARCH64_PL;
9096 default: return -1;
9097 }
9098 break;
9099
9100 case E_CCmode:
9101 switch (comp_code)
9102 {
9103 case NE: return AARCH64_NE;
9104 case EQ: return AARCH64_EQ;
9105 case GE: return AARCH64_GE;
9106 case GT: return AARCH64_GT;
9107 case LE: return AARCH64_LE;
9108 case LT: return AARCH64_LT;
9109 case GEU: return AARCH64_CS;
9110 case GTU: return AARCH64_HI;
9111 case LEU: return AARCH64_LS;
9112 case LTU: return AARCH64_CC;
9113 default: return -1;
9114 }
9115 break;
9116
9117 case E_CC_SWPmode:
9118 switch (comp_code)
9119 {
9120 case NE: return AARCH64_NE;
9121 case EQ: return AARCH64_EQ;
9122 case GE: return AARCH64_LE;
9123 case GT: return AARCH64_LT;
9124 case LE: return AARCH64_GE;
9125 case LT: return AARCH64_GT;
9126 case GEU: return AARCH64_LS;
9127 case GTU: return AARCH64_CC;
9128 case LEU: return AARCH64_CS;
9129 case LTU: return AARCH64_HI;
9130 default: return -1;
9131 }
9132 break;
9133
9134 case E_CC_NZCmode:
9135 switch (comp_code)
9136 {
9137 case NE: return AARCH64_NE; /* = any */
9138 case EQ: return AARCH64_EQ; /* = none */
9139 case GE: return AARCH64_PL; /* = nfrst */
9140 case LT: return AARCH64_MI; /* = first */
9141 case GEU: return AARCH64_CS; /* = nlast */
9142 case GTU: return AARCH64_HI; /* = pmore */
9143 case LEU: return AARCH64_LS; /* = plast */
9144 case LTU: return AARCH64_CC; /* = last */
9145 default: return -1;
9146 }
9147 break;
9148
9149 case E_CC_NZmode:
9150 switch (comp_code)
9151 {
9152 case NE: return AARCH64_NE;
9153 case EQ: return AARCH64_EQ;
9154 case GE: return AARCH64_PL;
9155 case LT: return AARCH64_MI;
9156 default: return -1;
9157 }
9158 break;
9159
9160 case E_CC_Zmode:
9161 switch (comp_code)
9162 {
9163 case NE: return AARCH64_NE;
9164 case EQ: return AARCH64_EQ;
9165 default: return -1;
9166 }
9167 break;
9168
9169 case E_CC_Cmode:
9170 switch (comp_code)
9171 {
9172 case LTU: return AARCH64_CS;
9173 case GEU: return AARCH64_CC;
9174 default: return -1;
9175 }
9176 break;
9177
9178 case E_CC_ADCmode:
9179 switch (comp_code)
9180 {
9181 case GEU: return AARCH64_CS;
9182 case LTU: return AARCH64_CC;
9183 default: return -1;
9184 }
9185 break;
9186
9187 case E_CC_Vmode:
9188 switch (comp_code)
9189 {
9190 case NE: return AARCH64_VS;
9191 case EQ: return AARCH64_VC;
9192 default: return -1;
9193 }
9194 break;
9195
9196 default:
9197 return -1;
9198 }
9199
9200 return -1;
9201 }
9202
9203 bool
9204 aarch64_const_vec_all_same_in_range_p (rtx x,
9205 HOST_WIDE_INT minval,
9206 HOST_WIDE_INT maxval)
9207 {
9208 rtx elt;
9209 return (const_vec_duplicate_p (x, &elt)
9210 && CONST_INT_P (elt)
9211 && IN_RANGE (INTVAL (elt), minval, maxval));
9212 }
9213
9214 bool
9215 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9216 {
9217 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9218 }
9219
9220 /* Return true if VEC is a constant in which every element is in the range
9221 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9222
9223 static bool
9224 aarch64_const_vec_all_in_range_p (rtx vec,
9225 HOST_WIDE_INT minval,
9226 HOST_WIDE_INT maxval)
9227 {
9228 if (GET_CODE (vec) != CONST_VECTOR
9229 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9230 return false;
9231
9232 int nunits;
9233 if (!CONST_VECTOR_STEPPED_P (vec))
9234 nunits = const_vector_encoded_nelts (vec);
9235 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9236 return false;
9237
9238 for (int i = 0; i < nunits; i++)
9239 {
9240 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9241 if (!CONST_INT_P (vec_elem)
9242 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9243 return false;
9244 }
9245 return true;
9246 }
9247
9248 /* N Z C V. */
9249 #define AARCH64_CC_V 1
9250 #define AARCH64_CC_C (1 << 1)
9251 #define AARCH64_CC_Z (1 << 2)
9252 #define AARCH64_CC_N (1 << 3)
9253
9254 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9255 static const int aarch64_nzcv_codes[] =
9256 {
9257 0, /* EQ, Z == 1. */
9258 AARCH64_CC_Z, /* NE, Z == 0. */
9259 0, /* CS, C == 1. */
9260 AARCH64_CC_C, /* CC, C == 0. */
9261 0, /* MI, N == 1. */
9262 AARCH64_CC_N, /* PL, N == 0. */
9263 0, /* VS, V == 1. */
9264 AARCH64_CC_V, /* VC, V == 0. */
9265 0, /* HI, C == 1 && Z == 0. */
9266 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9267 AARCH64_CC_V, /* GE, N == V. */
9268 0, /* LT, N != V. */
9269 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9270 0, /* LE, !(Z == 0 && N == V). */
9271 0, /* AL, Any. */
9272 0 /* NV, Any. */
9273 };
9274
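/* Editorial note: each entry is the NZCV immediate of a conditional compare,
   i.e. the flag state to substitute when the CCMP condition is false.  For
   example, when the final test is NE the table yields AARCH64_CC_Z (4), so a
   combined test such as (w0 == w1 && w2 != w3) can be emitted as
     cmp     w0, w1
     ccmp    w2, w3, #4, eq
     b.ne    .Lboth
   where a failing first comparison forces Z to 1 and so makes the final NE
   test false.  */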
9275 /* Print floating-point vector immediate operand X to F, negating it
9276 first if NEGATE is true. Return true on success, false if it isn't
9277 a constant we can handle. */
9278
9279 static bool
9280 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9281 {
9282 rtx elt;
9283
9284 if (!const_vec_duplicate_p (x, &elt))
9285 return false;
9286
9287 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9288 if (negate)
9289 r = real_value_negate (&r);
9290
9291 /* Handle the SVE single-bit immediates specially, since they have a
9292 fixed form in the assembly syntax. */
9293 if (real_equal (&r, &dconst0))
9294 asm_fprintf (f, "0.0");
9295 else if (real_equal (&r, &dconst2))
9296 asm_fprintf (f, "2.0");
9297 else if (real_equal (&r, &dconst1))
9298 asm_fprintf (f, "1.0");
9299 else if (real_equal (&r, &dconsthalf))
9300 asm_fprintf (f, "0.5");
9301 else
9302 {
9303 const int buf_size = 20;
9304 char float_buf[buf_size] = {'\0'};
9305 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9306 1, GET_MODE (elt));
9307 asm_fprintf (f, "%s", float_buf);
9308 }
9309
9310 return true;
9311 }
9312
9313 /* Return the assembly suffix letter (b/h/s/d) for an element of SIZE bits. */
9314 static char
9315 sizetochar (int size)
9316 {
9317 switch (size)
9318 {
9319 case 64: return 'd';
9320 case 32: return 's';
9321 case 16: return 'h';
9322 case 8 : return 'b';
9323 default: gcc_unreachable ();
9324 }
9325 }
9326
9327 /* Print operand X to file F in a target specific manner according to CODE.
9328 The acceptable formatting commands given by CODE are:
9329 'c': An integer or symbol address without a preceding #
9330 sign.
9331 'C': Take the duplicated element in a vector constant
9332 and print it in hex.
9333 'D': Take the duplicated element in a vector constant
9334 and print it as an unsigned integer, in decimal.
9335 'e': Print the sign/zero-extend size as a character 8->b,
9336 16->h, 32->w. Can also be used for masks:
9337 0xff->b, 0xffff->h, 0xffffffff->w.
9338 'I': If the operand is a duplicated vector constant,
9339 replace it with the duplicated scalar. If the
9340 operand is then a floating-point constant, replace
9341 it with the integer bit representation. Print the
9342 transformed constant as a signed decimal number.
9343 'p': Prints N such that 2^N == X (X must be power of 2 and
9344 const int).
9345 'P': Print the number of non-zero bits in X (a const_int).
9346 'H': Print the higher numbered register of a pair (TImode)
9347 of regs.
9348 'm': Print a condition (eq, ne, etc).
9349 'M': Same as 'm', but invert condition.
9350 'N': Take the duplicated element in a vector constant
9351 and print the negative of it in decimal.
9352 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9353 'S/T/U/V': Print a FP/SIMD register name for a register list.
9354 The register printed is the FP/SIMD register name
9355 of X + 0/1/2/3 for S/T/U/V.
9356 'R': Print a scalar Integer/FP/SIMD register name + 1.
9357 'X': Print bottom 16 bits of integer constant in hex.
9358 'w/x': Print a general register name or the zero register
9359 (32-bit or 64-bit).
9360 '0': Print a normal operand, if it's a general register,
9361 then we assume DImode.
9362 'k': Print NZCV for conditional compare instructions.
9363 'A': Output address constant representing the first
9364 argument of X, specifying a relocation offset
9365 if appropriate.
9366 'L': Output constant address specified by X
9367 with a relocation offset if appropriate.
9368 'G': Prints address of X, specifying a PC relative
9369 relocation mode if appropriate.
9370 'y': Output address of LDP or STP - this is used for
9371 some LDP/STPs which don't use a PARALLEL in their
9372 pattern (so the mode needs to be adjusted).
9373 'z': Output address of a typical LDP or STP. */
9374
9375 static void
9376 aarch64_print_operand (FILE *f, rtx x, int code)
9377 {
9378 rtx elt;
9379 switch (code)
9380 {
9381 case 'c':
9382 switch (GET_CODE (x))
9383 {
9384 case CONST_INT:
9385 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9386 break;
9387
9388 case SYMBOL_REF:
9389 output_addr_const (f, x);
9390 break;
9391
9392 case CONST:
9393 if (GET_CODE (XEXP (x, 0)) == PLUS
9394 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
9395 {
9396 output_addr_const (f, x);
9397 break;
9398 }
9399 /* Fall through. */
9400
9401 default:
9402 output_operand_lossage ("unsupported operand for code '%c'", code);
9403 }
9404 break;
9405
9406 case 'e':
9407 {
9408 x = unwrap_const_vec_duplicate (x);
9409 if (!CONST_INT_P (x))
9410 {
9411 output_operand_lossage ("invalid operand for '%%%c'", code);
9412 return;
9413 }
9414
9415 HOST_WIDE_INT val = INTVAL (x);
9416 if ((val & ~7) == 8 || val == 0xff)
9417 fputc ('b', f);
9418 else if ((val & ~7) == 16 || val == 0xffff)
9419 fputc ('h', f);
9420 else if ((val & ~7) == 32 || val == 0xffffffff)
9421 fputc ('w', f);
9422 else
9423 {
9424 output_operand_lossage ("invalid operand for '%%%c'", code);
9425 return;
9426 }
9427 }
9428 break;
9429
9430 case 'p':
9431 {
9432 int n;
9433
9434 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
9435 {
9436 output_operand_lossage ("invalid operand for '%%%c'", code);
9437 return;
9438 }
9439
9440 asm_fprintf (f, "%d", n);
9441 }
9442 break;
9443
9444 case 'P':
9445 if (!CONST_INT_P (x))
9446 {
9447 output_operand_lossage ("invalid operand for '%%%c'", code);
9448 return;
9449 }
9450
9451 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
9452 break;
9453
9454 case 'H':
9455 if (x == const0_rtx)
9456 {
9457 asm_fprintf (f, "xzr");
9458 break;
9459 }
9460
9461 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
9462 {
9463 output_operand_lossage ("invalid operand for '%%%c'", code);
9464 return;
9465 }
9466
9467 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
9468 break;
9469
9470 case 'I':
9471 {
9472 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
9473 if (CONST_INT_P (x))
9474 asm_fprintf (f, "%wd", INTVAL (x));
9475 else
9476 {
9477 output_operand_lossage ("invalid operand for '%%%c'", code);
9478 return;
9479 }
9480 break;
9481 }
9482
9483 case 'M':
9484 case 'm':
9485 {
9486 int cond_code;
9487 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
9488 if (x == const_true_rtx)
9489 {
9490 if (code == 'M')
9491 fputs ("nv", f);
9492 return;
9493 }
9494
9495 if (!COMPARISON_P (x))
9496 {
9497 output_operand_lossage ("invalid operand for '%%%c'", code);
9498 return;
9499 }
9500
9501 cond_code = aarch64_get_condition_code (x);
9502 gcc_assert (cond_code >= 0);
9503 if (code == 'M')
9504 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
9505 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
9506 fputs (aarch64_sve_condition_codes[cond_code], f);
9507 else
9508 fputs (aarch64_condition_codes[cond_code], f);
9509 }
9510 break;
9511
9512 case 'N':
9513 if (!const_vec_duplicate_p (x, &elt))
9514 {
9515 output_operand_lossage ("invalid vector constant");
9516 return;
9517 }
9518
9519 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9520 asm_fprintf (f, "%wd", -INTVAL (elt));
9521 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9522 && aarch64_print_vector_float_operand (f, x, true))
9523 ;
9524 else
9525 {
9526 output_operand_lossage ("invalid vector constant");
9527 return;
9528 }
9529 break;
9530
9531 case 'b':
9532 case 'h':
9533 case 's':
9534 case 'd':
9535 case 'q':
9536 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9537 {
9538 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9539 return;
9540 }
9541 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
9542 break;
9543
9544 case 'S':
9545 case 'T':
9546 case 'U':
9547 case 'V':
9548 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
9549 {
9550 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
9551 return;
9552 }
9553 asm_fprintf (f, "%c%d",
9554 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
9555 REGNO (x) - V0_REGNUM + (code - 'S'));
9556 break;
9557
9558 case 'R':
9559 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
9560 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
9561 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9562 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
9563 else
9564 output_operand_lossage ("incompatible register operand for '%%%c'",
9565 code);
9566 break;
9567
9568 case 'X':
9569 if (!CONST_INT_P (x))
9570 {
9571 output_operand_lossage ("invalid operand for '%%%c'", code);
9572 return;
9573 }
9574 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
9575 break;
9576
9577 case 'C':
9578 {
9579 /* Print a replicated constant in hex. */
9580 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9581 {
9582 output_operand_lossage ("invalid operand for '%%%c'", code);
9583 return;
9584 }
9585 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9586 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9587 }
9588 break;
9589
9590 case 'D':
9591 {
9592 /* Print a replicated constant in decimal, treating it as
9593 unsigned. */
9594 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
9595 {
9596 output_operand_lossage ("invalid operand for '%%%c'", code);
9597 return;
9598 }
9599 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
9600 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
9601 }
9602 break;
9603
9604 case 'w':
9605 case 'x':
9606 if (x == const0_rtx
9607 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
9608 {
9609 asm_fprintf (f, "%czr", code);
9610 break;
9611 }
9612
9613 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
9614 {
9615 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
9616 break;
9617 }
9618
9619 if (REG_P (x) && REGNO (x) == SP_REGNUM)
9620 {
9621 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
9622 break;
9623 }
9624
9625 /* Fall through */
9626
9627 case 0:
9628 if (x == NULL)
9629 {
9630 output_operand_lossage ("missing operand");
9631 return;
9632 }
9633
9634 switch (GET_CODE (x))
9635 {
9636 case REG:
9637 if (aarch64_sve_data_mode_p (GET_MODE (x)))
9638 {
9639 if (REG_NREGS (x) == 1)
9640 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
9641 else
9642 {
9643 char suffix
9644 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
9645 asm_fprintf (f, "{z%d.%c - z%d.%c}",
9646 REGNO (x) - V0_REGNUM, suffix,
9647 END_REGNO (x) - V0_REGNUM - 1, suffix);
9648 }
9649 }
9650 else
9651 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
9652 break;
9653
9654 case MEM:
9655 output_address (GET_MODE (x), XEXP (x, 0));
9656 break;
9657
9658 case LABEL_REF:
9659 case SYMBOL_REF:
9660 output_addr_const (asm_out_file, x);
9661 break;
9662
9663 case CONST_INT:
9664 asm_fprintf (f, "%wd", INTVAL (x));
9665 break;
9666
9667 case CONST:
9668 if (!VECTOR_MODE_P (GET_MODE (x)))
9669 {
9670 output_addr_const (asm_out_file, x);
9671 break;
9672 }
9673 /* fall through */
9674
9675 case CONST_VECTOR:
9676 if (!const_vec_duplicate_p (x, &elt))
9677 {
9678 output_operand_lossage ("invalid vector constant");
9679 return;
9680 }
9681
9682 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
9683 asm_fprintf (f, "%wd", INTVAL (elt));
9684 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
9685 && aarch64_print_vector_float_operand (f, x, false))
9686 ;
9687 else
9688 {
9689 output_operand_lossage ("invalid vector constant");
9690 return;
9691 }
9692 break;
9693
9694 case CONST_DOUBLE:
9695 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
9696 be getting CONST_DOUBLEs holding integers. */
9697 gcc_assert (GET_MODE (x) != VOIDmode);
9698 if (aarch64_float_const_zero_rtx_p (x))
9699 {
9700 fputc ('0', f);
9701 break;
9702 }
9703 else if (aarch64_float_const_representable_p (x))
9704 {
9705 #define buf_size 20
9706 char float_buf[buf_size] = {'\0'};
9707 real_to_decimal_for_mode (float_buf,
9708 CONST_DOUBLE_REAL_VALUE (x),
9709 buf_size, buf_size,
9710 1, GET_MODE (x));
9711 asm_fprintf (asm_out_file, "%s", float_buf);
9712 break;
9713 #undef buf_size
9714 }
9715 output_operand_lossage ("invalid constant");
9716 return;
9717 default:
9718 output_operand_lossage ("invalid operand");
9719 return;
9720 }
9721 break;
9722
9723 case 'A':
9724 if (GET_CODE (x) == HIGH)
9725 x = XEXP (x, 0);
9726
9727 switch (aarch64_classify_symbolic_expression (x))
9728 {
9729 case SYMBOL_SMALL_GOT_4G:
9730 asm_fprintf (asm_out_file, ":got:");
9731 break;
9732
9733 case SYMBOL_SMALL_TLSGD:
9734 asm_fprintf (asm_out_file, ":tlsgd:");
9735 break;
9736
9737 case SYMBOL_SMALL_TLSDESC:
9738 asm_fprintf (asm_out_file, ":tlsdesc:");
9739 break;
9740
9741 case SYMBOL_SMALL_TLSIE:
9742 asm_fprintf (asm_out_file, ":gottprel:");
9743 break;
9744
9745 case SYMBOL_TLSLE24:
9746 asm_fprintf (asm_out_file, ":tprel:");
9747 break;
9748
9749 case SYMBOL_TINY_GOT:
9750 gcc_unreachable ();
9751 break;
9752
9753 default:
9754 break;
9755 }
9756 output_addr_const (asm_out_file, x);
9757 break;
9758
9759 case 'L':
9760 switch (aarch64_classify_symbolic_expression (x))
9761 {
9762 case SYMBOL_SMALL_GOT_4G:
9763 asm_fprintf (asm_out_file, ":lo12:");
9764 break;
9765
9766 case SYMBOL_SMALL_TLSGD:
9767 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
9768 break;
9769
9770 case SYMBOL_SMALL_TLSDESC:
9771 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
9772 break;
9773
9774 case SYMBOL_SMALL_TLSIE:
9775 asm_fprintf (asm_out_file, ":gottprel_lo12:");
9776 break;
9777
9778 case SYMBOL_TLSLE12:
9779 asm_fprintf (asm_out_file, ":tprel_lo12:");
9780 break;
9781
9782 case SYMBOL_TLSLE24:
9783 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
9784 break;
9785
9786 case SYMBOL_TINY_GOT:
9787 asm_fprintf (asm_out_file, ":got:");
9788 break;
9789
9790 case SYMBOL_TINY_TLSIE:
9791 asm_fprintf (asm_out_file, ":gottprel:");
9792 break;
9793
9794 default:
9795 break;
9796 }
9797 output_addr_const (asm_out_file, x);
9798 break;
9799
9800 case 'G':
9801 switch (aarch64_classify_symbolic_expression (x))
9802 {
9803 case SYMBOL_TLSLE24:
9804 asm_fprintf (asm_out_file, ":tprel_hi12:");
9805 break;
9806 default:
9807 break;
9808 }
9809 output_addr_const (asm_out_file, x);
9810 break;
9811
9812 case 'k':
9813 {
9814 HOST_WIDE_INT cond_code;
9815
9816 if (!CONST_INT_P (x))
9817 {
9818 output_operand_lossage ("invalid operand for '%%%c'", code);
9819 return;
9820 }
9821
9822 cond_code = INTVAL (x);
9823 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
9824 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
9825 }
9826 break;
9827
9828 case 'y':
9829 case 'z':
9830 {
9831 machine_mode mode = GET_MODE (x);
9832
9833 if (GET_CODE (x) != MEM
9834 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
9835 {
9836 output_operand_lossage ("invalid operand for '%%%c'", code);
9837 return;
9838 }
9839
9840 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
9841 code == 'y'
9842 ? ADDR_QUERY_LDP_STP_N
9843 : ADDR_QUERY_LDP_STP))
9844 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9845 }
9846 break;
9847
9848 default:
9849 output_operand_lossage ("invalid operand prefix '%%%c'", code);
9850 return;
9851 }
9852 }
9853
9854 /* Print address 'x' of a memory access with mode 'mode'.
9855 'type' is the context required by aarch64_classify_address: ADDR_QUERY_ANY
9856 for a normal memory access, or one of the ADDR_QUERY_LDP_STP types for LDP/STP. */
9857 static bool
9858 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
9859 aarch64_addr_query_type type)
9860 {
9861 struct aarch64_address_info addr;
9862 unsigned int size, vec_flags;
9863
9864 /* Check all addresses are Pmode - including ILP32. */
9865 if (GET_MODE (x) != Pmode
9866 && (!CONST_INT_P (x)
9867 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
9868 {
9869 output_operand_lossage ("invalid address mode");
9870 return false;
9871 }
9872
9873 if (aarch64_classify_address (&addr, x, mode, true, type))
9874 switch (addr.type)
9875 {
9876 case ADDRESS_REG_IMM:
9877 if (known_eq (addr.const_offset, 0))
9878 {
9879 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
9880 return true;
9881 }
9882
9883 vec_flags = aarch64_classify_vector_mode (mode);
9884 if (vec_flags & VEC_ANY_SVE)
9885 {
9886 HOST_WIDE_INT vnum
9887 = exact_div (addr.const_offset,
9888 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
9889 asm_fprintf (f, "[%s, #%wd, mul vl]",
9890 reg_names[REGNO (addr.base)], vnum);
9891 return true;
9892 }
9893
9894 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
9895 INTVAL (addr.offset));
9896 return true;
9897
9898 case ADDRESS_REG_REG:
9899 if (addr.shift == 0)
9900 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
9901 reg_names [REGNO (addr.offset)]);
9902 else
9903 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
9904 reg_names [REGNO (addr.offset)], addr.shift);
9905 return true;
9906
9907 case ADDRESS_REG_UXTW:
9908 if (addr.shift == 0)
9909 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
9910 REGNO (addr.offset) - R0_REGNUM);
9911 else
9912 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
9913 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9914 return true;
9915
9916 case ADDRESS_REG_SXTW:
9917 if (addr.shift == 0)
9918 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
9919 REGNO (addr.offset) - R0_REGNUM);
9920 else
9921 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
9922 REGNO (addr.offset) - R0_REGNUM, addr.shift);
9923 return true;
9924
9925 case ADDRESS_REG_WB:
9926 /* Writeback is only supported for fixed-width modes. */
9927 size = GET_MODE_SIZE (mode).to_constant ();
9928 switch (GET_CODE (x))
9929 {
9930 case PRE_INC:
9931 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
9932 return true;
9933 case POST_INC:
9934 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
9935 return true;
9936 case PRE_DEC:
9937 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
9938 return true;
9939 case POST_DEC:
9940 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
9941 return true;
9942 case PRE_MODIFY:
9943 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
9944 INTVAL (addr.offset));
9945 return true;
9946 case POST_MODIFY:
9947 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
9948 INTVAL (addr.offset));
9949 return true;
9950 default:
9951 break;
9952 }
9953 break;
9954
9955 case ADDRESS_LO_SUM:
9956 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
9957 output_addr_const (f, addr.offset);
9958 asm_fprintf (f, "]");
9959 return true;
9960
9961 case ADDRESS_SYMBOLIC:
9962 output_addr_const (f, x);
9963 return true;
9964 }
9965
9966 return false;
9967 }
9968
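/* Editorial examples of the address syntax emitted above, assuming a DImode
   access with base register x0 and index register x1:
     ADDRESS_REG_IMM            [x0]  or  [x0, 16]
     ADDRESS_REG_IMM (SVE)      [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1, lsl 3]
     ADDRESS_REG_SXTW           [x0, w1, sxtw 3]
     ADDRESS_REG_WB (PRE_INC)   [x0, 8]!
     ADDRESS_REG_WB (POST_INC)  [x0], 8
     ADDRESS_LO_SUM             [x0, #:lo12:sym]  */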
9969 /* Print address 'x' of a memory access with mode 'mode'. */
9970 static void
9971 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
9972 {
9973 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
9974 output_addr_const (f, x);
9975 }
9976
9977 bool
9978 aarch64_label_mentioned_p (rtx x)
9979 {
9980 const char *fmt;
9981 int i;
9982
9983 if (GET_CODE (x) == LABEL_REF)
9984 return true;
9985
9986 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
9987 referencing instruction, but they are constant offsets, not
9988 symbols. */
9989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9990 return false;
9991
9992 fmt = GET_RTX_FORMAT (GET_CODE (x));
9993 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
9994 {
9995 if (fmt[i] == 'E')
9996 {
9997 int j;
9998
9999 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10000 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10001 return 1;
10002 }
10003 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10004 return 1;
10005 }
10006
10007 return 0;
10008 }
10009
10010 /* Implement REGNO_REG_CLASS. */
10011
10012 enum reg_class
10013 aarch64_regno_regclass (unsigned regno)
10014 {
10015 if (GP_REGNUM_P (regno))
10016 return GENERAL_REGS;
10017
10018 if (regno == SP_REGNUM)
10019 return STACK_REG;
10020
10021 if (regno == FRAME_POINTER_REGNUM
10022 || regno == ARG_POINTER_REGNUM)
10023 return POINTER_REGS;
10024
10025 if (FP_REGNUM_P (regno))
10026 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10027 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10028
10029 if (PR_REGNUM_P (regno))
10030 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10031
10032 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10033 return FFR_REGS;
10034
10035 return NO_REGS;
10036 }
10037
10038 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10039 If OFFSET is out of range, return an offset of an anchor point
10040 that is in range. Return 0 otherwise. */
10041
10042 static HOST_WIDE_INT
10043 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10044 machine_mode mode)
10045 {
10046 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10047 if (size > 16)
10048 return (offset + 0x400) & ~0x7f0;
10049
10050 /* For offsets that aren't a multiple of the access size, the limit is
10051 -256...255. */
10052 if (offset & (size - 1))
10053 {
10054 /* BLKmode typically uses LDP of X-registers. */
10055 if (mode == BLKmode)
10056 return (offset + 512) & ~0x3ff;
10057 return (offset + 0x100) & ~0x1ff;
10058 }
10059
10060 /* Small negative offsets are supported. */
10061 if (IN_RANGE (offset, -256, 0))
10062 return 0;
10063
10064 if (mode == TImode || mode == TFmode)
10065 return (offset + 0x100) & ~0x1ff;
10066
10067 /* Use the unsigned 12-bit offset scaled by the access size. */
10068 return offset & (~0xfff * size);
10069 }
10070
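/* Editorial worked example: for a DImode access (size 8) at offset 0x12340,
   the offset is a multiple of the access size but outside the scaled 12-bit
   range, so the function returns
     0x12340 & (~0xfff * 8) = 0x10000
   as the anchor; the residual offset 0x2340 (= 1128 * 8) then fits a plain
   "ldr xN, [base, 0x2340]".  */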
10071 static rtx
10072 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10073 {
10074 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10075 where mask is selected by alignment and size of the offset.
10076 We try to pick as large a range for the offset as possible to
10077 maximize the chance of a CSE. However, for aligned addresses
10078 we limit the range to 4k so that structures with different sized
10079 elements are likely to use the same base. We need to be careful
10080 not to split a CONST for some forms of address expression, otherwise
10081 it will generate sub-optimal code. */
10082
10083 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10084 {
10085 rtx base = XEXP (x, 0);
10086 rtx offset_rtx = XEXP (x, 1);
10087 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10088
10089 if (GET_CODE (base) == PLUS)
10090 {
10091 rtx op0 = XEXP (base, 0);
10092 rtx op1 = XEXP (base, 1);
10093
10094 /* Force any scaling into a temp for CSE. */
10095 op0 = force_reg (Pmode, op0);
10096 op1 = force_reg (Pmode, op1);
10097
10098 /* Let the pointer register be in op0. */
10099 if (REG_POINTER (op1))
10100 std::swap (op0, op1);
10101
10102 /* If the pointer is virtual or frame related, then we know that
10103 virtual register instantiation or register elimination is going
10104 to apply a second constant. We want the two constants folded
10105 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10106 if (virt_or_elim_regno_p (REGNO (op0)))
10107 {
10108 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10109 NULL_RTX, true, OPTAB_DIRECT);
10110 return gen_rtx_PLUS (Pmode, base, op1);
10111 }
10112
10113 /* Otherwise, in order to encourage CSE (and thence loop strength
10114 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10115 base = expand_binop (Pmode, add_optab, op0, op1,
10116 NULL_RTX, true, OPTAB_DIRECT);
10117 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10118 }
10119
10120 HOST_WIDE_INT size;
10121 if (GET_MODE_SIZE (mode).is_constant (&size))
10122 {
10123 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10124 mode);
10125 if (base_offset != 0)
10126 {
10127 base = plus_constant (Pmode, base, base_offset);
10128 base = force_operand (base, NULL_RTX);
10129 return plus_constant (Pmode, base, offset - base_offset);
10130 }
10131 }
10132 }
10133
10134 return x;
10135 }
10136
10137 static reg_class_t
10138 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10139 reg_class_t rclass,
10140 machine_mode mode,
10141 secondary_reload_info *sri)
10142 {
10143 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10144 LDR and STR. See the comment at the head of aarch64-sve.md for
10145 more details about the big-endian handling. */
10146 if (reg_class_subset_p (rclass, FP_REGS)
10147 && !((REG_P (x) && HARD_REGISTER_P (x))
10148 || aarch64_simd_valid_immediate (x, NULL))
10149 && mode != VNx16QImode)
10150 {
10151 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10152 if ((vec_flags & VEC_SVE_DATA)
10153 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10154 {
10155 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10156 return NO_REGS;
10157 }
10158 }
10159
10160 /* If we have to disable direct literal pool loads and stores because the
10161 function is too big, then we need a scratch register. */
10162 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10163 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10164 || targetm.vector_mode_supported_p (GET_MODE (x)))
10165 && !aarch64_pcrelative_literal_loads)
10166 {
10167 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10168 return NO_REGS;
10169 }
10170
10171 /* Without the TARGET_SIMD instructions we cannot move a Q register
10172 to a Q register directly. We need a scratch. */
10173 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10174 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10175 && reg_class_subset_p (rclass, FP_REGS))
10176 {
10177 sri->icode = code_for_aarch64_reload_mov (mode);
10178 return NO_REGS;
10179 }
10180
10181 /* A TFmode or TImode memory access should be handled via FP_REGS,
10182 because AArch64 has richer addressing modes for LDR/STR instructions
10183 than for LDP/STP instructions. */
10184 if (TARGET_FLOAT && rclass == GENERAL_REGS
10185 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10186 return FP_REGS;
10187
10188 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10189 return GENERAL_REGS;
10190
10191 return NO_REGS;
10192 }
10193
10194 static bool
10195 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10196 {
10197 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10198
10199 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10200 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10201 if (frame_pointer_needed)
10202 return to == HARD_FRAME_POINTER_REGNUM;
10203 return true;
10204 }
10205
10206 poly_int64
10207 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10208 {
10209 if (to == HARD_FRAME_POINTER_REGNUM)
10210 {
10211 if (from == ARG_POINTER_REGNUM)
10212 return cfun->machine->frame.hard_fp_offset;
10213
10214 if (from == FRAME_POINTER_REGNUM)
10215 return cfun->machine->frame.hard_fp_offset
10216 - cfun->machine->frame.locals_offset;
10217 }
10218
10219 if (to == STACK_POINTER_REGNUM)
10220 {
10221 if (from == FRAME_POINTER_REGNUM)
10222 return cfun->machine->frame.frame_size
10223 - cfun->machine->frame.locals_offset;
10224 }
10225
10226 return cfun->machine->frame.frame_size;
10227 }
10228
10229 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10230 previous frame. */
10231
10232 rtx
10233 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10234 {
10235 if (count != 0)
10236 return const0_rtx;
10237 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
10238 }
10239
10240
10241 static void
10242 aarch64_asm_trampoline_template (FILE *f)
10243 {
10244 int offset1 = 16;
10245 int offset2 = 20;
10246
10247 if (aarch64_bti_enabled ())
10248 {
10249 asm_fprintf (f, "\thint\t34 // bti c\n");
10250 offset1 -= 4;
10251 offset2 -= 4;
10252 }
10253
10254 if (TARGET_ILP32)
10255 {
10256 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
10257 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
10258 offset1);
10259 }
10260 else
10261 {
10262 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
10263 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
10264 offset2);
10265 }
10266 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10267
10268 /* The trampoline needs an extra padding instruction. If BTI is enabled,
10269 the padding instruction is replaced by the BTI instruction at
10270 the beginning. */
10271 if (!aarch64_bti_enabled ())
10272 assemble_aligned_integer (4, const0_rtx);
10273
10274 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10275 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10276 }
10277
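/* Editorial sketch of the non-ILP32, non-BTI trampoline emitted above:
   16 bytes of code followed by two 8-byte data slots that
   aarch64_trampoline_init fills in:
      0:  ldr   x17, .+16     // load target address from offset 16
      4:  ldr   x18, .+20     // load static chain from offset 24
      8:  br    x17
     12:  .word  0            // padding
     16:  .xword 0            // replaced by the function address
     24:  .xword 0            // replaced by the static chain value
   With BTI enabled, "hint 34" (bti c) replaces the padding at the start
   and both literal offsets shrink by 4.  */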
10278 static void
10279 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10280 {
10281 rtx fnaddr, mem, a_tramp;
10282 const int tramp_code_sz = 16;
10283
10284 /* Don't need to copy the trailing D-words, we fill those in below. */
10285 emit_block_move (m_tramp, assemble_trampoline_template (),
10286 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10287 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10288 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10289 if (GET_MODE (fnaddr) != ptr_mode)
10290 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10291 emit_move_insn (mem, fnaddr);
10292
10293 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10294 emit_move_insn (mem, chain_value);
10295
10296 /* XXX We should really define a "clear_cache" pattern and use
10297 gen_clear_cache(). */
10298 a_tramp = XEXP (m_tramp, 0);
10299 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10300 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10301 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10302 ptr_mode);
10303 }
10304
10305 static unsigned char
10306 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10307 {
10308 /* ??? Logically we should only need to provide a value when
10309 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10310 can hold MODE, but at the moment we need to handle all modes.
10311 Just ignore any runtime parts for registers that can't store them. */
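  /* For example, a 16-byte Advanced SIMD mode such as V4SImode needs a
     single FP/SIMD register (CEIL (16, UNITS_PER_VREG) == 1), whereas
     TImode held in GENERAL_REGS needs two X registers
     (CEIL (16, UNITS_PER_WORD) == 2).  */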
10312 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10313 unsigned int nregs, vec_flags;
10314 switch (regclass)
10315 {
10316 case TAILCALL_ADDR_REGS:
10317 case POINTER_REGS:
10318 case GENERAL_REGS:
10319 case ALL_REGS:
10320 case POINTER_AND_FP_REGS:
10321 case FP_REGS:
10322 case FP_LO_REGS:
10323 case FP_LO8_REGS:
10324 vec_flags = aarch64_classify_vector_mode (mode);
10325 if ((vec_flags & VEC_SVE_DATA)
10326 && constant_multiple_p (GET_MODE_SIZE (mode),
10327 aarch64_vl_bytes (mode, vec_flags), &nregs))
10328 return nregs;
10329 return (vec_flags & VEC_ADVSIMD
10330 ? CEIL (lowest_size, UNITS_PER_VREG)
10331 : CEIL (lowest_size, UNITS_PER_WORD));
10332 case STACK_REG:
10333 case PR_REGS:
10334 case PR_LO_REGS:
10335 case PR_HI_REGS:
10336 case FFR_REGS:
10337 case PR_AND_FFR_REGS:
10338 return 1;
10339
10340 case NO_REGS:
10341 return 0;
10342
10343 default:
10344 break;
10345 }
10346 gcc_unreachable ();
10347 }
10348
10349 static reg_class_t
10350 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10351 {
10352 if (regclass == POINTER_REGS)
10353 return GENERAL_REGS;
10354
10355 if (regclass == STACK_REG)
10356 {
10357 if (REG_P(x)
10358 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10359 return regclass;
10360
10361 return NO_REGS;
10362 }
10363
10364   /* Register elimination can result in a request for
10365      SP+constant->FP_REGS.  We cannot support such operations, which
10366      use SP as the source and an FP_REG as the destination, so reject
10367      them outright.  */
10368 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
10369 {
10370 rtx lhs = XEXP (x, 0);
10371
10372 /* Look through a possible SUBREG introduced by ILP32. */
10373 if (GET_CODE (lhs) == SUBREG)
10374 lhs = SUBREG_REG (lhs);
10375
10376 gcc_assert (REG_P (lhs));
10377 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
10378 POINTER_REGS));
10379 return NO_REGS;
10380 }
10381
10382 return regclass;
10383 }
10384
10385 void
10386 aarch64_asm_output_labelref (FILE* f, const char *name)
10387 {
10388 asm_fprintf (f, "%U%s", name);
10389 }
10390
10391 static void
10392 aarch64_elf_asm_constructor (rtx symbol, int priority)
10393 {
10394 if (priority == DEFAULT_INIT_PRIORITY)
10395 default_ctor_section_asm_out_constructor (symbol, priority);
10396 else
10397 {
10398 section *s;
10399       /* Although priority is known to be in the range [0, 65535], and so
10400          18 bytes would be enough, the compiler might not know that.  To
10401          avoid a -Wformat-truncation false positive, use a larger size.  */
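      /* E.g. a priority of 512 gives the section name ".init_array.00512".  */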
10402 char buf[23];
10403 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
10404 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10405 switch_to_section (s);
10406 assemble_align (POINTER_SIZE);
10407 assemble_aligned_integer (POINTER_BYTES, symbol);
10408 }
10409 }
10410
10411 static void
10412 aarch64_elf_asm_destructor (rtx symbol, int priority)
10413 {
10414 if (priority == DEFAULT_INIT_PRIORITY)
10415 default_dtor_section_asm_out_destructor (symbol, priority);
10416 else
10417 {
10418 section *s;
10419       /* Although priority is known to be in the range [0, 65535], and so
10420          18 bytes would be enough, the compiler might not know that.  To
10421          avoid a -Wformat-truncation false positive, use a larger size.  */
10422 char buf[23];
10423 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
10424 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
10425 switch_to_section (s);
10426 assemble_align (POINTER_SIZE);
10427 assemble_aligned_integer (POINTER_BYTES, symbol);
10428 }
10429 }
10430
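/* Output the assembly for a compact switch-case dispatch.  Roughly:
   operand 0 (indexed by operand 1) addresses the jump table whose code
   label is operand 2, and operands 3 and 4 are scratch registers.  We
   load the (byte, halfword or word) table entry, form the target as the
   table address plus the sign-extended entry scaled by four, and branch
   to it.  */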
10431 const char*
10432 aarch64_output_casesi (rtx *operands)
10433 {
10434 char buf[100];
10435 char label[100];
10436 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
10437 int index;
10438 static const char *const patterns[4][2] =
10439 {
10440 {
10441 "ldrb\t%w3, [%0,%w1,uxtw]",
10442 "add\t%3, %4, %w3, sxtb #2"
10443 },
10444 {
10445 "ldrh\t%w3, [%0,%w1,uxtw #1]",
10446 "add\t%3, %4, %w3, sxth #2"
10447 },
10448 {
10449 "ldr\t%w3, [%0,%w1,uxtw #2]",
10450 "add\t%3, %4, %w3, sxtw #2"
10451 },
10452 /* We assume that DImode is only generated when not optimizing and
10453 that we don't really need 64-bit address offsets. That would
10454 imply an object file with 8GB of code in a single function! */
10455 {
10456 "ldr\t%w3, [%0,%w1,uxtw #2]",
10457 "add\t%3, %4, %w3, sxtw #2"
10458 }
10459 };
10460
10461 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
10462
10463 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
10464 index = exact_log2 (GET_MODE_SIZE (mode));
10465
10466 gcc_assert (index >= 0 && index <= 3);
10467
10468   /* Need to implement table size reduction, by changing the code below.  */
10469 output_asm_insn (patterns[index][0], operands);
10470 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
10471 snprintf (buf, sizeof (buf),
10472 "adr\t%%4, %s", targetm.strip_name_encoding (label));
10473 output_asm_insn (buf, operands);
10474 output_asm_insn (patterns[index][1], operands);
10475 output_asm_insn ("br\t%3", operands);
10476 assemble_label (asm_out_file, label);
10477 return "";
10478 }
10479
10480
10481 /* Return size in bits of an arithmetic operand which is shifted/scaled and
10482 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
10483 operator. */
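/* For example, aarch64_uxt_size (1, 0x1fe) returns 8, since 0x1fe is
   0xff shifted left by one (a UXTB-style operand); a mask/shift pair
   matching no 8-, 16- or 32-bit pattern returns 0.  */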
10484
10485 int
10486 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
10487 {
10488 if (shift >= 0 && shift <= 3)
10489 {
10490 int size;
10491 for (size = 8; size <= 32; size *= 2)
10492 {
10493 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
10494 if (mask == bits << shift)
10495 return size;
10496 }
10497 }
10498 return 0;
10499 }
10500
10501 /* Constant pools are per-function only when PC-relative literal
10502    loads are enabled or we are using the large memory
10503    model.  */
10504
10505 static inline bool
10506 aarch64_can_use_per_function_literal_pools_p (void)
10507 {
10508 return (aarch64_pcrelative_literal_loads
10509 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
10510 }
10511
10512 static bool
10513 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
10514 {
10515 /* We can't use blocks for constants when we're using a per-function
10516 constant pool. */
10517 return !aarch64_can_use_per_function_literal_pools_p ();
10518 }
10519
10520 /* Select appropriate section for constants depending
10521 on where we place literal pools. */
10522
10523 static section *
10524 aarch64_select_rtx_section (machine_mode mode,
10525 rtx x,
10526 unsigned HOST_WIDE_INT align)
10527 {
10528 if (aarch64_can_use_per_function_literal_pools_p ())
10529 return function_section (current_function_decl);
10530
10531 return default_elf_select_rtx_section (mode, x, align);
10532 }
10533
10534 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
10535 void
10536 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
10537 HOST_WIDE_INT offset)
10538 {
10539   /* When using per-function literal pools, we must ensure that any code
10540      section is aligned to the minimal instruction length, lest we get
10541      errors from the assembler about "unaligned instructions".  */
10542 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
10543 ASM_OUTPUT_ALIGN (f, 2);
10544 }
10545
10546 /* Costs. */
10547
10548 /* Helper function for rtx cost calculation. Strip a shift expression
10549 from X. Returns the inner operand if successful, or the original
10550 expression on failure. */
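/* For example, both (ashift:DI (reg:DI x) (const_int 3)) and
   (mult:DI (reg:DI x) (const_int 8)) strip down to (reg:DI x), whereas a
   shift by a register amount is left unchanged.  */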
10551 static rtx
10552 aarch64_strip_shift (rtx x)
10553 {
10554 rtx op = x;
10555
10556 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
10557 we can convert both to ROR during final output. */
10558 if ((GET_CODE (op) == ASHIFT
10559 || GET_CODE (op) == ASHIFTRT
10560 || GET_CODE (op) == LSHIFTRT
10561 || GET_CODE (op) == ROTATERT
10562 || GET_CODE (op) == ROTATE)
10563 && CONST_INT_P (XEXP (op, 1)))
10564 return XEXP (op, 0);
10565
10566 if (GET_CODE (op) == MULT
10567 && CONST_INT_P (XEXP (op, 1))
10568 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
10569 return XEXP (op, 0);
10570
10571 return x;
10572 }
10573
10574 /* Helper function for rtx cost calculation. Strip an extend
10575 expression from X. Returns the inner operand if successful, or the
10576 original expression on failure. We deal with a number of possible
10577 canonicalization variations here. If STRIP_SHIFT is true, then
10578 we can strip off a shift also. */
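/* For example, (ashift:DI (zero_extend:DI (reg:SI x)) (const_int 2))
   strips down to (reg:SI x) when STRIP_SHIFT is true, since the shift
   and the extension can both be folded into an extended-register
   operand.  */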
10579 static rtx
10580 aarch64_strip_extend (rtx x, bool strip_shift)
10581 {
10582 scalar_int_mode mode;
10583 rtx op = x;
10584
10585 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
10586 return op;
10587
10588 /* Zero and sign extraction of a widened value. */
10589 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
10590 && XEXP (op, 2) == const0_rtx
10591 && GET_CODE (XEXP (op, 0)) == MULT
10592 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
10593 XEXP (op, 1)))
10594 return XEXP (XEXP (op, 0), 0);
10595
10596 /* It can also be represented (for zero-extend) as an AND with an
10597 immediate. */
10598 if (GET_CODE (op) == AND
10599 && GET_CODE (XEXP (op, 0)) == MULT
10600 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
10601 && CONST_INT_P (XEXP (op, 1))
10602 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
10603 INTVAL (XEXP (op, 1))) != 0)
10604 return XEXP (XEXP (op, 0), 0);
10605
10606 /* Now handle extended register, as this may also have an optional
10607 left shift by 1..4. */
10608 if (strip_shift
10609 && GET_CODE (op) == ASHIFT
10610 && CONST_INT_P (XEXP (op, 1))
10611 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
10612 op = XEXP (op, 0);
10613
10614 if (GET_CODE (op) == ZERO_EXTEND
10615 || GET_CODE (op) == SIGN_EXTEND)
10616 op = XEXP (op, 0);
10617
10618 if (op != x)
10619 return op;
10620
10621 return x;
10622 }
10623
10624 /* Return true iff CODE is a shift supported in combination
10625 with arithmetic instructions. */
10626
10627 static bool
10628 aarch64_shift_p (enum rtx_code code)
10629 {
10630 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
10631 }
10632
10633
10634 /* Return true iff X is a cheap shift without a sign extend. */
10635
10636 static bool
10637 aarch64_cheap_mult_shift_p (rtx x)
10638 {
10639 rtx op0, op1;
10640
10641 op0 = XEXP (x, 0);
10642 op1 = XEXP (x, 1);
10643
10644 if (!(aarch64_tune_params.extra_tuning_flags
10645 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
10646 return false;
10647
10648 if (GET_CODE (op0) == SIGN_EXTEND)
10649 return false;
10650
10651 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
10652 && UINTVAL (op1) <= 4)
10653 return true;
10654
10655 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
10656 return false;
10657
10658 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
10659
10660 if (l2 > 0 && l2 <= 4)
10661 return true;
10662
10663 return false;
10664 }
10665
10666 /* Helper function for rtx cost calculation. Calculate the cost of
10667 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
10668    Return the calculated cost of the expression, recursing manually into
10669 operands where needed. */
10670
10671 static int
10672 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
10673 {
10674 rtx op0, op1;
10675 const struct cpu_cost_table *extra_cost
10676 = aarch64_tune_params.insn_extra_cost;
10677 int cost = 0;
10678 bool compound_p = (outer == PLUS || outer == MINUS);
10679 machine_mode mode = GET_MODE (x);
10680
10681 gcc_checking_assert (code == MULT);
10682
10683 op0 = XEXP (x, 0);
10684 op1 = XEXP (x, 1);
10685
10686 if (VECTOR_MODE_P (mode))
10687 mode = GET_MODE_INNER (mode);
10688
10689 /* Integer multiply/fma. */
10690 if (GET_MODE_CLASS (mode) == MODE_INT)
10691 {
10692       /* The multiply will be canonicalized as a shift; cost it as such.  */
10693 if (aarch64_shift_p (GET_CODE (x))
10694 || (CONST_INT_P (op1)
10695 && exact_log2 (INTVAL (op1)) > 0))
10696 {
10697 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
10698 || GET_CODE (op0) == SIGN_EXTEND;
10699 if (speed)
10700 {
10701 if (compound_p)
10702 {
10703 /* If the shift is considered cheap,
10704 then don't add any cost. */
10705 if (aarch64_cheap_mult_shift_p (x))
10706 ;
10707 else if (REG_P (op1))
10708 /* ARITH + shift-by-register. */
10709 cost += extra_cost->alu.arith_shift_reg;
10710 else if (is_extend)
10711 /* ARITH + extended register. We don't have a cost field
10712 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
10713 cost += extra_cost->alu.extend_arith;
10714 else
10715 /* ARITH + shift-by-immediate. */
10716 cost += extra_cost->alu.arith_shift;
10717 }
10718 else
10719 /* LSL (immediate). */
10720 cost += extra_cost->alu.shift;
10721
10722 }
10723 /* Strip extends as we will have costed them in the case above. */
10724 if (is_extend)
10725 op0 = aarch64_strip_extend (op0, true);
10726
10727 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
10728
10729 return cost;
10730 }
10731
10732         /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's
10733            a compound operation, then let the cases below handle it.  After
10734            all, MNEG is a special-case alias of MSUB.  */
10735 if (GET_CODE (op0) == NEG)
10736 {
10737 op0 = XEXP (op0, 0);
10738 compound_p = true;
10739 }
10740
10741 /* Integer multiplies or FMAs have zero/sign extending variants. */
10742 if ((GET_CODE (op0) == ZERO_EXTEND
10743 && GET_CODE (op1) == ZERO_EXTEND)
10744 || (GET_CODE (op0) == SIGN_EXTEND
10745 && GET_CODE (op1) == SIGN_EXTEND))
10746 {
10747 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
10748 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
10749
10750 if (speed)
10751 {
10752 if (compound_p)
10753 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
10754 cost += extra_cost->mult[0].extend_add;
10755 else
10756 /* MUL/SMULL/UMULL. */
10757 cost += extra_cost->mult[0].extend;
10758 }
10759
10760 return cost;
10761 }
10762
10763 /* This is either an integer multiply or a MADD. In both cases
10764 we want to recurse and cost the operands. */
10765 cost += rtx_cost (op0, mode, MULT, 0, speed);
10766 cost += rtx_cost (op1, mode, MULT, 1, speed);
10767
10768 if (speed)
10769 {
10770 if (compound_p)
10771 /* MADD/MSUB. */
10772 cost += extra_cost->mult[mode == DImode].add;
10773 else
10774 /* MUL. */
10775 cost += extra_cost->mult[mode == DImode].simple;
10776 }
10777
10778 return cost;
10779 }
10780 else
10781 {
10782 if (speed)
10783 {
10784 /* Floating-point FMA/FMUL can also support negations of the
10785 operands, unless the rounding mode is upward or downward in
10786 which case FNMUL is different than FMUL with operand negation. */
10787 bool neg0 = GET_CODE (op0) == NEG;
10788 bool neg1 = GET_CODE (op1) == NEG;
10789 if (compound_p || !flag_rounding_math || (neg0 && neg1))
10790 {
10791 if (neg0)
10792 op0 = XEXP (op0, 0);
10793 if (neg1)
10794 op1 = XEXP (op1, 0);
10795 }
10796
10797 if (compound_p)
10798 /* FMADD/FNMADD/FNMSUB/FMSUB. */
10799 cost += extra_cost->fp[mode == DFmode].fma;
10800 else
10801 /* FMUL/FNMUL. */
10802 cost += extra_cost->fp[mode == DFmode].mult;
10803 }
10804
10805 cost += rtx_cost (op0, mode, MULT, 0, speed);
10806 cost += rtx_cost (op1, mode, MULT, 1, speed);
10807 return cost;
10808 }
10809 }
10810
10811 static int
10812 aarch64_address_cost (rtx x,
10813 machine_mode mode,
10814 addr_space_t as ATTRIBUTE_UNUSED,
10815 bool speed)
10816 {
10817 enum rtx_code c = GET_CODE (x);
10818 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
10819 struct aarch64_address_info info;
10820 int cost = 0;
10821 info.shift = 0;
10822
10823 if (!aarch64_classify_address (&info, x, mode, false))
10824 {
10825 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
10826 {
10827 /* This is a CONST or SYMBOL ref which will be split
10828 in a different way depending on the code model in use.
10829 Cost it through the generic infrastructure. */
10830 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
10831 /* Divide through by the cost of one instruction to
10832 bring it to the same units as the address costs. */
10833 cost_symbol_ref /= COSTS_N_INSNS (1);
10834 /* The cost is then the cost of preparing the address,
10835 followed by an immediate (possibly 0) offset. */
10836 return cost_symbol_ref + addr_cost->imm_offset;
10837 }
10838 else
10839 {
10840 /* This is most likely a jump table from a case
10841 statement. */
10842 return addr_cost->register_offset;
10843 }
10844 }
10845
10846 switch (info.type)
10847 {
10848 case ADDRESS_LO_SUM:
10849 case ADDRESS_SYMBOLIC:
10850 case ADDRESS_REG_IMM:
10851 cost += addr_cost->imm_offset;
10852 break;
10853
10854 case ADDRESS_REG_WB:
10855 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
10856 cost += addr_cost->pre_modify;
10857 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
10858 cost += addr_cost->post_modify;
10859 else
10860 gcc_unreachable ();
10861
10862 break;
10863
10864 case ADDRESS_REG_REG:
10865 cost += addr_cost->register_offset;
10866 break;
10867
10868 case ADDRESS_REG_SXTW:
10869 cost += addr_cost->register_sextend;
10870 break;
10871
10872 case ADDRESS_REG_UXTW:
10873 cost += addr_cost->register_zextend;
10874 break;
10875
10876 default:
10877 gcc_unreachable ();
10878 }
10879
10880
10881 if (info.shift > 0)
10882 {
10883 /* For the sake of calculating the cost of the shifted register
10884 component, we can treat same sized modes in the same way. */
10885 if (known_eq (GET_MODE_BITSIZE (mode), 16))
10886 cost += addr_cost->addr_scale_costs.hi;
10887 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
10888 cost += addr_cost->addr_scale_costs.si;
10889 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
10890 cost += addr_cost->addr_scale_costs.di;
10891 else
10892 /* We can't tell, or this is a 128-bit vector. */
10893 cost += addr_cost->addr_scale_costs.ti;
10894 }
10895
10896 return cost;
10897 }
10898
10899 /* Return the cost of a branch. If SPEED_P is true then the compiler is
10900 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
10901 to be taken. */
10902
10903 int
10904 aarch64_branch_cost (bool speed_p, bool predictable_p)
10905 {
10906 /* When optimizing for speed, use the cost of unpredictable branches. */
10907 const struct cpu_branch_cost *branch_costs =
10908 aarch64_tune_params.branch_costs;
10909
10910 if (!speed_p || predictable_p)
10911 return branch_costs->predictable;
10912 else
10913 return branch_costs->unpredictable;
10914 }
10915
10916 /* Return true if the RTX X in mode MODE is a zero or sign extract
10917 usable in an ADD or SUB (extended register) instruction. */
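/* For example, (sign_extend:DI (reg:SI w1)) matches the simple case below
   and corresponds to the "sxtw" operand of an instruction such as
   "add x0, x2, w1, sxtw".  */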
10918 static bool
10919 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
10920 {
10921 /* Catch add with a sign extract.
10922 This is add_<optab><mode>_multp2. */
10923 if (GET_CODE (x) == SIGN_EXTRACT
10924 || GET_CODE (x) == ZERO_EXTRACT)
10925 {
10926 rtx op0 = XEXP (x, 0);
10927 rtx op1 = XEXP (x, 1);
10928 rtx op2 = XEXP (x, 2);
10929
10930 if (GET_CODE (op0) == MULT
10931 && CONST_INT_P (op1)
10932 && op2 == const0_rtx
10933 && CONST_INT_P (XEXP (op0, 1))
10934 && aarch64_is_extend_from_extract (mode,
10935 XEXP (op0, 1),
10936 op1))
10937 {
10938 return true;
10939 }
10940 }
10941 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
10942 No shift. */
10943 else if (GET_CODE (x) == SIGN_EXTEND
10944 || GET_CODE (x) == ZERO_EXTEND)
10945 return REG_P (XEXP (x, 0));
10946
10947 return false;
10948 }
10949
10950 static bool
10951 aarch64_frint_unspec_p (unsigned int u)
10952 {
10953 switch (u)
10954 {
10955 case UNSPEC_FRINTZ:
10956 case UNSPEC_FRINTP:
10957 case UNSPEC_FRINTM:
10958 case UNSPEC_FRINTA:
10959 case UNSPEC_FRINTN:
10960 case UNSPEC_FRINTX:
10961 case UNSPEC_FRINTI:
10962 return true;
10963
10964 default:
10965 return false;
10966 }
10967 }
10968
10969 /* Return true iff X is an rtx that will match an extr instruction
10970 i.e. as described in the *extr<mode>5_insn family of patterns.
10971 OP0 and OP1 will be set to the operands of the shifts involved
10972 on success and will be NULL_RTX otherwise. */
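/* For example, in DImode
     (ior (ashift X (const_int 48)) (lshiftrt Y (const_int 16)))
   matches, because the two shift amounts sum to 64; it corresponds
   roughly to an EXTR with a shift of #16.  */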
10973
10974 static bool
10975 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
10976 {
10977 rtx op0, op1;
10978 scalar_int_mode mode;
10979 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
10980 return false;
10981
10982 *res_op0 = NULL_RTX;
10983 *res_op1 = NULL_RTX;
10984
10985 if (GET_CODE (x) != IOR)
10986 return false;
10987
10988 op0 = XEXP (x, 0);
10989 op1 = XEXP (x, 1);
10990
10991 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
10992 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
10993 {
10994 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
10995 if (GET_CODE (op1) == ASHIFT)
10996 std::swap (op0, op1);
10997
10998 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
10999 return false;
11000
11001 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11002 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11003
11004 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11005 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11006 {
11007 *res_op0 = XEXP (op0, 0);
11008 *res_op1 = XEXP (op1, 0);
11009 return true;
11010 }
11011 }
11012
11013 return false;
11014 }
11015
11016 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11017 storing it in *COST. Result is true if the total cost of the operation
11018 has now been calculated. */
11019 static bool
11020 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11021 {
11022 rtx inner;
11023 rtx comparator;
11024 enum rtx_code cmpcode;
11025
11026 if (COMPARISON_P (op0))
11027 {
11028 inner = XEXP (op0, 0);
11029 comparator = XEXP (op0, 1);
11030 cmpcode = GET_CODE (op0);
11031 }
11032 else
11033 {
11034 inner = op0;
11035 comparator = const0_rtx;
11036 cmpcode = NE;
11037 }
11038
11039 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11040 {
11041 /* Conditional branch. */
11042 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11043 return true;
11044 else
11045 {
11046 if (cmpcode == NE || cmpcode == EQ)
11047 {
11048 if (comparator == const0_rtx)
11049 {
11050 /* TBZ/TBNZ/CBZ/CBNZ. */
11051 if (GET_CODE (inner) == ZERO_EXTRACT)
11052 /* TBZ/TBNZ. */
11053 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11054 ZERO_EXTRACT, 0, speed);
11055 else
11056 /* CBZ/CBNZ. */
11057 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11058
11059 return true;
11060 }
11061 }
11062 else if (cmpcode == LT || cmpcode == GE)
11063 {
11064 /* TBZ/TBNZ. */
11065 if (comparator == const0_rtx)
11066 return true;
11067 }
11068 }
11069 }
11070 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11071 {
11072 /* CCMP. */
11073 if (GET_CODE (op1) == COMPARE)
11074 {
11075 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11076 if (XEXP (op1, 1) == const0_rtx)
11077 *cost += 1;
11078 if (speed)
11079 {
11080 machine_mode mode = GET_MODE (XEXP (op1, 0));
11081 const struct cpu_cost_table *extra_cost
11082 = aarch64_tune_params.insn_extra_cost;
11083
11084 if (GET_MODE_CLASS (mode) == MODE_INT)
11085 *cost += extra_cost->alu.arith;
11086 else
11087 *cost += extra_cost->fp[mode == DFmode].compare;
11088 }
11089 return true;
11090 }
11091
11092 /* It's a conditional operation based on the status flags,
11093 so it must be some flavor of CSEL. */
11094
11095 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11096 if (GET_CODE (op1) == NEG
11097 || GET_CODE (op1) == NOT
11098 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11099 op1 = XEXP (op1, 0);
11100 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11101 {
11102 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11103 op1 = XEXP (op1, 0);
11104 op2 = XEXP (op2, 0);
11105 }
11106
11107 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11108 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11109 return true;
11110 }
11111
11112   /* We don't know what this is; cost all operands.  */
11113 return false;
11114 }
11115
11116 /* Check whether X is a bitfield operation of the form shift + extend that
11117 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11118 operand to which the bitfield operation is applied. Otherwise return
11119 NULL_RTX. */
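/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   can be done with a single UBFX, so the inner (reg:HI x) is returned;
   likewise a sign_extend of an ashiftrt maps to SBFX, and an extend of
   an ashift to UBFIZ/SBFIZ.  */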
11120
11121 static rtx
11122 aarch64_extend_bitfield_pattern_p (rtx x)
11123 {
11124 rtx_code outer_code = GET_CODE (x);
11125 machine_mode outer_mode = GET_MODE (x);
11126
11127 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11128 && outer_mode != SImode && outer_mode != DImode)
11129 return NULL_RTX;
11130
11131 rtx inner = XEXP (x, 0);
11132 rtx_code inner_code = GET_CODE (inner);
11133 machine_mode inner_mode = GET_MODE (inner);
11134 rtx op = NULL_RTX;
11135
11136 switch (inner_code)
11137 {
11138 case ASHIFT:
11139 if (CONST_INT_P (XEXP (inner, 1))
11140 && (inner_mode == QImode || inner_mode == HImode))
11141 op = XEXP (inner, 0);
11142 break;
11143 case LSHIFTRT:
11144 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11145 && (inner_mode == QImode || inner_mode == HImode))
11146 op = XEXP (inner, 0);
11147 break;
11148 case ASHIFTRT:
11149 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11150 && (inner_mode == QImode || inner_mode == HImode))
11151 op = XEXP (inner, 0);
11152 break;
11153 default:
11154 break;
11155 }
11156
11157 return op;
11158 }
11159
11160 /* Return true if the mask and a shift amount from an RTX of the form
11161 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11162 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
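/* For example, with MODE == SImode, MASK == 0xf0 and SHFT_AMNT == 4 the
   check succeeds: (x << 4) & 0xf0 inserts the low four bits of x at bit
   position 4, which UBFIZ can do directly.  */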
11163
11164 bool
11165 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11166 rtx shft_amnt)
11167 {
11168 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11169 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11170 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11171 && (INTVAL (mask)
11172 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
11173 }
11174
11175 /* Return true if the masks and a shift amount from an RTX of the form
11176 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11177    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
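/* For example, with MODE == DImode, SHFT_AMNT == 4 and MASK2 == 0xff0
   (so MASK1 == ~0xff0), the combination is accepted; it corresponds to
   inserting an 8-bit field of Y at bit position 4 of X with BFI.  */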
11178
11179 bool
11180 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11181 unsigned HOST_WIDE_INT mask1,
11182 unsigned HOST_WIDE_INT shft_amnt,
11183 unsigned HOST_WIDE_INT mask2)
11184 {
11185 unsigned HOST_WIDE_INT t;
11186
11187   /* Verify that the two masks are exact complements of each other.  */
11188 if (mask1 != ~mask2)
11189 return false;
11190
11191 /* Verify that mask2 is not all zeros or ones. */
11192 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11193 return false;
11194
11195 /* The shift amount should always be less than the mode size. */
11196 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11197
11198 /* Verify that the mask being shifted is contiguous and would be in the
11199 least significant bits after shifting by shft_amnt. */
11200 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11201 return (t == (t & -t));
11202 }
11203
11204 /* Calculate the cost of calculating X, storing it in *COST. Result
11205 is true if the total cost of the operation has now been calculated. */
11206 static bool
11207 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11208 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11209 {
11210 rtx op0, op1, op2;
11211 const struct cpu_cost_table *extra_cost
11212 = aarch64_tune_params.insn_extra_cost;
11213 int code = GET_CODE (x);
11214 scalar_int_mode int_mode;
11215
11216 /* By default, assume that everything has equivalent cost to the
11217 cheapest instruction. Any additional costs are applied as a delta
11218 above this default. */
11219 *cost = COSTS_N_INSNS (1);
11220
11221 switch (code)
11222 {
11223 case SET:
11224 /* The cost depends entirely on the operands to SET. */
11225 *cost = 0;
11226 op0 = SET_DEST (x);
11227 op1 = SET_SRC (x);
11228
11229 switch (GET_CODE (op0))
11230 {
11231 case MEM:
11232 if (speed)
11233 {
11234 rtx address = XEXP (op0, 0);
11235 if (VECTOR_MODE_P (mode))
11236 *cost += extra_cost->ldst.storev;
11237 else if (GET_MODE_CLASS (mode) == MODE_INT)
11238 *cost += extra_cost->ldst.store;
11239 else if (mode == SFmode)
11240 *cost += extra_cost->ldst.storef;
11241 else if (mode == DFmode)
11242 *cost += extra_cost->ldst.stored;
11243
11244 *cost +=
11245 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11246 0, speed));
11247 }
11248
11249 *cost += rtx_cost (op1, mode, SET, 1, speed);
11250 return true;
11251
11252 case SUBREG:
11253 if (! REG_P (SUBREG_REG (op0)))
11254 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11255
11256 /* Fall through. */
11257 case REG:
11258 /* The cost is one per vector-register copied. */
11259 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11260 {
11261 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11262 *cost = COSTS_N_INSNS (nregs);
11263 }
11264 /* const0_rtx is in general free, but we will use an
11265 instruction to set a register to 0. */
11266 else if (REG_P (op1) || op1 == const0_rtx)
11267 {
11268 /* The cost is 1 per register copied. */
11269 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11270 *cost = COSTS_N_INSNS (nregs);
11271 }
11272 else
11273 /* Cost is just the cost of the RHS of the set. */
11274 *cost += rtx_cost (op1, mode, SET, 1, speed);
11275 return true;
11276
11277 case ZERO_EXTRACT:
11278 case SIGN_EXTRACT:
11279 /* Bit-field insertion. Strip any redundant widening of
11280 the RHS to meet the width of the target. */
11281 if (GET_CODE (op1) == SUBREG)
11282 op1 = SUBREG_REG (op1);
11283 if ((GET_CODE (op1) == ZERO_EXTEND
11284 || GET_CODE (op1) == SIGN_EXTEND)
11285 && CONST_INT_P (XEXP (op0, 1))
11286 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11287 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11288 op1 = XEXP (op1, 0);
11289
11290 if (CONST_INT_P (op1))
11291 {
11292 /* MOV immediate is assumed to always be cheap. */
11293 *cost = COSTS_N_INSNS (1);
11294 }
11295 else
11296 {
11297 /* BFM. */
11298 if (speed)
11299 *cost += extra_cost->alu.bfi;
11300 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11301 }
11302
11303 return true;
11304
11305 default:
11306 /* We can't make sense of this, assume default cost. */
11307 *cost = COSTS_N_INSNS (1);
11308 return false;
11309 }
11310 return false;
11311
11312 case CONST_INT:
11313 /* If an instruction can incorporate a constant within the
11314 instruction, the instruction's expression avoids calling
11315 rtx_cost() on the constant. If rtx_cost() is called on a
11316 constant, then it is usually because the constant must be
11317 moved into a register by one or more instructions.
11318
11319 The exception is constant 0, which can be expressed
11320 as XZR/WZR and is therefore free. The exception to this is
11321 if we have (set (reg) (const0_rtx)) in which case we must cost
11322 the move. However, we can catch that when we cost the SET, so
11323 we don't need to consider that here. */
11324 if (x == const0_rtx)
11325 *cost = 0;
11326 else
11327 {
11328             /* To an approximation, the cost of building any other constant
11329                is proportional to the number of instructions required to
11330                build that constant.  This is true whether we are compiling
11331                for SPEED or otherwise.  */
11332 if (!is_a <scalar_int_mode> (mode, &int_mode))
11333 int_mode = word_mode;
11334 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
11335 (NULL_RTX, x, false, int_mode));
11336 }
11337 return true;
11338
11339 case CONST_DOUBLE:
11340
11341       /* First determine the number of instructions needed to do the move
11342          as an integer constant.  */
11343 if (!aarch64_float_const_representable_p (x)
11344 && !aarch64_can_const_movi_rtx_p (x, mode)
11345 && aarch64_float_const_rtx_p (x))
11346 {
11347 unsigned HOST_WIDE_INT ival;
11348 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
11349 gcc_assert (succeed);
11350
11351 scalar_int_mode imode = (mode == HFmode
11352 ? SImode
11353 : int_mode_for_mode (mode).require ());
11354 int ncost = aarch64_internal_mov_immediate
11355 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11356 *cost += COSTS_N_INSNS (ncost);
11357 return true;
11358 }
11359
11360 if (speed)
11361 {
11362 /* mov[df,sf]_aarch64. */
11363 if (aarch64_float_const_representable_p (x))
11364 /* FMOV (scalar immediate). */
11365 *cost += extra_cost->fp[mode == DFmode].fpconst;
11366 else if (!aarch64_float_const_zero_rtx_p (x))
11367 {
11368 /* This will be a load from memory. */
11369 if (mode == DFmode)
11370 *cost += extra_cost->ldst.loadd;
11371 else
11372 *cost += extra_cost->ldst.loadf;
11373 }
11374 else
11375             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
11376                or MOV v0.s[0], wzr - neither of which is modeled by the
11377                cost tables.  Just use the default cost.  */
11378 {
11379 }
11380 }
11381
11382 return true;
11383
11384 case MEM:
11385 if (speed)
11386 {
11387 /* For loads we want the base cost of a load, plus an
11388 approximation for the additional cost of the addressing
11389 mode. */
11390 rtx address = XEXP (x, 0);
11391 if (VECTOR_MODE_P (mode))
11392 *cost += extra_cost->ldst.loadv;
11393 else if (GET_MODE_CLASS (mode) == MODE_INT)
11394 *cost += extra_cost->ldst.load;
11395 else if (mode == SFmode)
11396 *cost += extra_cost->ldst.loadf;
11397 else if (mode == DFmode)
11398 *cost += extra_cost->ldst.loadd;
11399
11400 *cost +=
11401 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11402 0, speed));
11403 }
11404
11405 return true;
11406
11407 case NEG:
11408 op0 = XEXP (x, 0);
11409
11410 if (VECTOR_MODE_P (mode))
11411 {
11412 if (speed)
11413 {
11414 /* FNEG. */
11415 *cost += extra_cost->vect.alu;
11416 }
11417 return false;
11418 }
11419
11420 if (GET_MODE_CLASS (mode) == MODE_INT)
11421 {
11422 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11423 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11424 {
11425 /* CSETM. */
11426 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
11427 return true;
11428 }
11429
11430 /* Cost this as SUB wzr, X. */
11431 op0 = CONST0_RTX (mode);
11432 op1 = XEXP (x, 0);
11433 goto cost_minus;
11434 }
11435
11436 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11437 {
11438 /* Support (neg(fma...)) as a single instruction only if
11439 sign of zeros is unimportant. This matches the decision
11440 making in aarch64.md. */
11441 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
11442 {
11443 /* FNMADD. */
11444 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11445 return true;
11446 }
11447 if (GET_CODE (op0) == MULT)
11448 {
11449 /* FNMUL. */
11450 *cost = rtx_cost (op0, mode, NEG, 0, speed);
11451 return true;
11452 }
11453 if (speed)
11454 /* FNEG. */
11455 *cost += extra_cost->fp[mode == DFmode].neg;
11456 return false;
11457 }
11458
11459 return false;
11460
11461 case CLRSB:
11462 case CLZ:
11463 if (speed)
11464 {
11465 if (VECTOR_MODE_P (mode))
11466 *cost += extra_cost->vect.alu;
11467 else
11468 *cost += extra_cost->alu.clz;
11469 }
11470
11471 return false;
11472
11473 case COMPARE:
11474 op0 = XEXP (x, 0);
11475 op1 = XEXP (x, 1);
11476
11477 if (op1 == const0_rtx
11478 && GET_CODE (op0) == AND)
11479 {
11480 x = op0;
11481 mode = GET_MODE (op0);
11482 goto cost_logic;
11483 }
11484
11485 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
11486 {
11487 /* TODO: A write to the CC flags possibly costs extra, this
11488 needs encoding in the cost tables. */
11489
11490 mode = GET_MODE (op0);
11491 /* ANDS. */
11492 if (GET_CODE (op0) == AND)
11493 {
11494 x = op0;
11495 goto cost_logic;
11496 }
11497
11498 if (GET_CODE (op0) == PLUS)
11499 {
11500 /* ADDS (and CMN alias). */
11501 x = op0;
11502 goto cost_plus;
11503 }
11504
11505 if (GET_CODE (op0) == MINUS)
11506 {
11507 /* SUBS. */
11508 x = op0;
11509 goto cost_minus;
11510 }
11511
11512 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
11513 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
11514 && CONST_INT_P (XEXP (op0, 2)))
11515 {
11516 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
11517 Handle it here directly rather than going to cost_logic
11518 since we know the immediate generated for the TST is valid
11519 so we can avoid creating an intermediate rtx for it only
11520 for costing purposes. */
11521 if (speed)
11522 *cost += extra_cost->alu.logical;
11523
11524 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
11525 ZERO_EXTRACT, 0, speed);
11526 return true;
11527 }
11528
11529 if (GET_CODE (op1) == NEG)
11530 {
11531 /* CMN. */
11532 if (speed)
11533 *cost += extra_cost->alu.arith;
11534
11535 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
11536 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
11537 return true;
11538 }
11539
11540 /* CMP.
11541
11542 Compare can freely swap the order of operands, and
11543 canonicalization puts the more complex operation first.
11544 But the integer MINUS logic expects the shift/extend
11545 operation in op1. */
11546 if (! (REG_P (op0)
11547 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
11548 {
11549 op0 = XEXP (x, 1);
11550 op1 = XEXP (x, 0);
11551 }
11552 goto cost_minus;
11553 }
11554
11555 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
11556 {
11557 /* FCMP. */
11558 if (speed)
11559 *cost += extra_cost->fp[mode == DFmode].compare;
11560
11561 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
11562 {
11563 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
11564 /* FCMP supports constant 0.0 for no extra cost. */
11565 return true;
11566 }
11567 return false;
11568 }
11569
11570 if (VECTOR_MODE_P (mode))
11571 {
11572 /* Vector compare. */
11573 if (speed)
11574 *cost += extra_cost->vect.alu;
11575
11576 if (aarch64_float_const_zero_rtx_p (op1))
11577 {
11578 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
11579 cost. */
11580 return true;
11581 }
11582 return false;
11583 }
11584 return false;
11585
11586 case MINUS:
11587 {
11588 op0 = XEXP (x, 0);
11589 op1 = XEXP (x, 1);
11590
11591 cost_minus:
11592 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
11593
11594 /* Detect valid immediates. */
11595 if ((GET_MODE_CLASS (mode) == MODE_INT
11596 || (GET_MODE_CLASS (mode) == MODE_CC
11597 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
11598 && CONST_INT_P (op1)
11599 && aarch64_uimm12_shift (INTVAL (op1)))
11600 {
11601 if (speed)
11602 /* SUB(S) (immediate). */
11603 *cost += extra_cost->alu.arith;
11604 return true;
11605 }
11606
11607 /* Look for SUB (extended register). */
11608 if (is_a <scalar_int_mode> (mode, &int_mode)
11609 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
11610 {
11611 if (speed)
11612 *cost += extra_cost->alu.extend_arith;
11613
11614 op1 = aarch64_strip_extend (op1, true);
11615 *cost += rtx_cost (op1, VOIDmode,
11616 (enum rtx_code) GET_CODE (op1), 0, speed);
11617 return true;
11618 }
11619
11620 rtx new_op1 = aarch64_strip_extend (op1, false);
11621
11622 /* Cost this as an FMA-alike operation. */
11623 if ((GET_CODE (new_op1) == MULT
11624 || aarch64_shift_p (GET_CODE (new_op1)))
11625 && code != COMPARE)
11626 {
11627 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
11628 (enum rtx_code) code,
11629 speed);
11630 return true;
11631 }
11632
11633 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
11634
11635 if (speed)
11636 {
11637 if (VECTOR_MODE_P (mode))
11638 {
11639 /* Vector SUB. */
11640 *cost += extra_cost->vect.alu;
11641 }
11642 else if (GET_MODE_CLASS (mode) == MODE_INT)
11643 {
11644 /* SUB(S). */
11645 *cost += extra_cost->alu.arith;
11646 }
11647 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11648 {
11649 /* FSUB. */
11650 *cost += extra_cost->fp[mode == DFmode].addsub;
11651 }
11652 }
11653 return true;
11654 }
11655
11656 case PLUS:
11657 {
11658 rtx new_op0;
11659
11660 op0 = XEXP (x, 0);
11661 op1 = XEXP (x, 1);
11662
11663 cost_plus:
11664 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
11665 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
11666 {
11667 /* CSINC. */
11668 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
11669 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11670 return true;
11671 }
11672
11673 if (GET_MODE_CLASS (mode) == MODE_INT
11674 && (aarch64_plus_immediate (op1, mode)
11675 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
11676 {
11677 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
11678
11679 if (speed)
11680 /* ADD (immediate). */
11681 *cost += extra_cost->alu.arith;
11682 return true;
11683 }
11684
11685 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
11686
11687 /* Look for ADD (extended register). */
11688 if (is_a <scalar_int_mode> (mode, &int_mode)
11689 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
11690 {
11691 if (speed)
11692 *cost += extra_cost->alu.extend_arith;
11693
11694 op0 = aarch64_strip_extend (op0, true);
11695 *cost += rtx_cost (op0, VOIDmode,
11696 (enum rtx_code) GET_CODE (op0), 0, speed);
11697 return true;
11698 }
11699
11700 /* Strip any extend, leave shifts behind as we will
11701 cost them through mult_cost. */
11702 new_op0 = aarch64_strip_extend (op0, false);
11703
11704 if (GET_CODE (new_op0) == MULT
11705 || aarch64_shift_p (GET_CODE (new_op0)))
11706 {
11707 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
11708 speed);
11709 return true;
11710 }
11711
11712 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
11713
11714 if (speed)
11715 {
11716 if (VECTOR_MODE_P (mode))
11717 {
11718 /* Vector ADD. */
11719 *cost += extra_cost->vect.alu;
11720 }
11721 else if (GET_MODE_CLASS (mode) == MODE_INT)
11722 {
11723 /* ADD. */
11724 *cost += extra_cost->alu.arith;
11725 }
11726 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11727 {
11728 /* FADD. */
11729 *cost += extra_cost->fp[mode == DFmode].addsub;
11730 }
11731 }
11732 return true;
11733 }
11734
11735 case BSWAP:
11736 *cost = COSTS_N_INSNS (1);
11737
11738 if (speed)
11739 {
11740 if (VECTOR_MODE_P (mode))
11741 *cost += extra_cost->vect.alu;
11742 else
11743 *cost += extra_cost->alu.rev;
11744 }
11745 return false;
11746
11747 case IOR:
11748 if (aarch_rev16_p (x))
11749 {
11750 *cost = COSTS_N_INSNS (1);
11751
11752 if (speed)
11753 {
11754 if (VECTOR_MODE_P (mode))
11755 *cost += extra_cost->vect.alu;
11756 else
11757 *cost += extra_cost->alu.rev;
11758 }
11759 return true;
11760 }
11761
11762 if (aarch64_extr_rtx_p (x, &op0, &op1))
11763 {
11764 *cost += rtx_cost (op0, mode, IOR, 0, speed);
11765 *cost += rtx_cost (op1, mode, IOR, 1, speed);
11766 if (speed)
11767 *cost += extra_cost->alu.shift;
11768
11769 return true;
11770 }
11771 /* Fall through. */
11772 case XOR:
11773 case AND:
11774 cost_logic:
11775 op0 = XEXP (x, 0);
11776 op1 = XEXP (x, 1);
11777
11778 if (VECTOR_MODE_P (mode))
11779 {
11780 if (speed)
11781 *cost += extra_cost->vect.alu;
11782 return true;
11783 }
11784
11785 if (code == AND
11786 && GET_CODE (op0) == MULT
11787 && CONST_INT_P (XEXP (op0, 1))
11788 && CONST_INT_P (op1)
11789 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
11790 INTVAL (op1)) != 0)
11791 {
11792 /* This is a UBFM/SBFM. */
11793 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
11794 if (speed)
11795 *cost += extra_cost->alu.bfx;
11796 return true;
11797 }
11798
11799 if (is_int_mode (mode, &int_mode))
11800 {
11801 if (CONST_INT_P (op1))
11802 {
11803 /* We have a mask + shift version of a UBFIZ
11804 i.e. the *andim_ashift<mode>_bfiz pattern. */
11805 if (GET_CODE (op0) == ASHIFT
11806 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
11807 XEXP (op0, 1)))
11808 {
11809 *cost += rtx_cost (XEXP (op0, 0), int_mode,
11810 (enum rtx_code) code, 0, speed);
11811 if (speed)
11812 *cost += extra_cost->alu.bfx;
11813
11814 return true;
11815 }
11816 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
11817 {
11818               /* We may get the immediate for free; this is not
11819                  modelled.  */
11820 *cost += rtx_cost (op0, int_mode,
11821 (enum rtx_code) code, 0, speed);
11822 if (speed)
11823 *cost += extra_cost->alu.logical;
11824
11825 return true;
11826 }
11827 }
11828 else
11829 {
11830 rtx new_op0 = op0;
11831
11832 /* Handle ORN, EON, or BIC. */
11833 if (GET_CODE (op0) == NOT)
11834 op0 = XEXP (op0, 0);
11835
11836 new_op0 = aarch64_strip_shift (op0);
11837
11838 /* If we had a shift on op0 then this is a logical-shift-
11839 by-register/immediate operation. Otherwise, this is just
11840 a logical operation. */
11841 if (speed)
11842 {
11843 if (new_op0 != op0)
11844 {
11845 /* Shift by immediate. */
11846 if (CONST_INT_P (XEXP (op0, 1)))
11847 *cost += extra_cost->alu.log_shift;
11848 else
11849 *cost += extra_cost->alu.log_shift_reg;
11850 }
11851 else
11852 *cost += extra_cost->alu.logical;
11853 }
11854
11855 /* In both cases we want to cost both operands. */
11856 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
11857 0, speed);
11858 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
11859 1, speed);
11860
11861 return true;
11862 }
11863 }
11864 return false;
11865
11866 case NOT:
11867 x = XEXP (x, 0);
11868 op0 = aarch64_strip_shift (x);
11869
11870 if (VECTOR_MODE_P (mode))
11871 {
11872 /* Vector NOT. */
11873 *cost += extra_cost->vect.alu;
11874 return false;
11875 }
11876
11877 /* MVN-shifted-reg. */
11878 if (op0 != x)
11879 {
11880 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
11881
11882 if (speed)
11883 *cost += extra_cost->alu.log_shift;
11884
11885 return true;
11886 }
11887 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
11888 Handle the second form here taking care that 'a' in the above can
11889 be a shift. */
11890 else if (GET_CODE (op0) == XOR)
11891 {
11892 rtx newop0 = XEXP (op0, 0);
11893 rtx newop1 = XEXP (op0, 1);
11894 rtx op0_stripped = aarch64_strip_shift (newop0);
11895
11896 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
11897 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
11898
11899 if (speed)
11900 {
11901 if (op0_stripped != newop0)
11902 *cost += extra_cost->alu.log_shift;
11903 else
11904 *cost += extra_cost->alu.logical;
11905 }
11906
11907 return true;
11908 }
11909 /* MVN. */
11910 if (speed)
11911 *cost += extra_cost->alu.logical;
11912
11913 return false;
11914
11915 case ZERO_EXTEND:
11916
11917 op0 = XEXP (x, 0);
11918 /* If a value is written in SI mode, then zero extended to DI
11919 mode, the operation will in general be free as a write to
11920 a 'w' register implicitly zeroes the upper bits of an 'x'
11921 register. However, if this is
11922
11923 (set (reg) (zero_extend (reg)))
11924
11925 we must cost the explicit register move. */
11926 if (mode == DImode
11927 && GET_MODE (op0) == SImode
11928 && outer == SET)
11929 {
11930 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
11931
11932 /* If OP_COST is non-zero, then the cost of the zero extend
11933 is effectively the cost of the inner operation. Otherwise
11934 we have a MOV instruction and we take the cost from the MOV
11935 itself. This is true independently of whether we are
11936 optimizing for space or time. */
11937 if (op_cost)
11938 *cost = op_cost;
11939
11940 return true;
11941 }
11942 else if (MEM_P (op0))
11943 {
11944 /* All loads can zero extend to any size for free. */
11945 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
11946 return true;
11947 }
11948
11949 op0 = aarch64_extend_bitfield_pattern_p (x);
11950 if (op0)
11951 {
11952 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
11953 if (speed)
11954 *cost += extra_cost->alu.bfx;
11955 return true;
11956 }
11957
11958 if (speed)
11959 {
11960 if (VECTOR_MODE_P (mode))
11961 {
11962 /* UMOV. */
11963 *cost += extra_cost->vect.alu;
11964 }
11965 else
11966 {
11967 /* We generate an AND instead of UXTB/UXTH. */
11968 *cost += extra_cost->alu.logical;
11969 }
11970 }
11971 return false;
11972
11973 case SIGN_EXTEND:
11974 if (MEM_P (XEXP (x, 0)))
11975 {
11976 /* LDRSH. */
11977 if (speed)
11978 {
11979 rtx address = XEXP (XEXP (x, 0), 0);
11980 *cost += extra_cost->ldst.load_sign_extend;
11981
11982 *cost +=
11983 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11984 0, speed));
11985 }
11986 return true;
11987 }
11988
11989 op0 = aarch64_extend_bitfield_pattern_p (x);
11990 if (op0)
11991 {
11992 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
11993 if (speed)
11994 *cost += extra_cost->alu.bfx;
11995 return true;
11996 }
11997
11998 if (speed)
11999 {
12000 if (VECTOR_MODE_P (mode))
12001 *cost += extra_cost->vect.alu;
12002 else
12003 *cost += extra_cost->alu.extend;
12004 }
12005 return false;
12006
12007 case ASHIFT:
12008 op0 = XEXP (x, 0);
12009 op1 = XEXP (x, 1);
12010
12011 if (CONST_INT_P (op1))
12012 {
12013 if (speed)
12014 {
12015 if (VECTOR_MODE_P (mode))
12016 {
12017 /* Vector shift (immediate). */
12018 *cost += extra_cost->vect.alu;
12019 }
12020 else
12021 {
12022                 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
12023                    aliases.  */
12024 *cost += extra_cost->alu.shift;
12025 }
12026 }
12027
12028 /* We can incorporate zero/sign extend for free. */
12029 if (GET_CODE (op0) == ZERO_EXTEND
12030 || GET_CODE (op0) == SIGN_EXTEND)
12031 op0 = XEXP (op0, 0);
12032
12033 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12034 return true;
12035 }
12036 else
12037 {
12038 if (VECTOR_MODE_P (mode))
12039 {
12040 if (speed)
12041 /* Vector shift (register). */
12042 *cost += extra_cost->vect.alu;
12043 }
12044 else
12045 {
12046 if (speed)
12047 /* LSLV. */
12048 *cost += extra_cost->alu.shift_reg;
12049
12050 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12051 && CONST_INT_P (XEXP (op1, 1))
12052 && known_eq (INTVAL (XEXP (op1, 1)),
12053 GET_MODE_BITSIZE (mode) - 1))
12054 {
12055 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12056 /* We already demanded XEXP (op1, 0) to be REG_P, so
12057 don't recurse into it. */
12058 return true;
12059 }
12060 }
12061 return false; /* All arguments need to be in registers. */
12062 }
12063
12064 case ROTATE:
12065 case ROTATERT:
12066 case LSHIFTRT:
12067 case ASHIFTRT:
12068 op0 = XEXP (x, 0);
12069 op1 = XEXP (x, 1);
12070
12071 if (CONST_INT_P (op1))
12072 {
12073 /* ASR (immediate) and friends. */
12074 if (speed)
12075 {
12076 if (VECTOR_MODE_P (mode))
12077 *cost += extra_cost->vect.alu;
12078 else
12079 *cost += extra_cost->alu.shift;
12080 }
12081
12082 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12083 return true;
12084 }
12085 else
12086 {
12087 if (VECTOR_MODE_P (mode))
12088 {
12089 if (speed)
12090 /* Vector shift (register). */
12091 *cost += extra_cost->vect.alu;
12092 }
12093 else
12094 {
12095 if (speed)
12096 /* ASR (register) and friends. */
12097 *cost += extra_cost->alu.shift_reg;
12098
12099 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12100 && CONST_INT_P (XEXP (op1, 1))
12101 && known_eq (INTVAL (XEXP (op1, 1)),
12102 GET_MODE_BITSIZE (mode) - 1))
12103 {
12104 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12105 /* We already demanded XEXP (op1, 0) to be REG_P, so
12106 don't recurse into it. */
12107 return true;
12108 }
12109 }
12110 return false; /* All arguments need to be in registers. */
12111 }
12112
12113 case SYMBOL_REF:
12114
12115 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12116 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12117 {
12118 /* LDR. */
12119 if (speed)
12120 *cost += extra_cost->ldst.load;
12121 }
12122 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12123 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12124 {
12125 /* ADRP, followed by ADD. */
12126 *cost += COSTS_N_INSNS (1);
12127 if (speed)
12128 *cost += 2 * extra_cost->alu.arith;
12129 }
12130 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12131 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12132 {
12133 /* ADR. */
12134 if (speed)
12135 *cost += extra_cost->alu.arith;
12136 }
12137
12138 if (flag_pic)
12139 {
12140 /* One extra load instruction, after accessing the GOT. */
12141 *cost += COSTS_N_INSNS (1);
12142 if (speed)
12143 *cost += extra_cost->ldst.load;
12144 }
12145 return true;
12146
12147 case HIGH:
12148 case LO_SUM:
12149 /* ADRP/ADD (immediate). */
12150 if (speed)
12151 *cost += extra_cost->alu.arith;
12152 return true;
12153
12154 case ZERO_EXTRACT:
12155 case SIGN_EXTRACT:
12156 /* UBFX/SBFX. */
12157 if (speed)
12158 {
12159 if (VECTOR_MODE_P (mode))
12160 *cost += extra_cost->vect.alu;
12161 else
12162 *cost += extra_cost->alu.bfx;
12163 }
12164
12165 /* We can trust that the immediates used will be correct (there
12166 are no by-register forms), so we need only cost op0. */
12167 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12168 return true;
12169
12170 case MULT:
12171 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12172 /* aarch64_rtx_mult_cost always handles recursion to its
12173 operands. */
12174 return true;
12175
12176 case MOD:
12177       /* We can expand signed mod by a power of 2 using a NEGS, two parallel
12178          ANDs and a CSNEG.  Assume here that a CSNEG costs the same as an
12179          unconditional negate.  This case should only ever be reached through
12180          the set_smod_pow2_cheap check in expmed.c.  */
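      /* A sketch of the expansion for x % 4 in SImode (register
	 allocation will of course vary):
	     negs  w1, w0
	     and   w0, w0, 3
	     and   w1, w1, 3
	     csneg w0, w0, w1, mi  */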
12181 if (CONST_INT_P (XEXP (x, 1))
12182 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12183 && (mode == SImode || mode == DImode))
12184 {
12185 /* We expand to 4 instructions. Reset the baseline. */
12186 *cost = COSTS_N_INSNS (4);
12187
12188 if (speed)
12189 *cost += 2 * extra_cost->alu.logical
12190 + 2 * extra_cost->alu.arith;
12191
12192 return true;
12193 }
12194
12195 /* Fall-through. */
12196 case UMOD:
12197 if (speed)
12198 {
12199           /* Slightly prefer UMOD over SMOD.  */
12200 if (VECTOR_MODE_P (mode))
12201 *cost += extra_cost->vect.alu;
12202 else if (GET_MODE_CLASS (mode) == MODE_INT)
12203 *cost += (extra_cost->mult[mode == DImode].add
12204 + extra_cost->mult[mode == DImode].idiv
12205 + (code == MOD ? 1 : 0));
12206 }
12207 return false; /* All arguments need to be in registers. */
12208
12209 case DIV:
12210 case UDIV:
12211 case SQRT:
12212 if (speed)
12213 {
12214 if (VECTOR_MODE_P (mode))
12215 *cost += extra_cost->vect.alu;
12216 else if (GET_MODE_CLASS (mode) == MODE_INT)
12217 /* There is no integer SQRT, so only DIV and UDIV can get
12218 here. */
12219 *cost += (extra_cost->mult[mode == DImode].idiv
12220                     /* Slightly prefer UDIV over SDIV.  */
12221 + (code == DIV ? 1 : 0));
12222 else
12223 *cost += extra_cost->fp[mode == DFmode].div;
12224 }
12225 return false; /* All arguments need to be in registers. */
12226
12227 case IF_THEN_ELSE:
12228 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12229 XEXP (x, 2), cost, speed);
12230
12231 case EQ:
12232 case NE:
12233 case GT:
12234 case GTU:
12235 case LT:
12236 case LTU:
12237 case GE:
12238 case GEU:
12239 case LE:
12240 case LEU:
12241
12242 return false; /* All arguments must be in registers. */
12243
12244 case FMA:
12245 op0 = XEXP (x, 0);
12246 op1 = XEXP (x, 1);
12247 op2 = XEXP (x, 2);
12248
12249 if (speed)
12250 {
12251 if (VECTOR_MODE_P (mode))
12252 *cost += extra_cost->vect.alu;
12253 else
12254 *cost += extra_cost->fp[mode == DFmode].fma;
12255 }
12256
12257 /* FMSUB, FNMADD, and FNMSUB are free. */
12258 if (GET_CODE (op0) == NEG)
12259 op0 = XEXP (op0, 0);
12260
12261 if (GET_CODE (op2) == NEG)
12262 op2 = XEXP (op2, 0);
12263
12264 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12265 and the by-element operand as operand 0. */
12266 if (GET_CODE (op1) == NEG)
12267 op1 = XEXP (op1, 0);
12268
12269 /* Catch vector-by-element operations. The by-element operand can
12270 either be (vec_duplicate (vec_select (x))) or just
12271 (vec_select (x)), depending on whether we are multiplying by
12272 a vector or a scalar.
12273
12274          Canonicalization is not very good in these cases: FMA4 will put the
12275          by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
12276 if (GET_CODE (op0) == VEC_DUPLICATE)
12277 op0 = XEXP (op0, 0);
12278 else if (GET_CODE (op1) == VEC_DUPLICATE)
12279 op1 = XEXP (op1, 0);
12280
12281 if (GET_CODE (op0) == VEC_SELECT)
12282 op0 = XEXP (op0, 0);
12283 else if (GET_CODE (op1) == VEC_SELECT)
12284 op1 = XEXP (op1, 0);
12285
12286 /* If the remaining parameters are not registers,
12287 get the cost to put them into registers. */
12288 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12289 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12290 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12291 return true;
12292
12293 case FLOAT:
12294 case UNSIGNED_FLOAT:
12295 if (speed)
12296 *cost += extra_cost->fp[mode == DFmode].fromint;
12297 return false;
12298
12299 case FLOAT_EXTEND:
12300 if (speed)
12301 {
12302 if (VECTOR_MODE_P (mode))
12303 {
12304 /* Vector widening conversion. */
12305 *cost += extra_cost->vect.alu;
12306 }
12307 else
12308 *cost += extra_cost->fp[mode == DFmode].widen;
12309 }
12310 return false;
12311
12312 case FLOAT_TRUNCATE:
12313 if (speed)
12314 {
12315 if (VECTOR_MODE_P (mode))
12316 {
12317 /* Vector narrowing conversion. */
12318 *cost += extra_cost->vect.alu;
12319 }
12320 else
12321 *cost += extra_cost->fp[mode == DFmode].narrow;
12322 }
12323 return false;
12324
12325 case FIX:
12326 case UNSIGNED_FIX:
12327 x = XEXP (x, 0);
12328 /* Strip the rounding part. All of these rounding variants will be
12329 implemented by the fcvt* family of instructions anyway. */
12330 if (GET_CODE (x) == UNSPEC)
12331 {
12332 unsigned int uns_code = XINT (x, 1);
12333
12334 if (uns_code == UNSPEC_FRINTA
12335 || uns_code == UNSPEC_FRINTM
12336 || uns_code == UNSPEC_FRINTN
12337 || uns_code == UNSPEC_FRINTP
12338 || uns_code == UNSPEC_FRINTZ)
12339 x = XVECEXP (x, 0, 0);
12340 }
12341
12342 if (speed)
12343 {
12344 if (VECTOR_MODE_P (mode))
12345 *cost += extra_cost->vect.alu;
12346 else
12347 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
12348 }
12349
12350 /* We can combine fmul by a power of 2 followed by a fcvt into a single
12351 fixed-point fcvt. */
12352 if (GET_CODE (x) == MULT
12353 && ((VECTOR_MODE_P (mode)
12354 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
12355 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
12356 {
12357 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
12358 0, speed);
12359 return true;
12360 }
12361
12362 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
12363 return true;
12364
12365 case ABS:
12366 if (VECTOR_MODE_P (mode))
12367 {
12368 /* ABS (vector). */
12369 if (speed)
12370 *cost += extra_cost->vect.alu;
12371 }
12372 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12373 {
12374 op0 = XEXP (x, 0);
12375
12376 /* FABD, which is analogous to FADD. */
12377 if (GET_CODE (op0) == MINUS)
12378 {
12379 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
12380 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
12381 if (speed)
12382 *cost += extra_cost->fp[mode == DFmode].addsub;
12383
12384 return true;
12385 }
12386 /* Simple FABS is analogous to FNEG. */
12387 if (speed)
12388 *cost += extra_cost->fp[mode == DFmode].neg;
12389 }
12390 else
12391 {
12392 /* Integer ABS will either be split into
12393 two arithmetic instructions, or will be an ABS
12394 (scalar), which we don't model. */
12395 *cost = COSTS_N_INSNS (2);
12396 if (speed)
12397 *cost += 2 * extra_cost->alu.arith;
12398 }
12399 return false;
12400
12401 case SMAX:
12402 case SMIN:
12403 if (speed)
12404 {
12405 if (VECTOR_MODE_P (mode))
12406 *cost += extra_cost->vect.alu;
12407 else
12408 {
12409 /* FMAXNM/FMINNM/FMAX/FMIN.
12410 TODO: This may not be accurate for all implementations, but
12411 we do not model this in the cost tables. */
12412 *cost += extra_cost->fp[mode == DFmode].addsub;
12413 }
12414 }
12415 return false;
12416
12417 case UNSPEC:
12418 /* The floating point round to integer frint* instructions. */
12419 if (aarch64_frint_unspec_p (XINT (x, 1)))
12420 {
12421 if (speed)
12422 *cost += extra_cost->fp[mode == DFmode].roundint;
12423
12424 return false;
12425 }
12426
12427 if (XINT (x, 1) == UNSPEC_RBIT)
12428 {
12429 if (speed)
12430 *cost += extra_cost->alu.rev;
12431
12432 return false;
12433 }
12434 break;
12435
12436 case TRUNCATE:
12437
12438 /* Decompose <su>muldi3_highpart. */
12439 if (/* (truncate:DI */
12440 mode == DImode
12441 /* (lshiftrt:TI */
12442 && GET_MODE (XEXP (x, 0)) == TImode
12443 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
12444 /* (mult:TI */
12445 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12446 /* (ANY_EXTEND:TI (reg:DI))
12447 (ANY_EXTEND:TI (reg:DI))) */
12448 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
12449 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
12450 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
12451 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
12452 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
12453 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
12454 /* (const_int 64) */
12455 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12456 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
12457 {
12458 /* UMULH/SMULH. */
12459 if (speed)
12460 *cost += extra_cost->mult[mode == DImode].extend;
12461 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
12462 mode, MULT, 0, speed);
12463 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
12464 mode, MULT, 1, speed);
12465 return true;
12466 }
12467
12468 /* Fall through. */
12469 default:
12470 break;
12471 }
12472
12473 if (dump_file
12474 && flag_aarch64_verbose_cost)
12475 fprintf (dump_file,
12476 "\nFailed to cost RTX. Assuming default cost.\n");
12477
12478 return true;
12479 }
12480
12481 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
12482 calculated for X. This cost is stored in *COST. Returns true
12483 if the total cost of X was calculated. */
12484 static bool
12485 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
12486 int param, int *cost, bool speed)
12487 {
12488 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
12489
12490 if (dump_file
12491 && flag_aarch64_verbose_cost)
12492 {
12493 print_rtl_single (dump_file, x);
12494 fprintf (dump_file, "\n%s cost: %d (%s)\n",
12495 speed ? "Hot" : "Cold",
12496 *cost, result ? "final" : "partial");
12497 }
12498
12499 return result;
12500 }
12501
12502 static int
12503 aarch64_register_move_cost (machine_mode mode,
12504 reg_class_t from_i, reg_class_t to_i)
12505 {
12506 enum reg_class from = (enum reg_class) from_i;
12507 enum reg_class to = (enum reg_class) to_i;
12508 const struct cpu_regmove_cost *regmove_cost
12509 = aarch64_tune_params.regmove_cost;
12510
12511 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
12512 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
12513 to = GENERAL_REGS;
12514
12515 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
12516 from = GENERAL_REGS;
12517
12518 /* Make RDFFR very expensive. In particular, if we know that the FFR
12519 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
12520 as a way of obtaining a PTRUE. */
12521 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
12522 && hard_reg_set_subset_p (reg_class_contents[from_i],
12523 reg_class_contents[FFR_REGS]))
12524 return 80;
12525
12526 /* The cost of moving between a GPR and the stack is the same as GP2GP. */
12527 if ((from == GENERAL_REGS && to == STACK_REG)
12528 || (to == GENERAL_REGS && from == STACK_REG))
12529 return regmove_cost->GP2GP;
12530
12531 /* To/From the stack register, we move via the gprs. */
12532 if (to == STACK_REG || from == STACK_REG)
12533 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
12534 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
12535
12536 if (known_eq (GET_MODE_SIZE (mode), 16))
12537 {
12538 /* 128-bit operations on general registers require 2 instructions. */
12539 if (from == GENERAL_REGS && to == GENERAL_REGS)
12540 return regmove_cost->GP2GP * 2;
12541 else if (from == GENERAL_REGS)
12542 return regmove_cost->GP2FP * 2;
12543 else if (to == GENERAL_REGS)
12544 return regmove_cost->FP2GP * 2;
12545
12546 /* When AdvSIMD instructions are disabled it is not possible to move
12547 a 128-bit value directly between Q registers. This is handled in
12548 secondary reload. A general register is used as a scratch to move
12549 the upper DI value and the lower DI value is moved directly,
12550 hence the cost is the sum of three moves. */
12551 if (! TARGET_SIMD)
12552 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
12553
12554 return regmove_cost->FP2FP;
12555 }
12556
12557 if (from == GENERAL_REGS && to == GENERAL_REGS)
12558 return regmove_cost->GP2GP;
12559 else if (from == GENERAL_REGS)
12560 return regmove_cost->GP2FP;
12561 else if (to == GENERAL_REGS)
12562 return regmove_cost->FP2GP;
12563
12564 return regmove_cost->FP2FP;
12565 }
12566
12567 static int
12568 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
12569 reg_class_t rclass ATTRIBUTE_UNUSED,
12570 bool in ATTRIBUTE_UNUSED)
12571 {
12572 return aarch64_tune_params.memmov_cost;
12573 }
12574
12575 /* Implement TARGET_INIT_BUILTINS. */
12576 static void
12577 aarch64_init_builtins ()
12578 {
12579 aarch64_general_init_builtins ();
12580 aarch64_sve::init_builtins ();
12581 }
12582
12583 /* Implement TARGET_FOLD_BUILTIN. */
12584 static tree
12585 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
12586 {
12587 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12588 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12589 tree type = TREE_TYPE (TREE_TYPE (fndecl));
12590 switch (code & AARCH64_BUILTIN_CLASS)
12591 {
12592 case AARCH64_BUILTIN_GENERAL:
12593 return aarch64_general_fold_builtin (subcode, type, nargs, args);
12594
12595 case AARCH64_BUILTIN_SVE:
12596 return NULL_TREE;
12597 }
12598 gcc_unreachable ();
12599 }
12600
12601 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
12602 static bool
12603 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
12604 {
12605 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
12606 tree fndecl = gimple_call_fndecl (stmt);
12607 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12608 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12609 gimple *new_stmt = NULL;
12610 switch (code & AARCH64_BUILTIN_CLASS)
12611 {
12612 case AARCH64_BUILTIN_GENERAL:
12613 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
12614 break;
12615
12616 case AARCH64_BUILTIN_SVE:
12617 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
12618 break;
12619 }
12620
12621 if (!new_stmt)
12622 return false;
12623
12624 gsi_replace (gsi, new_stmt, true);
12625 return true;
12626 }
12627
12628 /* Implement TARGET_EXPAND_BUILTIN. */
12629 static rtx
12630 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
12631 {
12632 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
12633 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12634 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12635 switch (code & AARCH64_BUILTIN_CLASS)
12636 {
12637 case AARCH64_BUILTIN_GENERAL:
12638 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
12639
12640 case AARCH64_BUILTIN_SVE:
12641 return aarch64_sve::expand_builtin (subcode, exp, target);
12642 }
12643 gcc_unreachable ();
12644 }
12645
12646 /* Implement TARGET_BUILTIN_DECL. */
12647 static tree
12648 aarch64_builtin_decl (unsigned int code, bool initialize_p)
12649 {
12650 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12651 switch (code & AARCH64_BUILTIN_CLASS)
12652 {
12653 case AARCH64_BUILTIN_GENERAL:
12654 return aarch64_general_builtin_decl (subcode, initialize_p);
12655
12656 case AARCH64_BUILTIN_SVE:
12657 return aarch64_sve::builtin_decl (subcode, initialize_p);
12658 }
12659 gcc_unreachable ();
12660 }
12661
12662 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
12663 to optimize 1.0/sqrt. */
12664
12665 static bool
12666 use_rsqrt_p (machine_mode mode)
12667 {
12668 return (!flag_trapping_math
12669 && flag_unsafe_math_optimizations
12670 && ((aarch64_tune_params.approx_modes->recip_sqrt
12671 & AARCH64_APPROX_MODE (mode))
12672 || flag_mrecip_low_precision_sqrt));
12673 }
12674
12675 /* Function to decide when to use the approximate reciprocal square root
12676 builtin. */
12677
12678 static tree
12679 aarch64_builtin_reciprocal (tree fndecl)
12680 {
12681 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
12682
12683 if (!use_rsqrt_p (mode))
12684 return NULL_TREE;
12685 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
12686 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
12687 switch (code & AARCH64_BUILTIN_CLASS)
12688 {
12689 case AARCH64_BUILTIN_GENERAL:
12690 return aarch64_general_builtin_rsqrt (subcode);
12691
12692 case AARCH64_BUILTIN_SVE:
12693 return NULL_TREE;
12694 }
12695 gcc_unreachable ();
12696 }
12697
12698 /* Emit instruction sequence to compute either the approximate square root
12699 or its approximate reciprocal, depending on the flag RECP, and return
12700 whether the sequence was emitted or not. */
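/* Roughly, the emitted sequence is Newton-Raphson refinement of the
   FRSQRTE estimate x0 ~= 1/sqrt(a):

     x_{n+1} = x_n * (3 - a * x_n^2) / 2

   where FRSQRTS computes the (3 - a * x_n^2) / 2 factor; when !RECP the
   result is multiplied by SRC at the end to turn 1/sqrt(a) into sqrt(a).  */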
12701
12702 bool
12703 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
12704 {
12705 machine_mode mode = GET_MODE (dst);
12706
12707 if (GET_MODE_INNER (mode) == HFmode)
12708 {
12709 gcc_assert (!recp);
12710 return false;
12711 }
12712
12713 if (!recp)
12714 {
12715 if (!(flag_mlow_precision_sqrt
12716 || (aarch64_tune_params.approx_modes->sqrt
12717 & AARCH64_APPROX_MODE (mode))))
12718 return false;
12719
12720 if (flag_finite_math_only
12721 || flag_trapping_math
12722 || !flag_unsafe_math_optimizations
12723 || optimize_function_for_size_p (cfun))
12724 return false;
12725 }
12726 else
12727 /* Caller assumes we cannot fail. */
12728 gcc_assert (use_rsqrt_p (mode));
12729
12730 machine_mode mmsk = (VECTOR_MODE_P (mode)
12731 ? related_int_vector_mode (mode).require ()
12732 : int_mode_for_mode (mode).require ());
12733 rtx xmsk = gen_reg_rtx (mmsk);
12734 if (!recp)
12735 /* When calculating the approximate square root, compare the
12736 argument with 0.0 and create a mask. */
12737 emit_insn (gen_rtx_SET (xmsk,
12738 gen_rtx_NEG (mmsk,
12739 gen_rtx_EQ (mmsk, src,
12740 CONST0_RTX (mode)))));
12741
12742 /* Estimate the approximate reciprocal square root. */
12743 rtx xdst = gen_reg_rtx (mode);
12744 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
12745
12746 /* Iterate over the series twice for SF and thrice for DF. */
12747 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12748
12749 /* Optionally run one fewer iteration for faster performance at the
12750 cost of some accuracy. */
12751 if ((recp && flag_mrecip_low_precision_sqrt)
12752 || (!recp && flag_mlow_precision_sqrt))
12753 iterations--;
12754
12755 /* Iterate over the series to calculate the approximate reciprocal square
12756 root. */
12757 rtx x1 = gen_reg_rtx (mode);
12758 while (iterations--)
12759 {
12760 rtx x2 = gen_reg_rtx (mode);
12761 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
12762
12763 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
12764
12765 if (iterations > 0)
12766 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
12767 }
12768
12769 if (!recp)
12770 {
12771 /* Qualify the approximate reciprocal square root when the argument is
12772 0.0 by squashing the intermediate result to 0.0. */
12773 rtx xtmp = gen_reg_rtx (mmsk);
12774 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
12775 gen_rtx_SUBREG (mmsk, xdst, 0)));
12776 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
12777
12778 /* Calculate the approximate square root. */
12779 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
12780 }
12781
12782 /* Finalize the approximation. */
12783 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
12784
12785 return true;
12786 }
12787
12788 /* Emit the instruction sequence to compute the approximation for the division
12789 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
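/* Roughly, the sequence refines the FRECPE estimate x0 ~= 1/DEN with
   Newton-Raphson steps:

     x_{n+1} = x_n * (2 - DEN * x_n)

   where FRECPS computes the (2 - DEN * x_n) factor; a final multiply by
   NUM (when NUM is not 1.0) turns the reciprocal into the quotient.  */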
12790
12791 bool
12792 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
12793 {
12794 machine_mode mode = GET_MODE (quo);
12795
12796 if (GET_MODE_INNER (mode) == HFmode)
12797 return false;
12798
12799 bool use_approx_division_p = (flag_mlow_precision_div
12800 || (aarch64_tune_params.approx_modes->division
12801 & AARCH64_APPROX_MODE (mode)));
12802
12803 if (!flag_finite_math_only
12804 || flag_trapping_math
12805 || !flag_unsafe_math_optimizations
12806 || optimize_function_for_size_p (cfun)
12807 || !use_approx_division_p)
12808 return false;
12809
12810 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
12811 return false;
12812
12813 /* Estimate the approximate reciprocal. */
12814 rtx xrcp = gen_reg_rtx (mode);
12815 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
12816
12817 /* Iterate over the series twice for SF and thrice for DF. */
12818 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
12819
12820 /* Optionally run one fewer iteration for faster performance, at the
12821 cost of some accuracy. */
12822 if (flag_mlow_precision_div)
12823 iterations--;
12824
12825 /* Iterate over the series to calculate the approximate reciprocal. */
12826 rtx xtmp = gen_reg_rtx (mode);
12827 while (iterations--)
12828 {
12829 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
12830
12831 if (iterations > 0)
12832 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
12833 }
12834
12835 if (num != CONST1_RTX (mode))
12836 {
12837 /* As the approximate reciprocal of DEN is already calculated, only
12838 calculate the approximate division when NUM is not 1.0. */
12839 rtx xnum = force_reg (mode, num);
12840 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
12841 }
12842
12843 /* Finalize the approximation. */
12844 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
12845 return true;
12846 }
12847
12848 /* Return the number of instructions that can be issued per cycle. */
12849 static int
12850 aarch64_sched_issue_rate (void)
12851 {
12852 return aarch64_tune_params.issue_rate;
12853 }
12854
12855 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
12856 static int
12857 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
12858 {
12859 if (DEBUG_INSN_P (insn))
12860 return more;
12861
12862 rtx_code code = GET_CODE (PATTERN (insn));
12863 if (code == USE || code == CLOBBER)
12864 return more;
12865
12866 if (get_attr_type (insn) == TYPE_NO_INSN)
12867 return more;
12868
12869 return more - 1;
12870 }
12871
12872 static int
12873 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
12874 {
12875 int issue_rate = aarch64_sched_issue_rate ();
12876
12877 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
12878 }
12879
12880
12881 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
12882 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
12883 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
12884
12885 static int
12886 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
12887 int ready_index)
12888 {
12889 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
12890 }
12891
12892
12893 /* Vectorizer cost model target hooks. */
12894
12895 /* Implement targetm.vectorize.builtin_vectorization_cost. */
12896 static int
12897 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
12898 tree vectype,
12899 int misalign ATTRIBUTE_UNUSED)
12900 {
12901 unsigned elements;
12902 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
12903 bool fp = false;
12904
12905 if (vectype != NULL)
12906 fp = FLOAT_TYPE_P (vectype);
12907
12908 switch (type_of_cost)
12909 {
12910 case scalar_stmt:
12911 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
12912
12913 case scalar_load:
12914 return costs->scalar_load_cost;
12915
12916 case scalar_store:
12917 return costs->scalar_store_cost;
12918
12919 case vector_stmt:
12920 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12921
12922 case vector_load:
12923 return costs->vec_align_load_cost;
12924
12925 case vector_store:
12926 return costs->vec_store_cost;
12927
12928 case vec_to_scalar:
12929 return costs->vec_to_scalar_cost;
12930
12931 case scalar_to_vec:
12932 return costs->scalar_to_vec_cost;
12933
12934 case unaligned_load:
12935 case vector_gather_load:
12936 return costs->vec_unalign_load_cost;
12937
12938 case unaligned_store:
12939 case vector_scatter_store:
12940 return costs->vec_unalign_store_cost;
12941
12942 case cond_branch_taken:
12943 return costs->cond_taken_branch_cost;
12944
12945 case cond_branch_not_taken:
12946 return costs->cond_not_taken_branch_cost;
12947
12948 case vec_perm:
12949 return costs->vec_permute_cost;
12950
12951 case vec_promote_demote:
12952 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
12953
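/* A vec_construct of N elements is costed below at roughly one operation
   per pair of elements plus one, e.g. 3 for a 4-element vector.  */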
12954 case vec_construct:
12955 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
12956 return elements / 2 + 1;
12957
12958 default:
12959 gcc_unreachable ();
12960 }
12961 }
12962
12963 /* Return true if STMT_INFO extends the result of a load. */
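/* For example, the widening conversion in "int64_t y = *p" with
   "int32_t *p", where the converted value comes straight from a load,
   is such a statement.  */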
12964 static bool
12965 aarch64_extending_load_p (stmt_vec_info stmt_info)
12966 {
12967 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12968 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12969 return false;
12970
12971 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
12972 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12973 tree rhs_type = TREE_TYPE (rhs);
12974 if (!INTEGRAL_TYPE_P (lhs_type)
12975 || !INTEGRAL_TYPE_P (rhs_type)
12976 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
12977 return false;
12978
12979 stmt_vec_info def_stmt_info = stmt_info->vinfo->lookup_def (rhs);
12980 return (def_stmt_info
12981 && STMT_VINFO_DATA_REF (def_stmt_info)
12982 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
12983 }
12984
12985 /* Return true if STMT_INFO is an integer truncation. */
12986 static bool
12987 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
12988 {
12989 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
12990 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
12991 return false;
12992
12993 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
12994 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
12995 return (INTEGRAL_TYPE_P (lhs_type)
12996 && INTEGRAL_TYPE_P (rhs_type)
12997 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
12998 }
12999
13000 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13001 for STMT_INFO, which has cost kind KIND. Adjust the cost as necessary
13002 for SVE targets. */
13003 static unsigned int
13004 aarch64_sve_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
13005 unsigned int stmt_cost)
13006 {
13007 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13008 vector register size or number of units. Integer promotions of this
13009 type therefore map to SXT[BHW] or UXT[BHW].
13010
13011 Most loads have extending forms that can do the sign or zero extension
13012 on the fly. Optimistically assume that a load followed by an extension
13013 will fold to this form during combine, and that the extension therefore
13014 comes for free. */
13015 if (kind == vector_stmt && aarch64_extending_load_p (stmt_info))
13016 stmt_cost = 0;
13017
13018 /* For similar reasons, vector_stmt integer truncations are a no-op,
13019 because we can just ignore the unused upper bits of the source. */
13020 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13021 stmt_cost = 0;
13022
13023 return stmt_cost;
13024 }
13025
13026 /* Implement targetm.vectorize.add_stmt_cost. */
13027 static unsigned
13028 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
13029 struct _stmt_vec_info *stmt_info, int misalign,
13030 enum vect_cost_model_location where)
13031 {
13032 unsigned *cost = (unsigned *) data;
13033 unsigned retval = 0;
13034
13035 if (flag_vect_cost_model)
13036 {
13037 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
13038 int stmt_cost =
13039 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13040
13041 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13042 stmt_cost = aarch64_sve_adjust_stmt_cost (kind, stmt_info, stmt_cost);
13043
13044 /* Statements in an inner loop relative to the loop being
13045 vectorized are weighted more heavily. The value here is
13046 arbitrary and could potentially be improved with analysis. */
13047 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
13048 count *= 50; /* FIXME */
13049
13050 retval = (unsigned) (count * stmt_cost);
13051 cost[where] += retval;
13052 }
13053
13054 return retval;
13055 }
13056
13057 static void initialize_aarch64_code_model (struct gcc_options *);
13058
13059 /* Parse the TO_PARSE string and put the architecture struct that it
13060 selects into RES and the architectural features into ISA_FLAGS.
13061 Return an aarch64_parse_opt_result describing the parse result.
13062 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13063 When the TO_PARSE string contains an invalid extension,
13064 a copy of the string is created and stored in INVALID_EXTENSION. */
13065
13066 static enum aarch64_parse_opt_result
13067 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13068 uint64_t *isa_flags, std::string *invalid_extension)
13069 {
13070 const char *ext;
13071 const struct processor *arch;
13072 size_t len;
13073
13074 ext = strchr (to_parse, '+');
13075
13076 if (ext != NULL)
13077 len = ext - to_parse;
13078 else
13079 len = strlen (to_parse);
13080
13081 if (len == 0)
13082 return AARCH64_PARSE_MISSING_ARG;
13083
13084
13085 /* Loop through the list of supported ARCHes to find a match. */
13086 for (arch = all_architectures; arch->name != NULL; arch++)
13087 {
13088 if (strlen (arch->name) == len
13089 && strncmp (arch->name, to_parse, len) == 0)
13090 {
13091 uint64_t isa_temp = arch->flags;
13092
13093 if (ext != NULL)
13094 {
13095 /* TO_PARSE string contains at least one extension. */
13096 enum aarch64_parse_opt_result ext_res
13097 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13098
13099 if (ext_res != AARCH64_PARSE_OK)
13100 return ext_res;
13101 }
13102 /* Extension parsing was successful. Record the resulting
13103 arch and ISA flags. */
13104 *res = arch;
13105 *isa_flags = isa_temp;
13106 return AARCH64_PARSE_OK;
13107 }
13108 }
13109
13110 /* ARCH name not found in list. */
13111 return AARCH64_PARSE_INVALID_ARG;
13112 }
13113
13114 /* Parse the TO_PARSE string and put the result tuning in RES and the
13115 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13116 describing the parse result. If there is an error parsing, RES and
13117 ISA_FLAGS are left unchanged.
13118 When the TO_PARSE string contains an invalid extension,
13119 a copy of the string is created and stored in INVALID_EXTENSION. */
13120
13121 static enum aarch64_parse_opt_result
13122 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13123 uint64_t *isa_flags, std::string *invalid_extension)
13124 {
13125 const char *ext;
13126 const struct processor *cpu;
13127 size_t len;
13128
13129 ext = strchr (to_parse, '+');
13130
13131 if (ext != NULL)
13132 len = ext - to_parse;
13133 else
13134 len = strlen (to_parse);
13135
13136 if (len == 0)
13137 return AARCH64_PARSE_MISSING_ARG;
13138
13139
13140 /* Loop through the list of supported CPUs to find a match. */
13141 for (cpu = all_cores; cpu->name != NULL; cpu++)
13142 {
13143 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13144 {
13145 uint64_t isa_temp = cpu->flags;
13146
13147
13148 if (ext != NULL)
13149 {
13150 /* TO_PARSE string contains at least one extension. */
13151 enum aarch64_parse_opt_result ext_res
13152 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13153
13154 if (ext_res != AARCH64_PARSE_OK)
13155 return ext_res;
13156 }
13157 /* Extension parsing was successful. Record the resulting
13158 cpu and ISA flags. */
13159 *res = cpu;
13160 *isa_flags = isa_temp;
13161 return AARCH64_PARSE_OK;
13162 }
13163 }
13164
13165 /* CPU name not found in list. */
13166 return AARCH64_PARSE_INVALID_ARG;
13167 }
13168
13169 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13170 Return an aarch64_parse_opt_result describing the parse result.
13171 If the parsing fails, RES is left unchanged. */
13172
13173 static enum aarch64_parse_opt_result
13174 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13175 {
13176 const struct processor *cpu;
13177
13178 /* Loop through the list of supported CPUs to find a match. */
13179 for (cpu = all_cores; cpu->name != NULL; cpu++)
13180 {
13181 if (strcmp (cpu->name, to_parse) == 0)
13182 {
13183 *res = cpu;
13184 return AARCH64_PARSE_OK;
13185 }
13186 }
13187
13188 /* CPU name not found in list. */
13189 return AARCH64_PARSE_INVALID_ARG;
13190 }
13191
13192 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13193 described in FLAG. If it is, return the index bit for that fusion type.
13194 If not, report an error (printing OPTION_NAME) and return zero.
13195
13196 static unsigned int
13197 aarch64_parse_one_option_token (const char *token,
13198 size_t length,
13199 const struct aarch64_flag_desc *flag,
13200 const char *option_name)
13201 {
13202 for (; flag->name != NULL; flag++)
13203 {
13204 if (length == strlen (flag->name)
13205 && !strncmp (flag->name, token, length))
13206 return flag->flag;
13207 }
13208
13209 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
13210 return 0;
13211 }
13212
13213 /* Parse OPTION which is a comma-separated list of flags to enable.
13214 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
13215 default state we inherit from the CPU tuning structures. OPTION_NAME
13216 gives the top-level option we are parsing in the -moverride string,
13217 for use in error messages. */
13218
13219 static unsigned int
13220 aarch64_parse_boolean_options (const char *option,
13221 const struct aarch64_flag_desc *flags,
13222 unsigned int initial_state,
13223 const char *option_name)
13224 {
13225 const char separator = '.';
13226 const char* specs = option;
13227 const char* ntoken = option;
13228 unsigned int found_flags = initial_state;
13229
13230 while ((ntoken = strchr (specs, separator)))
13231 {
13232 size_t token_length = ntoken - specs;
13233 unsigned token_ops = aarch64_parse_one_option_token (specs,
13234 token_length,
13235 flags,
13236 option_name);
13237 /* If we find "none" (or, for simplicity's sake, an error) anywhere
13238 in the token stream, reset the supported operations. So:
13239
13240 adrp+add.cmp+branch.none.adrp+add
13241
13242 would have the result of turning on only adrp+add fusion. */
13243 if (!token_ops)
13244 found_flags = 0;
13245
13246 found_flags |= token_ops;
13247 specs = ++ntoken;
13248 }
13249
13250 /* The string ended with a separator and no final token; report an error. */
13251 if (!(*specs))
13252 {
13253 error ("%s string ill-formed\n", option_name);
13254 return 0;
13255 }
13256
13257 /* We still have one more token to parse. */
13258 size_t token_length = strlen (specs);
13259 unsigned token_ops = aarch64_parse_one_option_token (specs,
13260 token_length,
13261 flags,
13262 option_name);
13263 if (!token_ops)
13264 found_flags = 0;
13265
13266 found_flags |= token_ops;
13267 return found_flags;
13268 }
13269
13270 /* Support for overriding instruction fusion. */
13271
13272 static void
13273 aarch64_parse_fuse_string (const char *fuse_string,
13274 struct tune_params *tune)
13275 {
13276 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
13277 aarch64_fusible_pairs,
13278 tune->fusible_ops,
13279 "fuse=");
13280 }
13281
13282 /* Support for overriding other tuning flags. */
13283
13284 static void
13285 aarch64_parse_tune_string (const char *tune_string,
13286 struct tune_params *tune)
13287 {
13288 tune->extra_tuning_flags
13289 = aarch64_parse_boolean_options (tune_string,
13290 aarch64_tuning_flags,
13291 tune->extra_tuning_flags,
13292 "tune=");
13293 }
13294
13295 /* Parse the sve_width tuning -moverride string in TUNE_STRING.
13296 Accept the valid SVE vector widths allowed by
13297 aarch64_sve_vector_bits_enum and use it to override sve_width
13298 in TUNE. */
13299
13300 static void
13301 aarch64_parse_sve_width_string (const char *tune_string,
13302 struct tune_params *tune)
13303 {
13304 int width = -1;
13305
13306 int n = sscanf (tune_string, "%d", &width);
13307 if (n == EOF)
13308 {
13309 error ("invalid format for sve_width");
13310 return;
13311 }
13312 switch (width)
13313 {
13314 case SVE_128:
13315 case SVE_256:
13316 case SVE_512:
13317 case SVE_1024:
13318 case SVE_2048:
13319 break;
13320 default:
13321 error ("invalid sve_width value: %d", width);
13322 }
13323 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
13324 }
13325
13326 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
13327 we understand. If it is, extract the option string and hand it off to
13328 the appropriate function. */
13329
13330 void
13331 aarch64_parse_one_override_token (const char* token,
13332 size_t length,
13333 struct tune_params *tune)
13334 {
13335 const struct aarch64_tuning_override_function *fn
13336 = aarch64_tuning_override_functions;
13337
13338 const char *option_part = strchr (token, '=');
13339 if (!option_part)
13340 {
13341 error ("tuning string missing in option (%s)", token);
13342 return;
13343 }
13344
13345 /* Get the length of the option name. */
13346 length = option_part - token;
13347 /* Skip the '=' to get to the option string. */
13348 option_part++;
13349
13350 for (; fn->name != NULL; fn++)
13351 {
13352 if (!strncmp (fn->name, token, length))
13353 {
13354 fn->parse_override (option_part, tune);
13355 return;
13356 }
13357 }
13358
13359 error ("unknown tuning option (%s)",token);
13360 return;
13361 }
13362
13363 /* Validate and clamp the TLS size according to the code model. */
13364
13365 static void
13366 initialize_aarch64_tls_size (struct gcc_options *opts)
13367 {
13368 if (aarch64_tls_size == 0)
13369 aarch64_tls_size = 24;
13370
13371 switch (opts->x_aarch64_cmodel_var)
13372 {
13373 case AARCH64_CMODEL_TINY:
13374 /* Both the default and maximum TLS size allowed under tiny are 1M, which
13375 needs two instructions to address, so we clamp the size to 24. */
13376 if (aarch64_tls_size > 24)
13377 aarch64_tls_size = 24;
13378 break;
13379 case AARCH64_CMODEL_SMALL:
13380 /* The maximum TLS size allowed under small is 4G. */
13381 if (aarch64_tls_size > 32)
13382 aarch64_tls_size = 32;
13383 break;
13384 case AARCH64_CMODEL_LARGE:
13385 /* The maximum TLS size allowed under large is 16E.
13386 FIXME: 16E should be 64bit, we only support 48bit offset now. */
13387 if (aarch64_tls_size > 48)
13388 aarch64_tls_size = 48;
13389 break;
13390 default:
13391 gcc_unreachable ();
13392 }
13393
13394 return;
13395 }
13396
13397 /* Parse STRING looking for options in the format:
13398 string :: option:string
13399 option :: name=substring
13400 name :: {a-z}
13401 substring :: defined by option. */
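/* As an illustration, an override string such as
     "fuse=adrp+add.cmp+branch:sve_width=256"
   is split at ':' into two tokens: "fuse=..." is handed to
   aarch64_parse_fuse_string and "sve_width=256" to
   aarch64_parse_sve_width_string.  */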
13402
13403 static void
13404 aarch64_parse_override_string (const char* input_string,
13405 struct tune_params* tune)
13406 {
13407 const char separator = ':';
13408 size_t string_length = strlen (input_string) + 1;
13409 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
13410 char *string = string_root;
13411 strncpy (string, input_string, string_length);
13412 string[string_length - 1] = '\0';
13413
13414 char* ntoken = string;
13415
13416 while ((ntoken = strchr (string, separator)))
13417 {
13418 size_t token_length = ntoken - string;
13419 /* NUL-terminate this substring so it can be handled as a string. */
13420 *ntoken = '\0';
13421 aarch64_parse_one_override_token (string, token_length, tune);
13422 string = ++ntoken;
13423 }
13424
13425 /* One last option to parse. */
13426 aarch64_parse_one_override_token (string, strlen (string), tune);
13427 free (string_root);
13428 }
13429
13430
13431 static void
13432 aarch64_override_options_after_change_1 (struct gcc_options *opts)
13433 {
13434 if (accepted_branch_protection_string)
13435 {
13436 opts->x_aarch64_branch_protection_string
13437 = xstrdup (accepted_branch_protection_string);
13438 }
13439
13440 /* PR 70044: We have to be careful about being called multiple times for the
13441 same function. This means all changes should be repeatable. */
13442
13443 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
13444 Disable the frame pointer flag so the mid-end will not use a frame
13445 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
13446 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
13447 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
13448 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
13449 if (opts->x_flag_omit_frame_pointer == 0)
13450 opts->x_flag_omit_frame_pointer = 2;
13451
13452 /* If not optimizing for size, set the default
13453 alignment to what the target wants. */
13454 if (!opts->x_optimize_size)
13455 {
13456 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
13457 opts->x_str_align_loops = aarch64_tune_params.loop_align;
13458 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
13459 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
13460 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
13461 opts->x_str_align_functions = aarch64_tune_params.function_align;
13462 }
13463
13464 /* We default to no pc-relative literal loads. */
13465
13466 aarch64_pcrelative_literal_loads = false;
13467
13468 /* If -mpc-relative-literal-loads is set on the command line, this
13469 implies that the user asked for PC relative literal loads. */
13470 if (opts->x_pcrelative_literal_loads == 1)
13471 aarch64_pcrelative_literal_loads = true;
13472
13473 /* In the tiny memory model it makes no sense to disallow PC relative
13474 literal pool loads. */
13475 if (aarch64_cmodel == AARCH64_CMODEL_TINY
13476 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13477 aarch64_pcrelative_literal_loads = true;
13478
13479 /* When enabling the lower precision Newton series for the square root, also
13480 enable it for the reciprocal square root, since the latter is an
13481 intermediary step for the former. */
13482 if (flag_mlow_precision_sqrt)
13483 flag_mrecip_low_precision_sqrt = true;
13484 }
13485
13486 /* 'Unpack' the internal tuning structs and update the options
13487 in OPTS. The caller must have set up selected_tune and selected_arch
13488 as all the other target-specific codegen decisions are
13489 derived from them. */
13490
13491 void
13492 aarch64_override_options_internal (struct gcc_options *opts)
13493 {
13494 aarch64_tune_flags = selected_tune->flags;
13495 aarch64_tune = selected_tune->sched_core;
13496 /* Make a copy of the tuning parameters attached to the core, which
13497 we may later overwrite. */
13498 aarch64_tune_params = *(selected_tune->tune);
13499 aarch64_architecture_version = selected_arch->architecture_version;
13500
13501 if (opts->x_aarch64_override_tune_string)
13502 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
13503 &aarch64_tune_params);
13504
13505 /* This target defaults to strict volatile bitfields. */
13506 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
13507 opts->x_flag_strict_volatile_bitfields = 1;
13508
13509 if (aarch64_stack_protector_guard == SSP_GLOBAL
13510 && opts->x_aarch64_stack_protector_guard_offset_str)
13511 {
13512 error ("incompatible options %<-mstack-protector-guard=global%> and "
13513 "%<-mstack-protector-guard-offset=%s%>",
13514 aarch64_stack_protector_guard_offset_str);
13515 }
13516
13517 if (aarch64_stack_protector_guard == SSP_SYSREG
13518 && !(opts->x_aarch64_stack_protector_guard_offset_str
13519 && opts->x_aarch64_stack_protector_guard_reg_str))
13520 {
13521 error ("both %<-mstack-protector-guard-offset%> and "
13522 "%<-mstack-protector-guard-reg%> must be used "
13523 "with %<-mstack-protector-guard=sysreg%>");
13524 }
13525
13526 if (opts->x_aarch64_stack_protector_guard_reg_str)
13527 {
13528 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
13529 error ("specify a system register with a small string length.");
13530 }
13531
13532 if (opts->x_aarch64_stack_protector_guard_offset_str)
13533 {
13534 char *end;
13535 const char *str = aarch64_stack_protector_guard_offset_str;
13536 errno = 0;
13537 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
13538 if (!*str || *end || errno)
13539 error ("%qs is not a valid offset in %qs", str,
13540 "-mstack-protector-guard-offset=");
13541 aarch64_stack_protector_guard_offset = offs;
13542 }
13543
13544 initialize_aarch64_code_model (opts);
13545 initialize_aarch64_tls_size (opts);
13546
13547 int queue_depth = 0;
13548 switch (aarch64_tune_params.autoprefetcher_model)
13549 {
13550 case tune_params::AUTOPREFETCHER_OFF:
13551 queue_depth = -1;
13552 break;
13553 case tune_params::AUTOPREFETCHER_WEAK:
13554 queue_depth = 0;
13555 break;
13556 case tune_params::AUTOPREFETCHER_STRONG:
13557 queue_depth = max_insn_queue_index + 1;
13558 break;
13559 default:
13560 gcc_unreachable ();
13561 }
13562
13563 /* We don't mind passing in global_options_set here as we don't use
13564 the *options_set structs anyway. */
13565 SET_OPTION_IF_UNSET (opts, &global_options_set,
13566 param_sched_autopref_queue_depth, queue_depth);
13567
13568 /* Set up parameters to be used in prefetching algorithm. Do not
13569 override the defaults unless we are tuning for a core we have
13570 researched values for. */
13571 if (aarch64_tune_params.prefetch->num_slots > 0)
13572 SET_OPTION_IF_UNSET (opts, &global_options_set,
13573 param_simultaneous_prefetches,
13574 aarch64_tune_params.prefetch->num_slots);
13575 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
13576 SET_OPTION_IF_UNSET (opts, &global_options_set,
13577 param_l1_cache_size,
13578 aarch64_tune_params.prefetch->l1_cache_size);
13579 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
13580 SET_OPTION_IF_UNSET (opts, &global_options_set,
13581 param_l1_cache_line_size,
13582 aarch64_tune_params.prefetch->l1_cache_line_size);
13583 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
13584 SET_OPTION_IF_UNSET (opts, &global_options_set,
13585 param_l2_cache_size,
13586 aarch64_tune_params.prefetch->l2_cache_size);
13587 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
13588 SET_OPTION_IF_UNSET (opts, &global_options_set,
13589 param_prefetch_dynamic_strides, 0);
13590 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
13591 SET_OPTION_IF_UNSET (opts, &global_options_set,
13592 param_prefetch_minimum_stride,
13593 aarch64_tune_params.prefetch->minimum_stride);
13594
13595 /* Use the alternative scheduling-pressure algorithm by default. */
13596 SET_OPTION_IF_UNSET (opts, &global_options_set,
13597 param_sched_pressure_algorithm,
13598 SCHED_PRESSURE_MODEL);
13599
13600 /* Validate the guard size. */
13601 int guard_size = param_stack_clash_protection_guard_size;
13602
13603 if (guard_size != 12 && guard_size != 16)
13604 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
13605 "size. Given value %d (%llu KB) is out of range",
13606 guard_size, (1ULL << guard_size) / 1024ULL);
13607
13608 /* Enforce that the probing interval is the same as the guard size so the
13609 mid-end does the right thing. */
13610 SET_OPTION_IF_UNSET (opts, &global_options_set,
13611 param_stack_clash_protection_probe_interval,
13612 guard_size);
13613
13614 /* The SET_OPTION_IF_UNSET calls won't update the value if the user has
13615 explicitly set one, which means we need to validate that the probing
13616 interval and the guard size are equal. */
13617 int probe_interval
13618 = param_stack_clash_protection_probe_interval;
13619 if (guard_size != probe_interval)
13620 error ("stack clash guard size %<%d%> must be equal to probing interval "
13621 "%<%d%>", guard_size, probe_interval);
13622
13623 /* Enable software prefetching at the specified optimization level for
13624 CPUs that have prefetch tuning data. Lower the optimization level
13625 threshold by 1 when profiling is enabled. */
13626 if (opts->x_flag_prefetch_loop_arrays < 0
13627 && !opts->x_optimize_size
13628 && aarch64_tune_params.prefetch->default_opt_level >= 0
13629 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
13630 opts->x_flag_prefetch_loop_arrays = 1;
13631
13632 if (opts->x_aarch64_arch_string == NULL)
13633 opts->x_aarch64_arch_string = selected_arch->name;
13634 if (opts->x_aarch64_cpu_string == NULL)
13635 opts->x_aarch64_cpu_string = selected_cpu->name;
13636 if (opts->x_aarch64_tune_string == NULL)
13637 opts->x_aarch64_tune_string = selected_tune->name;
13638
13639 aarch64_override_options_after_change_1 (opts);
13640 }
13641
13642 /* Print a hint with a suggestion for a core or architecture name that
13643 most closely resembles what the user passed in STR. ARCH is true if
13644 the user is asking for an architecture name. ARCH is false if the user
13645 is asking for a core name. */
13646
13647 static void
13648 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
13649 {
13650 auto_vec<const char *> candidates;
13651 const struct processor *entry = arch ? all_architectures : all_cores;
13652 for (; entry->name != NULL; entry++)
13653 candidates.safe_push (entry->name);
13654
13655 #ifdef HAVE_LOCAL_CPU_DETECT
13656 /* Add also "native" as possible value. */
13657 if (arch)
13658 candidates.safe_push ("native");
13659 #endif
13660
13661 char *s;
13662 const char *hint = candidates_list_and_hint (str, s, candidates);
13663 if (hint)
13664 inform (input_location, "valid arguments are: %s;"
13665 " did you mean %qs?", s, hint);
13666 else
13667 inform (input_location, "valid arguments are: %s", s);
13668
13669 XDELETEVEC (s);
13670 }
13671
13672 /* Print a hint with a suggestion for a core name that most closely resembles
13673 what the user passed in STR. */
13674
13675 inline static void
13676 aarch64_print_hint_for_core (const char *str)
13677 {
13678 aarch64_print_hint_for_core_or_arch (str, false);
13679 }
13680
13681 /* Print a hint with a suggestion for an architecture name that most closely
13682 resembles what the user passed in STR. */
13683
13684 inline static void
13685 aarch64_print_hint_for_arch (const char *str)
13686 {
13687 aarch64_print_hint_for_core_or_arch (str, true);
13688 }
13689
13690
13691 /* Print a hint with a suggestion for an extension name
13692 that most closely resembles what the user passed in STR. */
13693
13694 void
13695 aarch64_print_hint_for_extensions (const std::string &str)
13696 {
13697 auto_vec<const char *> candidates;
13698 aarch64_get_all_extension_candidates (&candidates);
13699 char *s;
13700 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
13701 if (hint)
13702 inform (input_location, "valid arguments are: %s;"
13703 " did you mean %qs?", s, hint);
13704 else
13705 inform (input_location, "valid arguments are: %s;", s);
13706
13707 XDELETEVEC (s);
13708 }
13709
13710 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
13711 specified in STR and throw errors if appropriate. Put the results, if
13712 they are valid, in RES and ISA_FLAGS. Return whether the option is
13713 valid. */
13714
13715 static bool
13716 aarch64_validate_mcpu (const char *str, const struct processor **res,
13717 uint64_t *isa_flags)
13718 {
13719 std::string invalid_extension;
13720 enum aarch64_parse_opt_result parse_res
13721 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
13722
13723 if (parse_res == AARCH64_PARSE_OK)
13724 return true;
13725
13726 switch (parse_res)
13727 {
13728 case AARCH64_PARSE_MISSING_ARG:
13729 error ("missing cpu name in %<-mcpu=%s%>", str);
13730 break;
13731 case AARCH64_PARSE_INVALID_ARG:
13732 error ("unknown value %qs for %<-mcpu%>", str);
13733 aarch64_print_hint_for_core (str);
13734 break;
13735 case AARCH64_PARSE_INVALID_FEATURE:
13736 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
13737 invalid_extension.c_str (), str);
13738 aarch64_print_hint_for_extensions (invalid_extension);
13739 break;
13740 default:
13741 gcc_unreachable ();
13742 }
13743
13744 return false;
13745 }
13746
13747 /* Parses CONST_STR for branch protection features specified in
13748 aarch64_branch_protect_types, and sets any global variables required. Returns
13749 the parsing result and assigns LAST_STR to the last processed token from
13750 CONST_STR so that it can be used for error reporting. */
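/* For example, a string such as "pac-ret+leaf+bti" is tokenized at '+':
   "pac-ret" matches a top-level type, "leaf" one of its subtypes, and
   "bti" a further top-level type.  */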
13751
13752 static enum
13753 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
13754 char** last_str)
13755 {
13756 char *str_root = xstrdup (const_str);
13757 char* token_save = NULL;
13758 char *str = strtok_r (str_root, "+", &token_save);
13759 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
13760 if (!str)
13761 res = AARCH64_PARSE_MISSING_ARG;
13762 else
13763 {
13764 char *next_str = strtok_r (NULL, "+", &token_save);
13765 /* Reset the branch protection features to their defaults. */
13766 aarch64_handle_no_branch_protection (NULL, NULL);
13767
13768 while (str && res == AARCH64_PARSE_OK)
13769 {
13770 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
13771 bool found = false;
13772 /* Search for this type. */
13773 while (type && type->name && !found && res == AARCH64_PARSE_OK)
13774 {
13775 if (strcmp (str, type->name) == 0)
13776 {
13777 found = true;
13778 res = type->handler (str, next_str);
13779 str = next_str;
13780 next_str = strtok_r (NULL, "+", &token_save);
13781 }
13782 else
13783 type++;
13784 }
13785 if (found && res == AARCH64_PARSE_OK)
13786 {
13787 bool found_subtype = true;
13788 /* Loop through each token until we find one that isn't a
13789 subtype. */
13790 while (found_subtype)
13791 {
13792 found_subtype = false;
13793 const aarch64_branch_protect_type *subtype = type->subtypes;
13794 /* Search for the subtype. */
13795 while (str && subtype && subtype->name && !found_subtype
13796 && res == AARCH64_PARSE_OK)
13797 {
13798 if (strcmp (str, subtype->name) == 0)
13799 {
13800 found_subtype = true;
13801 res = subtype->handler (str, next_str);
13802 str = next_str;
13803 next_str = strtok_r (NULL, "+", &token_save);
13804 }
13805 else
13806 subtype++;
13807 }
13808 }
13809 }
13810 else if (!found)
13811 res = AARCH64_PARSE_INVALID_ARG;
13812 }
13813 }
13814 /* Copy the last processed token into the argument to pass it back.
13815 Used by option and attribute validation to print the offending token. */
13816 if (last_str)
13817 {
13818 if (str) strcpy (*last_str, str);
13819 else *last_str = NULL;
13820 }
13821 if (res == AARCH64_PARSE_OK)
13822 {
13823 /* If needed, allocate the accepted string, then copy in const_str.
13824 Used by aarch64_override_options_after_change_1. */
13825 if (!accepted_branch_protection_string)
13826 accepted_branch_protection_string = (char *) xmalloc (
13827 BRANCH_PROTECT_STR_MAX
13828 + 1);
13829 strncpy (accepted_branch_protection_string, const_str,
13830 BRANCH_PROTECT_STR_MAX + 1);
13831 /* Forcibly null-terminate. */
13832 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
13833 }
13834 return res;
13835 }
13836
13837 static bool
13838 aarch64_validate_mbranch_protection (const char *const_str)
13839 {
13840 /* Allow for the terminating NUL of the longest possible token. */
13840 char *str = (char *) xmalloc (strlen (const_str) + 1);
13841 enum aarch64_parse_opt_result res =
13842 aarch64_parse_branch_protection (const_str, &str);
13843 if (res == AARCH64_PARSE_INVALID_ARG)
13844 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
13845 else if (res == AARCH64_PARSE_MISSING_ARG)
13846 error ("missing argument for %<-mbranch-protection=%>");
13847 free (str);
13848 return res == AARCH64_PARSE_OK;
13849 }
13850
13851 /* Validate a command-line -march option. Parse the arch and extensions
13852 (if any) specified in STR and throw errors if appropriate. Put the
13853 results, if they are valid, in RES and ISA_FLAGS. Return whether the
13854 option is valid. */
13855
13856 static bool
13857 aarch64_validate_march (const char *str, const struct processor **res,
13858 uint64_t *isa_flags)
13859 {
13860 std::string invalid_extension;
13861 enum aarch64_parse_opt_result parse_res
13862 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
13863
13864 if (parse_res == AARCH64_PARSE_OK)
13865 return true;
13866
13867 switch (parse_res)
13868 {
13869 case AARCH64_PARSE_MISSING_ARG:
13870 error ("missing arch name in %<-march=%s%>", str);
13871 break;
13872 case AARCH64_PARSE_INVALID_ARG:
13873 error ("unknown value %qs for %<-march%>", str);
13874 aarch64_print_hint_for_arch (str);
13875 break;
13876 case AARCH64_PARSE_INVALID_FEATURE:
13877 error ("invalid feature modifier %qs in %<-march=%s%>",
13878 invalid_extension.c_str (), str);
13879 aarch64_print_hint_for_extensions (invalid_extension);
13880 break;
13881 default:
13882 gcc_unreachable ();
13883 }
13884
13885 return false;
13886 }
13887
13888 /* Validate a command-line -mtune option. Parse the cpu
13889 specified in STR and throw errors if appropriate. Put the
13890 result, if it is valid, in RES. Return whether the option is
13891 valid. */
13892
13893 static bool
13894 aarch64_validate_mtune (const char *str, const struct processor **res)
13895 {
13896 enum aarch64_parse_opt_result parse_res
13897 = aarch64_parse_tune (str, res);
13898
13899 if (parse_res == AARCH64_PARSE_OK)
13900 return true;
13901
13902 switch (parse_res)
13903 {
13904 case AARCH64_PARSE_MISSING_ARG:
13905 error ("missing cpu name in %<-mtune=%s%>", str);
13906 break;
13907 case AARCH64_PARSE_INVALID_ARG:
13908 error ("unknown value %qs for %<-mtune%>", str);
13909 aarch64_print_hint_for_core (str);
13910 break;
13911 default:
13912 gcc_unreachable ();
13913 }
13914 return false;
13915 }
13916
13917 /* Return the CPU corresponding to the enum CPU.
13918 If it doesn't specify a cpu, return the default. */
13919
13920 static const struct processor *
13921 aarch64_get_tune_cpu (enum aarch64_processor cpu)
13922 {
13923 if (cpu != aarch64_none)
13924 return &all_cores[cpu];
13925
13926 /* The & 0x3f is to extract the bottom 6 bits that encode the
13927 default cpu as selected by the --with-cpu GCC configure option
13928 in config.gcc.
13929 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
13930 flags mechanism should be reworked to make it more sane. */
13931 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13932 }
13933
13934 /* Return the architecture corresponding to the enum ARCH.
13935 If it doesn't specify a valid architecture, return the default. */
13936
13937 static const struct processor *
13938 aarch64_get_arch (enum aarch64_arch arch)
13939 {
13940 if (arch != aarch64_no_arch)
13941 return &all_architectures[arch];
13942
13943 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
13944
13945 return &all_architectures[cpu->arch];
13946 }
13947
13948 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
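/* For example, -msve-vector-bits=512 corresponds to VG == 512 / 64 == 8,
   while SVE_SCALABLE and SVE_128 yield the (2, 2) poly_uint16 used for
   vector-length agnostic code.  */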
13949
13950 static poly_uint16
13951 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
13952 {
13953 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
13954 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
13955 deciding which .md file patterns to use and when deciding whether
13956 something is a legitimate address or constant. */
13957 if (value == SVE_SCALABLE || value == SVE_128)
13958 return poly_uint16 (2, 2);
13959 else
13960 return (int) value / 64;
13961 }
13962
13963 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
13964 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
13965 tuning structs. In particular it must set selected_tune and
13966 aarch64_isa_flags that define the available ISA features and tuning
13967 decisions. It must also set selected_arch as this will be used to
13968 output the .arch asm tags for each function. */
13969
13970 static void
13971 aarch64_override_options (void)
13972 {
13973 uint64_t cpu_isa = 0;
13974 uint64_t arch_isa = 0;
13975 aarch64_isa_flags = 0;
13976
13977 bool valid_cpu = true;
13978 bool valid_tune = true;
13979 bool valid_arch = true;
13980
13981 selected_cpu = NULL;
13982 selected_arch = NULL;
13983 selected_tune = NULL;
13984
13985 if (aarch64_branch_protection_string)
13986 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
13987
13988 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
13989 If either of -march or -mtune is given, they override their
13990 respective component of -mcpu. */
13991 if (aarch64_cpu_string)
13992 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
13993 &cpu_isa);
13994
13995 if (aarch64_arch_string)
13996 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
13997 &arch_isa);
13998
13999 if (aarch64_tune_string)
14000 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14001
14002 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14003 SUBTARGET_OVERRIDE_OPTIONS;
14004 #endif
14005
14006 /* If the user did not specify a processor, choose the default
14007 one for them. This will be the CPU set during configuration using
14008 --with-cpu, otherwise it is "generic". */
14009 if (!selected_cpu)
14010 {
14011 if (selected_arch)
14012 {
14013 selected_cpu = &all_cores[selected_arch->ident];
14014 aarch64_isa_flags = arch_isa;
14015 explicit_arch = selected_arch->arch;
14016 }
14017 else
14018 {
14019 /* Get default configure-time CPU. */
14020 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14021 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14022 }
14023
14024 if (selected_tune)
14025 explicit_tune_core = selected_tune->ident;
14026 }
14027 /* If both -mcpu and -march are specified, check that they are architecturally
14028 compatible; warn if they're not, and prefer the -march ISA flags. */
14029 else if (selected_arch)
14030 {
14031 if (selected_arch->arch != selected_cpu->arch)
14032 {
14033 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14034 all_architectures[selected_cpu->arch].name,
14035 selected_arch->name);
14036 }
14037 aarch64_isa_flags = arch_isa;
14038 explicit_arch = selected_arch->arch;
14039 explicit_tune_core = selected_tune ? selected_tune->ident
14040 : selected_cpu->ident;
14041 }
14042 else
14043 {
14044 /* -mcpu but no -march. */
14045 aarch64_isa_flags = cpu_isa;
14046 explicit_tune_core = selected_tune ? selected_tune->ident
14047 : selected_cpu->ident;
14048 gcc_assert (selected_cpu);
14049 selected_arch = &all_architectures[selected_cpu->arch];
14050 explicit_arch = selected_arch->arch;
14051 }
14052
14053 /* Set the arch as well, as we will need it when outputting
14054 the .arch directive in assembly. */
14055 if (!selected_arch)
14056 {
14057 gcc_assert (selected_cpu);
14058 selected_arch = &all_architectures[selected_cpu->arch];
14059 }
14060
14061 if (!selected_tune)
14062 selected_tune = selected_cpu;
14063
14064 if (aarch64_enable_bti == 2)
14065 {
14066 #ifdef TARGET_ENABLE_BTI
14067 aarch64_enable_bti = 1;
14068 #else
14069 aarch64_enable_bti = 0;
14070 #endif
14071 }
14072
14073 /* Return address signing is currently not supported for ILP32 targets. For
14074 LP64 targets use the configured option in the absence of a command-line
14075 option for -mbranch-protection. */
14076 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14077 {
14078 #ifdef TARGET_ENABLE_PAC_RET
14079 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14080 #else
14081 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14082 #endif
14083 }
14084
14085 #ifndef HAVE_AS_MABI_OPTION
14086 /* The compiler may have been configured with 2.23.* binutils, which does
14087 not have support for ILP32. */
14088 if (TARGET_ILP32)
14089 error ("assembler does not support %<-mabi=ilp32%>");
14090 #endif
14091
14092 /* Convert -msve-vector-bits to a VG count. */
14093 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14094
14095 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14096 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14097
14098 /* Make sure we properly set up the explicit options. */
14099 if ((aarch64_cpu_string && valid_cpu)
14100 || (aarch64_tune_string && valid_tune))
14101 gcc_assert (explicit_tune_core != aarch64_none);
14102
14103 if ((aarch64_cpu_string && valid_cpu)
14104 || (aarch64_arch_string && valid_arch))
14105 gcc_assert (explicit_arch != aarch64_no_arch);
14106
14107 /* The pass to insert speculation tracking runs before
14108 shrink-wrapping and the latter does not know how to update the
14109 tracking status, so disable shrink-wrapping in this case. */
14110 if (aarch64_track_speculation)
14111 flag_shrink_wrap = 0;
14112
14113 aarch64_override_options_internal (&global_options);
14114
14115 /* Save these options as the default ones in case we push and pop them later
14116 while processing functions with potential target attributes. */
14117 target_option_default_node = target_option_current_node
14118 = build_target_option_node (&global_options);
14119 }
14120
14121 /* Implement targetm.override_options_after_change. */
14122
14123 static void
14124 aarch64_override_options_after_change (void)
14125 {
14126 aarch64_override_options_after_change_1 (&global_options);
14127 }
14128
14129 static struct machine_function *
14130 aarch64_init_machine_status (void)
14131 {
14132 struct machine_function *machine;
14133 machine = ggc_cleared_alloc<machine_function> ();
14134 return machine;
14135 }
14136
14137 void
14138 aarch64_init_expanders (void)
14139 {
14140 init_machine_status = aarch64_init_machine_status;
14141 }
14142
14143 /* Choose the code model to use, taking the PIC options into account. */
14144 static void
14145 initialize_aarch64_code_model (struct gcc_options *opts)
14146 {
14147 if (opts->x_flag_pic)
14148 {
14149 switch (opts->x_aarch64_cmodel_var)
14150 {
14151 case AARCH64_CMODEL_TINY:
14152 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
14153 break;
14154 case AARCH64_CMODEL_SMALL:
14155 #ifdef HAVE_AS_SMALL_PIC_RELOCS
14156 aarch64_cmodel = (flag_pic == 2
14157 ? AARCH64_CMODEL_SMALL_PIC
14158 : AARCH64_CMODEL_SMALL_SPIC);
14159 #else
14160 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
14161 #endif
14162 break;
14163 case AARCH64_CMODEL_LARGE:
14164 sorry ("code model %qs with %<-f%s%>", "large",
14165 opts->x_flag_pic > 1 ? "PIC" : "pic");
14166 break;
14167 default:
14168 gcc_unreachable ();
14169 }
14170 }
14171 else
14172 aarch64_cmodel = opts->x_aarch64_cmodel_var;
14173 }
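/* To illustrate the mapping above (a summary of the switch, not new
   behaviour): with -mcmodel=small, -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC whereas -fpic selects AARCH64_CMODEL_SMALL_SPIC
   when the assembler supports the small PIC relocations; without
   -fpic/-fPIC the user-requested model is used unchanged.  */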
14174
14175 /* Implement TARGET_OPTION_SAVE. */
14176
14177 static void
14178 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
14179 {
14180 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
14181 ptr->x_aarch64_branch_protection_string
14182 = opts->x_aarch64_branch_protection_string;
14183 }
14184
14185 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
14186 using the information saved in PTR. */
14187
14188 static void
14189 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
14190 {
14191 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
14192 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14193 opts->x_explicit_arch = ptr->x_explicit_arch;
14194 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
14195 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
14196 opts->x_aarch64_branch_protection_string
14197 = ptr->x_aarch64_branch_protection_string;
14198 if (opts->x_aarch64_branch_protection_string)
14199 {
14200 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
14201 NULL);
14202 }
14203
14204 aarch64_override_options_internal (opts);
14205 }
14206
14207 /* Implement TARGET_OPTION_PRINT. */
14208
14209 static void
14210 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
14211 {
14212 const struct processor *cpu
14213 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
14214 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
14215 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
14216 std::string extension
14217 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
14218
14219 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
14220 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
14221 arch->name, extension.c_str ());
14222 }
14223
14224 static GTY(()) tree aarch64_previous_fndecl;
14225
14226 void
14227 aarch64_reset_previous_fndecl (void)
14228 {
14229 aarch64_previous_fndecl = NULL;
14230 }
14231
14232 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
14233 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
14234 make sure optab availability predicates are recomputed when necessary. */
14235
14236 void
14237 aarch64_save_restore_target_globals (tree new_tree)
14238 {
14239 if (TREE_TARGET_GLOBALS (new_tree))
14240 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
14241 else if (new_tree == target_option_default_node)
14242 restore_target_globals (&default_target_globals);
14243 else
14244 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
14245 }
14246
14247 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
14248 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
14249 of the function, if such exists. This function may be called multiple
14250 times on a single function so use aarch64_previous_fndecl to avoid
14251 setting up identical state. */
14252
14253 static void
14254 aarch64_set_current_function (tree fndecl)
14255 {
14256 if (!fndecl || fndecl == aarch64_previous_fndecl)
14257 return;
14258
14259 tree old_tree = (aarch64_previous_fndecl
14260 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
14261 : NULL_TREE);
14262
14263 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14264
14265 /* If current function has no attributes but the previous one did,
14266 use the default node. */
14267 if (!new_tree && old_tree)
14268 new_tree = target_option_default_node;
14269
14270 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
14271 the default have been handled by aarch64_save_restore_target_globals from
14272 aarch64_pragma_target_parse. */
14273 if (old_tree == new_tree)
14274 return;
14275
14276 aarch64_previous_fndecl = fndecl;
14277
14278 /* First set the target options. */
14279 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
14280
14281 aarch64_save_restore_target_globals (new_tree);
14282 }
14283
14284 /* Enum describing the various ways we can handle attributes.
14285 In many cases we can reuse the generic option handling machinery. */
14286
14287 enum aarch64_attr_opt_type
14288 {
14289 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
14290 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
14291 aarch64_attr_enum, /* Attribute sets an enum variable. */
14292 aarch64_attr_custom /* Attribute requires a custom handling function. */
14293 };
14294
14295 /* All the information needed to handle a target attribute.
14296 NAME is the name of the attribute.
14297 ATTR_TYPE specifies the type of behavior of the attribute as described
14298 in the definition of enum aarch64_attr_opt_type.
14299 ALLOW_NEG is true if the attribute supports a "no-" form.
14300 HANDLER is the function that takes the attribute string as an argument.
14301 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
14302 OPT_NUM is the enum specifying the option that the attribute modifies.
14303 This is needed for attributes that mirror the behavior of a command-line
14304 option; that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
14305 aarch64_attr_enum. */
14306
14307 struct aarch64_attribute_info
14308 {
14309 const char *name;
14310 enum aarch64_attr_opt_type attr_type;
14311 bool allow_neg;
14312 bool (*handler) (const char *);
14313 enum opt_code opt_num;
14314 };
14315
14316 /* Handle the ARCH_STR argument to the arch= target attribute. */
14317
14318 static bool
14319 aarch64_handle_attr_arch (const char *str)
14320 {
14321 const struct processor *tmp_arch = NULL;
14322 std::string invalid_extension;
14323 enum aarch64_parse_opt_result parse_res
14324 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
14325
14326 if (parse_res == AARCH64_PARSE_OK)
14327 {
14328 gcc_assert (tmp_arch);
14329 selected_arch = tmp_arch;
14330 explicit_arch = selected_arch->arch;
14331 return true;
14332 }
14333
14334 switch (parse_res)
14335 {
14336 case AARCH64_PARSE_MISSING_ARG:
14337 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
14338 break;
14339 case AARCH64_PARSE_INVALID_ARG:
14340 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
14341 aarch64_print_hint_for_arch (str);
14342 break;
14343 case AARCH64_PARSE_INVALID_FEATURE:
14344 error ("invalid feature modifier %s of value (\"%s\") in "
14345 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14346 aarch64_print_hint_for_extensions (invalid_extension);
14347 break;
14348 default:
14349 gcc_unreachable ();
14350 }
14351
14352 return false;
14353 }
14354
14355 /* Handle the argument CPU_STR to the cpu= target attribute. */
14356
14357 static bool
14358 aarch64_handle_attr_cpu (const char *str)
14359 {
14360 const struct processor *tmp_cpu = NULL;
14361 std::string invalid_extension;
14362 enum aarch64_parse_opt_result parse_res
14363 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
14364
14365 if (parse_res == AARCH64_PARSE_OK)
14366 {
14367 gcc_assert (tmp_cpu);
14368 selected_tune = tmp_cpu;
14369 explicit_tune_core = selected_tune->ident;
14370
14371 selected_arch = &all_architectures[tmp_cpu->arch];
14372 explicit_arch = selected_arch->arch;
14373 return true;
14374 }
14375
14376 switch (parse_res)
14377 {
14378 case AARCH64_PARSE_MISSING_ARG:
14379 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
14380 break;
14381 case AARCH64_PARSE_INVALID_ARG:
14382 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
14383 aarch64_print_hint_for_core (str);
14384 break;
14385 case AARCH64_PARSE_INVALID_FEATURE:
14386 error ("invalid feature modifier %s of value (\"%s\") in "
14387 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14388 aarch64_print_hint_for_extensions (invalid_extension);
14389 break;
14390 default:
14391 gcc_unreachable ();
14392 }
14393
14394 return false;
14395 }
14396
14397 /* Handle the argument STR to the branch-protection= attribute. */
14398
14399 static bool
14400 aarch64_handle_attr_branch_protection (const char* str)
14401 {
14402 char *err_str = (char *) xmalloc (strlen (str) + 1);
14403 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
14404 &err_str);
14405 bool success = false;
14406 switch (res)
14407 {
14408 case AARCH64_PARSE_MISSING_ARG:
14409 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
14410 " attribute");
14411 break;
14412 case AARCH64_PARSE_INVALID_ARG:
14413 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
14414 "=\")%> pragma or attribute", err_str);
14415 break;
14416 case AARCH64_PARSE_OK:
14417 success = true;
14418 /* Fall through. */
14419 case AARCH64_PARSE_INVALID_FEATURE:
14420 break;
14421 default:
14422 gcc_unreachable ();
14423 }
14424 free (err_str);
14425 return success;
14426 }
14427
14428 /* Handle the argument STR to the tune= target attribute. */
14429
14430 static bool
14431 aarch64_handle_attr_tune (const char *str)
14432 {
14433 const struct processor *tmp_tune = NULL;
14434 enum aarch64_parse_opt_result parse_res
14435 = aarch64_parse_tune (str, &tmp_tune);
14436
14437 if (parse_res == AARCH64_PARSE_OK)
14438 {
14439 gcc_assert (tmp_tune);
14440 selected_tune = tmp_tune;
14441 explicit_tune_core = selected_tune->ident;
14442 return true;
14443 }
14444
14445 switch (parse_res)
14446 {
14447 case AARCH64_PARSE_INVALID_ARG:
14448 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
14449 aarch64_print_hint_for_core (str);
14450 break;
14451 default:
14452 gcc_unreachable ();
14453 }
14454
14455 return false;
14456 }
14457
14458 /* Parse an architecture extension target attribute string specified in STR.
14459 For example "+fp+nosimd". Report any errors if needed. Return TRUE
14460 if successful. Update aarch64_isa_flags to reflect the ISA features
14461 modified. */
14462
14463 static bool
14464 aarch64_handle_attr_isa_flags (char *str)
14465 {
14466 enum aarch64_parse_opt_result parse_res;
14467 uint64_t isa_flags = aarch64_isa_flags;
14468
14469 /* We allow "+nothing" in the beginning to clear out all architectural
14470 features if the user wants to handpick specific features. */
14471 if (strncmp ("+nothing", str, 8) == 0)
14472 {
14473 isa_flags = 0;
14474 str += 8;
14475 }
14476
14477 std::string invalid_extension;
14478 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
14479
14480 if (parse_res == AARCH64_PARSE_OK)
14481 {
14482 aarch64_isa_flags = isa_flags;
14483 return true;
14484 }
14485
14486 switch (parse_res)
14487 {
14488 case AARCH64_PARSE_MISSING_ARG:
14489 error ("missing value in %<target()%> pragma or attribute");
14490 break;
14491
14492 case AARCH64_PARSE_INVALID_FEATURE:
14493 error ("invalid feature modifier %s of value (\"%s\") in "
14494 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
14495 break;
14496
14497 default:
14498 gcc_unreachable ();
14499 }
14500
14501 return false;
14502 }
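/* Example of the "+nothing" handling above (illustrative):
   __attribute__ ((target ("+nothing+fp"))) first clears every architectural
   feature bit and then re-enables just the FP extension (plus anything
   aarch64_parse_extension enables alongside it), whereas a plain "+nosimd"
   starts from the current aarch64_isa_flags and only removes the SIMD
   bit.  */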
14503
14504 /* The target attributes that we support. On top of these we also support just
14505 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
14506 handled explicitly in aarch64_process_one_target_attr. */
14507
14508 static const struct aarch64_attribute_info aarch64_attributes[] =
14509 {
14510 { "general-regs-only", aarch64_attr_mask, false, NULL,
14511 OPT_mgeneral_regs_only },
14512 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
14513 OPT_mfix_cortex_a53_835769 },
14514 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
14515 OPT_mfix_cortex_a53_843419 },
14516 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
14517 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
14518 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
14519 OPT_momit_leaf_frame_pointer },
14520 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
14521 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
14522 OPT_march_ },
14523 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
14524 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
14525 OPT_mtune_ },
14526 { "branch-protection", aarch64_attr_custom, false,
14527 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
14528 { "sign-return-address", aarch64_attr_enum, false, NULL,
14529 OPT_msign_return_address_ },
14530 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
14531 };
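/* A hedged usage sketch for the table above (user-level code, not part of
   the compiler): the attribute names map directly onto the corresponding
   command-line options, e.g.

     __attribute__ ((target ("arch=armv8.2-a+crc")))
     void with_crc (void);

     __attribute__ ((target ("no-omit-leaf-frame-pointer,strict-align")))
     void careful (void);

   Bare extension strings such as "+crc" (with no "name=") are also accepted
   and are routed to aarch64_handle_attr_isa_flags by
   aarch64_process_one_target_attr below.  */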
14532
14533 /* Parse ARG_STR which contains the definition of one target attribute.
14534 Show appropriate errors if any or return true if the attribute is valid. */
14535
14536 static bool
14537 aarch64_process_one_target_attr (char *arg_str)
14538 {
14539 bool invert = false;
14540
14541 size_t len = strlen (arg_str);
14542
14543 if (len == 0)
14544 {
14545 error ("malformed %<target()%> pragma or attribute");
14546 return false;
14547 }
14548
14549 char *str_to_check = (char *) alloca (len + 1);
14550 strcpy (str_to_check, arg_str);
14551
14552 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
14553 It is easier to detect and handle it explicitly here rather than going
14554 through the machinery for the rest of the target attributes in this
14555 function. */
14556 if (*str_to_check == '+')
14557 return aarch64_handle_attr_isa_flags (str_to_check);
14558
14559 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
14560 {
14561 invert = true;
14562 str_to_check += 3;
14563 }
14564 char *arg = strchr (str_to_check, '=');
14565
14566 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
14567 and point ARG to "foo". */
14568 if (arg)
14569 {
14570 *arg = '\0';
14571 arg++;
14572 }
14573 const struct aarch64_attribute_info *p_attr;
14574 bool found = false;
14575 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
14576 {
14577 /* If the names don't match up, or the user has given an argument
14578 to an attribute that doesn't accept one, or didn't give an argument
14579 to an attribute that expects one, fail to match. */
14580 if (strcmp (str_to_check, p_attr->name) != 0)
14581 continue;
14582
14583 found = true;
14584 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
14585 || p_attr->attr_type == aarch64_attr_enum;
14586
14587 if (attr_need_arg_p ^ (arg != NULL))
14588 {
14589 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
14590 return false;
14591 }
14592
14593 /* If the name matches but the attribute does not allow "no-" versions
14594 then we can't match. */
14595 if (invert && !p_attr->allow_neg)
14596 {
14597 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
14598 return false;
14599 }
14600
14601 switch (p_attr->attr_type)
14602 {
14603 /* Has a custom handler registered.
14604 For example, cpu=, arch=, tune=. */
14605 case aarch64_attr_custom:
14606 gcc_assert (p_attr->handler);
14607 if (!p_attr->handler (arg))
14608 return false;
14609 break;
14610
14611 /* Either set or unset a boolean option. */
14612 case aarch64_attr_bool:
14613 {
14614 struct cl_decoded_option decoded;
14615
14616 generate_option (p_attr->opt_num, NULL, !invert,
14617 CL_TARGET, &decoded);
14618 aarch64_handle_option (&global_options, &global_options_set,
14619 &decoded, input_location);
14620 break;
14621 }
14622 /* Set or unset a bit in the target_flags. aarch64_handle_option
14623 should know what mask to apply given the option number. */
14624 case aarch64_attr_mask:
14625 {
14626 struct cl_decoded_option decoded;
14627 /* We only need to specify the option number.
14628 aarch64_handle_option will know which mask to apply. */
14629 decoded.opt_index = p_attr->opt_num;
14630 decoded.value = !invert;
14631 aarch64_handle_option (&global_options, &global_options_set,
14632 &decoded, input_location);
14633 break;
14634 }
14635 /* Use the option setting machinery to set an option to an enum. */
14636 case aarch64_attr_enum:
14637 {
14638 gcc_assert (arg);
14639 bool valid;
14640 int value;
14641 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
14642 &value, CL_TARGET);
14643 if (valid)
14644 {
14645 set_option (&global_options, NULL, p_attr->opt_num, value,
14646 NULL, DK_UNSPECIFIED, input_location,
14647 global_dc);
14648 }
14649 else
14650 {
14651 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
14652 }
14653 break;
14654 }
14655 default:
14656 gcc_unreachable ();
14657 }
14658 }
14659
14660 /* If we reached here we have either found an attribute and validated
14661 it or didn't match any. If we matched an attribute but its arguments
14662 were malformed we will have returned false already. */
14663 return found;
14664 }
14665
14666 /* Count how many times the character C appears in
14667 NULL-terminated string STR. */
14668
14669 static unsigned int
14670 num_occurences_in_str (char c, char *str)
14671 {
14672 unsigned int res = 0;
14673 while (*str != '\0')
14674 {
14675 if (*str == c)
14676 res++;
14677
14678 str++;
14679 }
14680
14681 return res;
14682 }
14683
14684 /* Parse the tree in ARGS that contains the target attribute information
14685 and update the global target options space. */
14686
14687 bool
14688 aarch64_process_target_attr (tree args)
14689 {
14690 if (TREE_CODE (args) == TREE_LIST)
14691 {
14692 do
14693 {
14694 tree head = TREE_VALUE (args);
14695 if (head)
14696 {
14697 if (!aarch64_process_target_attr (head))
14698 return false;
14699 }
14700 args = TREE_CHAIN (args);
14701 } while (args);
14702
14703 return true;
14704 }
14705
14706 if (TREE_CODE (args) != STRING_CST)
14707 {
14708 error ("attribute %<target%> argument not a string");
14709 return false;
14710 }
14711
14712 size_t len = strlen (TREE_STRING_POINTER (args));
14713 char *str_to_check = (char *) alloca (len + 1);
14714 strcpy (str_to_check, TREE_STRING_POINTER (args));
14715
14716 if (len == 0)
14717 {
14718 error ("malformed %<target()%> pragma or attribute");
14719 return false;
14720 }
14721
14722 /* Used to catch empty strings between commas, i.e.
14723 attribute ((target ("attr1,,attr2"))). */
14724 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
14725
14726 /* Handle multiple target attributes separated by ','. */
14727 char *token = strtok_r (str_to_check, ",", &str_to_check);
14728
14729 unsigned int num_attrs = 0;
14730 while (token)
14731 {
14732 num_attrs++;
14733 if (!aarch64_process_one_target_attr (token))
14734 {
14735 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
14736 return false;
14737 }
14738
14739 token = strtok_r (NULL, ",", &str_to_check);
14740 }
14741
14742 if (num_attrs != num_commas + 1)
14743 {
14744 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
14745 return false;
14746 }
14747
14748 return true;
14749 }
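/* For example (illustrative of the comma handling above):
   target ("tune=cortex-a53,no-strict-align") is split into two tokens, each
   processed by aarch64_process_one_target_attr, while
   target ("tune=cortex-a53,,arch=armv8-a") is rejected because the empty
   string between the commas makes num_attrs != num_commas + 1.  */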
14750
14751 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
14752 process attribute ((target ("..."))). */
14753
14754 static bool
14755 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
14756 {
14757 struct cl_target_option cur_target;
14758 bool ret;
14759 tree old_optimize;
14760 tree new_target, new_optimize;
14761 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14762
14763 /* If what we're processing is the current pragma string then the
14764 target option node is already stored in target_option_current_node
14765 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
14766 having to re-parse the string. This is especially useful to keep
14767 arm_neon.h compile times down since that header contains a lot
14768 of intrinsics enclosed in pragmas. */
14769 if (!existing_target && args == current_target_pragma)
14770 {
14771 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
14772 return true;
14773 }
14774 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14775
14776 old_optimize = build_optimization_node (&global_options);
14777 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
14778
14779 /* If the function changed the optimization levels as well as setting
14780 target options, start with the optimizations specified. */
14781 if (func_optimize && func_optimize != old_optimize)
14782 cl_optimization_restore (&global_options,
14783 TREE_OPTIMIZATION (func_optimize));
14784
14785 /* Save the current target options to restore at the end. */
14786 cl_target_option_save (&cur_target, &global_options);
14787
14788 /* If fndecl already has some target attributes applied to it, unpack
14789 them so that we add this attribute on top of them, rather than
14790 overwriting them. */
14791 if (existing_target)
14792 {
14793 struct cl_target_option *existing_options
14794 = TREE_TARGET_OPTION (existing_target);
14795
14796 if (existing_options)
14797 cl_target_option_restore (&global_options, existing_options);
14798 }
14799 else
14800 cl_target_option_restore (&global_options,
14801 TREE_TARGET_OPTION (target_option_current_node));
14802
14803 ret = aarch64_process_target_attr (args);
14804
14805 /* Set up any additional state. */
14806 if (ret)
14807 {
14808 aarch64_override_options_internal (&global_options);
14809 /* Initialize SIMD builtins if we haven't already.
14810 Set current_target_pragma to NULL for the duration so that
14811 the builtin initialization code doesn't try to tag the functions
14812 being built with the attributes specified by any current pragma, thus
14813 going into an infinite recursion. */
14814 if (TARGET_SIMD)
14815 {
14816 tree saved_current_target_pragma = current_target_pragma;
14817 current_target_pragma = NULL;
14818 aarch64_init_simd_builtins ();
14819 current_target_pragma = saved_current_target_pragma;
14820 }
14821 new_target = build_target_option_node (&global_options);
14822 }
14823 else
14824 new_target = NULL;
14825
14826 new_optimize = build_optimization_node (&global_options);
14827
14828 if (fndecl && ret)
14829 {
14830 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
14831
14832 if (old_optimize != new_optimize)
14833 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
14834 }
14835
14836 cl_target_option_restore (&global_options, &cur_target);
14837
14838 if (old_optimize != new_optimize)
14839 cl_optimization_restore (&global_options,
14840 TREE_OPTIMIZATION (old_optimize));
14841 return ret;
14842 }
14843
14844 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
14845 tri-bool options (yes, no, don't care) and the default value is
14846 DEF, determine whether to reject inlining. */
14847
14848 static bool
14849 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
14850 int dont_care, int def)
14851 {
14852 /* If the callee doesn't care, always allow inlining. */
14853 if (callee == dont_care)
14854 return true;
14855
14856 /* If the caller doesn't care, always allow inlining. */
14857 if (caller == dont_care)
14858 return true;
14859
14860 /* Otherwise, allow inlining if either the callee and caller values
14861 agree, or if the callee is using the default value. */
14862 return (callee == caller || callee == def);
14863 }
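/* Worked example of the tri-bool rule above (illustrative): with
   "don't care" encoded as 2, a caller built with the A53 835769 fix
   enabled (1) may inline a callee that left the option at 2, but not a
   callee that explicitly disabled it (0) unless 0 also happens to be the
   configured default DEF.  */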
14864
14865 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
14866 to inline CALLEE into CALLER based on target-specific info.
14867 Make sure that the caller and callee have compatible architectural
14868 features. Then go through the other possible target attributes
14869 and see if they can block inlining. Try not to reject always_inline
14870 callees unless they are incompatible architecturally. */
14871
14872 static bool
14873 aarch64_can_inline_p (tree caller, tree callee)
14874 {
14875 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
14876 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
14877
14878 struct cl_target_option *caller_opts
14879 = TREE_TARGET_OPTION (caller_tree ? caller_tree
14880 : target_option_default_node);
14881
14882 struct cl_target_option *callee_opts
14883 = TREE_TARGET_OPTION (callee_tree ? callee_tree
14884 : target_option_default_node);
14885
14886 /* Callee's ISA flags should be a subset of the caller's. */
14887 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
14888 != callee_opts->x_aarch64_isa_flags)
14889 return false;
14890
14891 /* Allow non-strict-aligned functions to be inlined into strict-aligned
14892 ones. */
14893 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
14894 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
14895 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
14896 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
14897 return false;
14898
14899 bool always_inline = lookup_attribute ("always_inline",
14900 DECL_ATTRIBUTES (callee));
14901
14902 /* If the architectural features match up and the callee is always_inline
14903 then the other attributes don't matter. */
14904 if (always_inline)
14905 return true;
14906
14907 if (caller_opts->x_aarch64_cmodel_var
14908 != callee_opts->x_aarch64_cmodel_var)
14909 return false;
14910
14911 if (caller_opts->x_aarch64_tls_dialect
14912 != callee_opts->x_aarch64_tls_dialect)
14913 return false;
14914
14915 /* Honour explicit requests to work around errata. */
14916 if (!aarch64_tribools_ok_for_inlining_p (
14917 caller_opts->x_aarch64_fix_a53_err835769,
14918 callee_opts->x_aarch64_fix_a53_err835769,
14919 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
14920 return false;
14921
14922 if (!aarch64_tribools_ok_for_inlining_p (
14923 caller_opts->x_aarch64_fix_a53_err843419,
14924 callee_opts->x_aarch64_fix_a53_err843419,
14925 2, TARGET_FIX_ERR_A53_843419))
14926 return false;
14927
14928 /* If the user explicitly specified -momit-leaf-frame-pointer for the
14929 caller and callee and they don't match up, reject inlining. */
14930 if (!aarch64_tribools_ok_for_inlining_p (
14931 caller_opts->x_flag_omit_leaf_frame_pointer,
14932 callee_opts->x_flag_omit_leaf_frame_pointer,
14933 2, 1))
14934 return false;
14935
14936 /* If the callee has specific tuning overrides, respect them. */
14937 if (callee_opts->x_aarch64_override_tune_string != NULL
14938 && caller_opts->x_aarch64_override_tune_string == NULL)
14939 return false;
14940
14941 /* If the user specified tuning override strings for the
14942 caller and callee and they don't match up, reject inlining.
14943 We just do a string compare here, we don't analyze the meaning
14944 of the string, as it would be too costly for little gain. */
14945 if (callee_opts->x_aarch64_override_tune_string
14946 && caller_opts->x_aarch64_override_tune_string
14947 && (strcmp (callee_opts->x_aarch64_override_tune_string,
14948 caller_opts->x_aarch64_override_tune_string) != 0))
14949 return false;
14950
14951 return true;
14952 }
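/* Example of the ISA-subset check above (illustrative): a callee declared
   with __attribute__ ((target ("+sve"))) has the SVE feature bit set in its
   ISA flags, so it is not inlined into a caller compiled without SVE; the
   reverse direction (SVE caller, non-SVE callee) is fine as far as this
   check is concerned.  */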
14953
14954 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
14955 been already. */
14956
14957 unsigned int
14958 aarch64_tlsdesc_abi_id ()
14959 {
14960 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
14961 if (!tlsdesc_abi.initialized_p ())
14962 {
14963 HARD_REG_SET full_reg_clobbers;
14964 CLEAR_HARD_REG_SET (full_reg_clobbers);
14965 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
14966 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
14967 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
14968 SET_HARD_REG_BIT (full_reg_clobbers, regno);
14969 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
14970 }
14971 return tlsdesc_abi.id ();
14972 }
14973
14974 /* Return true if SYMBOL_REF X binds locally. */
14975
14976 static bool
14977 aarch64_symbol_binds_local_p (const_rtx x)
14978 {
14979 return (SYMBOL_REF_DECL (x)
14980 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
14981 : SYMBOL_REF_LOCAL_P (x));
14982 }
14983
14984 /* Return true if SYMBOL_REF X is thread-local. */
14985 static bool
14986 aarch64_tls_symbol_p (rtx x)
14987 {
14988 if (! TARGET_HAVE_TLS)
14989 return false;
14990
14991 if (GET_CODE (x) != SYMBOL_REF)
14992 return false;
14993
14994 return SYMBOL_REF_TLS_MODEL (x) != 0;
14995 }
14996
14997 /* Classify a TLS symbol into one of the TLS kinds. */
14998 enum aarch64_symbol_type
14999 aarch64_classify_tls_symbol (rtx x)
15000 {
15001 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15002
15003 switch (tls_kind)
15004 {
15005 case TLS_MODEL_GLOBAL_DYNAMIC:
15006 case TLS_MODEL_LOCAL_DYNAMIC:
15007 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15008
15009 case TLS_MODEL_INITIAL_EXEC:
15010 switch (aarch64_cmodel)
15011 {
15012 case AARCH64_CMODEL_TINY:
15013 case AARCH64_CMODEL_TINY_PIC:
15014 return SYMBOL_TINY_TLSIE;
15015 default:
15016 return SYMBOL_SMALL_TLSIE;
15017 }
15018
15019 case TLS_MODEL_LOCAL_EXEC:
15020 if (aarch64_tls_size == 12)
15021 return SYMBOL_TLSLE12;
15022 else if (aarch64_tls_size == 24)
15023 return SYMBOL_TLSLE24;
15024 else if (aarch64_tls_size == 32)
15025 return SYMBOL_TLSLE32;
15026 else if (aarch64_tls_size == 48)
15027 return SYMBOL_TLSLE48;
15028 else
15029 gcc_unreachable ();
15030
15031 case TLS_MODEL_EMULATED:
15032 case TLS_MODEL_NONE:
15033 return SYMBOL_FORCE_TO_MEM;
15034
15035 default:
15036 gcc_unreachable ();
15037 }
15038 }
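/* For instance (a restatement of the switch above): a local-exec TLS symbol
   compiled with -mtls-size=24 is classified as SYMBOL_TLSLE24, while an
   initial-exec symbol under the tiny code model becomes
   SYMBOL_TINY_TLSIE.  */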
15039
15040 /* Return the correct method for accessing X + OFFSET, where X is either
15041 a SYMBOL_REF or LABEL_REF. */
15042
15043 enum aarch64_symbol_type
15044 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15045 {
15046 if (GET_CODE (x) == LABEL_REF)
15047 {
15048 switch (aarch64_cmodel)
15049 {
15050 case AARCH64_CMODEL_LARGE:
15051 return SYMBOL_FORCE_TO_MEM;
15052
15053 case AARCH64_CMODEL_TINY_PIC:
15054 case AARCH64_CMODEL_TINY:
15055 return SYMBOL_TINY_ABSOLUTE;
15056
15057 case AARCH64_CMODEL_SMALL_SPIC:
15058 case AARCH64_CMODEL_SMALL_PIC:
15059 case AARCH64_CMODEL_SMALL:
15060 return SYMBOL_SMALL_ABSOLUTE;
15061
15062 default:
15063 gcc_unreachable ();
15064 }
15065 }
15066
15067 if (GET_CODE (x) == SYMBOL_REF)
15068 {
15069 if (aarch64_tls_symbol_p (x))
15070 return aarch64_classify_tls_symbol (x);
15071
15072 switch (aarch64_cmodel)
15073 {
15074 case AARCH64_CMODEL_TINY:
15075 /* When we retrieve symbol + offset address, we have to make sure
15076 the offset does not cause overflow of the final address. But
15077 we have no way of knowing the address of symbol at compile time
15078 so we can't accurately say if the distance between the PC and
15079 symbol + offset is outside the addressable range of +/-1MB in the
15080 TINY code model. So we limit the maximum offset to +/-64KB and
15081 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15082 If offset_within_block_p is true we allow larger offsets.
15083 Furthermore force to memory if the symbol is a weak reference to
15084 something that doesn't resolve to a symbol in this module. */
15085
15086 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15087 return SYMBOL_FORCE_TO_MEM;
15088 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15089 || offset_within_block_p (x, offset)))
15090 return SYMBOL_FORCE_TO_MEM;
15091
15092 return SYMBOL_TINY_ABSOLUTE;
15093
15094 case AARCH64_CMODEL_SMALL:
15095 /* Same reasoning as the tiny code model, but the offset cap here is
15096 1MB, allowing +/-3.9GB for the offset to the symbol. */
15097
15098 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15099 return SYMBOL_FORCE_TO_MEM;
15100 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15101 || offset_within_block_p (x, offset)))
15102 return SYMBOL_FORCE_TO_MEM;
15103
15104 return SYMBOL_SMALL_ABSOLUTE;
15105
15106 case AARCH64_CMODEL_TINY_PIC:
15107 if (!aarch64_symbol_binds_local_p (x))
15108 return SYMBOL_TINY_GOT;
15109 return SYMBOL_TINY_ABSOLUTE;
15110
15111 case AARCH64_CMODEL_SMALL_SPIC:
15112 case AARCH64_CMODEL_SMALL_PIC:
15113 if (!aarch64_symbol_binds_local_p (x))
15114 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
15115 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
15116 return SYMBOL_SMALL_ABSOLUTE;
15117
15118 case AARCH64_CMODEL_LARGE:
15119 /* This is alright even in PIC code as the constant
15120 pool reference is always PC relative and within
15121 the same translation unit. */
15122 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
15123 return SYMBOL_SMALL_ABSOLUTE;
15124 else
15125 return SYMBOL_FORCE_TO_MEM;
15126
15127 default:
15128 gcc_unreachable ();
15129 }
15130 }
15131
15132 /* By default push everything into the constant pool. */
15133 return SYMBOL_FORCE_TO_MEM;
15134 }
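/* A worked example of the tiny-model offset cap above (illustrative):
   accessing sym + 0x8000 stays within the +/-64KB window and is classified
   as SYMBOL_TINY_ABSOLUTE, whereas sym + 0x20000 falls outside it and,
   unless offset_within_block_p says the offset stays inside sym's own
   block, is forced to memory via SYMBOL_FORCE_TO_MEM.  */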
15135
15136 bool
15137 aarch64_constant_address_p (rtx x)
15138 {
15139 return (CONSTANT_P (x) && memory_address_p (DImode, x));
15140 }
15141
15142 bool
15143 aarch64_legitimate_pic_operand_p (rtx x)
15144 {
15145 if (GET_CODE (x) == SYMBOL_REF
15146 || (GET_CODE (x) == CONST
15147 && GET_CODE (XEXP (x, 0)) == PLUS
15148 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
15149 return false;
15150
15151 return true;
15152 }
15153
15154 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
15155 that should be rematerialized rather than spilled. */
15156
15157 static bool
15158 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
15159 {
15160 /* Support CSE and rematerialization of common constants. */
15161 if (CONST_INT_P (x)
15162 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
15163 || GET_CODE (x) == CONST_VECTOR)
15164 return true;
15165
15166 /* Do not allow vector struct mode constants for Advanced SIMD.
15167 We could support 0 and -1 easily, but they need support in
15168 aarch64-simd.md. */
15169 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15170 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15171 return false;
15172
15173 /* Only accept variable-length vector constants if they can be
15174 handled directly.
15175
15176 ??? It would be possible to handle rematerialization of other
15177 constants via secondary reloads. */
15178 if (vec_flags & VEC_ANY_SVE)
15179 return aarch64_simd_valid_immediate (x, NULL);
15180
15181 if (GET_CODE (x) == HIGH)
15182 x = XEXP (x, 0);
15183
15184 /* Accept polynomial constants that can be calculated by using the
15185 destination of a move as the sole temporary. Constants that
15186 require a second temporary cannot be rematerialized (they can't be
15187 forced to memory and also aren't legitimate constants). */
15188 poly_int64 offset;
15189 if (poly_int_rtx_p (x, &offset))
15190 return aarch64_offset_temporaries (false, offset) <= 1;
15191
15192 /* If an offset is being added to something else, we need to allow the
15193 base to be moved into the destination register, meaning that there
15194 are no free temporaries for the offset. */
15195 x = strip_offset (x, &offset);
15196 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
15197 return false;
15198
15199 /* Do not allow const (plus (anchor_symbol, const_int)). */
15200 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
15201 return false;
15202
15203 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
15204 so spilling them is better than rematerialization. */
15205 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
15206 return true;
15207
15208 /* Label references are always constant. */
15209 if (GET_CODE (x) == LABEL_REF)
15210 return true;
15211
15212 return false;
15213 }
15214
15215 rtx
15216 aarch64_load_tp (rtx target)
15217 {
15218 if (!target
15219 || GET_MODE (target) != Pmode
15220 || !register_operand (target, Pmode))
15221 target = gen_reg_rtx (Pmode);
15222
15223 /* Can return in any reg. */
15224 emit_insn (gen_aarch64_load_tp_hard (target));
15225 return target;
15226 }
15227
15228 /* On AAPCS systems, this is the "struct __va_list". */
15229 static GTY(()) tree va_list_type;
15230
15231 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
15232 Return the type to use as __builtin_va_list.
15233
15234 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
15235
15236 struct __va_list
15237 {
15238 void *__stack;
15239 void *__gr_top;
15240 void *__vr_top;
15241 int __gr_offs;
15242 int __vr_offs;
15243 }; */
15244
15245 static tree
15246 aarch64_build_builtin_va_list (void)
15247 {
15248 tree va_list_name;
15249 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15250
15251 /* Create the type. */
15252 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
15253 /* Give it the required name. */
15254 va_list_name = build_decl (BUILTINS_LOCATION,
15255 TYPE_DECL,
15256 get_identifier ("__va_list"),
15257 va_list_type);
15258 DECL_ARTIFICIAL (va_list_name) = 1;
15259 TYPE_NAME (va_list_type) = va_list_name;
15260 TYPE_STUB_DECL (va_list_type) = va_list_name;
15261
15262 /* Create the fields. */
15263 f_stack = build_decl (BUILTINS_LOCATION,
15264 FIELD_DECL, get_identifier ("__stack"),
15265 ptr_type_node);
15266 f_grtop = build_decl (BUILTINS_LOCATION,
15267 FIELD_DECL, get_identifier ("__gr_top"),
15268 ptr_type_node);
15269 f_vrtop = build_decl (BUILTINS_LOCATION,
15270 FIELD_DECL, get_identifier ("__vr_top"),
15271 ptr_type_node);
15272 f_groff = build_decl (BUILTINS_LOCATION,
15273 FIELD_DECL, get_identifier ("__gr_offs"),
15274 integer_type_node);
15275 f_vroff = build_decl (BUILTINS_LOCATION,
15276 FIELD_DECL, get_identifier ("__vr_offs"),
15277 integer_type_node);
15278
15279 /* Tell tree-stdarg pass about our internal offset fields.
15280 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
15281 purposes, to identify whether the code is updating va_list internal
15282 offset fields in an irregular way. */
15283 va_list_gpr_counter_field = f_groff;
15284 va_list_fpr_counter_field = f_vroff;
15285
15286 DECL_ARTIFICIAL (f_stack) = 1;
15287 DECL_ARTIFICIAL (f_grtop) = 1;
15288 DECL_ARTIFICIAL (f_vrtop) = 1;
15289 DECL_ARTIFICIAL (f_groff) = 1;
15290 DECL_ARTIFICIAL (f_vroff) = 1;
15291
15292 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
15293 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
15294 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
15295 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
15296 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
15297
15298 TYPE_FIELDS (va_list_type) = f_stack;
15299 DECL_CHAIN (f_stack) = f_grtop;
15300 DECL_CHAIN (f_grtop) = f_vrtop;
15301 DECL_CHAIN (f_vrtop) = f_groff;
15302 DECL_CHAIN (f_groff) = f_vroff;
15303
15304 /* Compute its layout. */
15305 layout_type (va_list_type);
15306
15307 return va_list_type;
15308 }
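/* Size sketch (an assumption for the LP64 ABI, not derived from this file
   alone): the record built above has three pointers followed by two ints,
   so sizeof (__builtin_va_list) is 3 * 8 + 2 * 4 = 32 bytes with 8-byte
   alignment under -mabi=lp64.  */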
15309
15310 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
15311 static void
15312 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
15313 {
15314 const CUMULATIVE_ARGS *cum;
15315 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15316 tree stack, grtop, vrtop, groff, vroff;
15317 tree t;
15318 int gr_save_area_size = cfun->va_list_gpr_size;
15319 int vr_save_area_size = cfun->va_list_fpr_size;
15320 int vr_offset;
15321
15322 cum = &crtl->args.info;
15323 if (cfun->va_list_gpr_size)
15324 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
15325 cfun->va_list_gpr_size);
15326 if (cfun->va_list_fpr_size)
15327 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
15328 * UNITS_PER_VREG, cfun->va_list_fpr_size);
15329
15330 if (!TARGET_FLOAT)
15331 {
15332 gcc_assert (cum->aapcs_nvrn == 0);
15333 vr_save_area_size = 0;
15334 }
15335
15336 f_stack = TYPE_FIELDS (va_list_type_node);
15337 f_grtop = DECL_CHAIN (f_stack);
15338 f_vrtop = DECL_CHAIN (f_grtop);
15339 f_groff = DECL_CHAIN (f_vrtop);
15340 f_vroff = DECL_CHAIN (f_groff);
15341
15342 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
15343 NULL_TREE);
15344 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
15345 NULL_TREE);
15346 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
15347 NULL_TREE);
15348 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
15349 NULL_TREE);
15350 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
15351 NULL_TREE);
15352
15353 /* Emit code to initialize STACK, which points to the next varargs stack
15354 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
15355 by named arguments. STACK is 8-byte aligned. */
15356 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
15357 if (cum->aapcs_stack_size > 0)
15358 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
15359 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
15360 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15361
15362 /* Emit code to initialize GRTOP, the top of the GR save area.
15363 virtual_incoming_args_rtx should have been 16 byte aligned. */
15364 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
15365 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
15366 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15367
15368 /* Emit code to initialize VRTOP, the top of the VR save area.
15369 This address is gr_save_area_bytes below GRTOP, rounded
15370 down to the next 16-byte boundary. */
15371 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
15372 vr_offset = ROUND_UP (gr_save_area_size,
15373 STACK_BOUNDARY / BITS_PER_UNIT);
15374
15375 if (vr_offset)
15376 t = fold_build_pointer_plus_hwi (t, -vr_offset);
15377 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
15378 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15379
15380 /* Emit code to initialize GROFF, the offset from GRTOP of the
15381 next GPR argument. */
15382 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
15383 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
15384 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15385
15386 /* Likewise emit code to initialize VROFF, the offset from VRTOP
15387 of the next VR argument. */
15388 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
15389 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
15390 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
15391 }
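/* Rough picture of the layout set up above, relative to
   virtual_incoming_args_rtx (VIA), listed from higher to lower addresses:

     VIA + aapcs_stack_size * 8           <- __stack (next stack vararg)
     VIA                                  <- __gr_top
     VIA - ROUND_UP (gr_save_area_size, 16)
                                          <- __vr_top

   with __gr_offs = -gr_save_area_size and __vr_offs = -vr_save_area_size,
   both negative and counting up towards zero as register arguments are
   consumed.  */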
15392
15393 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
15394
15395 static tree
15396 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
15397 gimple_seq *post_p ATTRIBUTE_UNUSED)
15398 {
15399 tree addr;
15400 bool indirect_p;
15401 bool is_ha; /* is HFA or HVA. */
15402 bool dw_align; /* double-word align. */
15403 machine_mode ag_mode = VOIDmode;
15404 int nregs;
15405 machine_mode mode;
15406
15407 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
15408 tree stack, f_top, f_off, off, arg, roundup, on_stack;
15409 HOST_WIDE_INT size, rsize, adjust, align;
15410 tree t, u, cond1, cond2;
15411
15412 indirect_p = pass_va_arg_by_reference (type);
15413 if (indirect_p)
15414 type = build_pointer_type (type);
15415
15416 mode = TYPE_MODE (type);
15417
15418 f_stack = TYPE_FIELDS (va_list_type_node);
15419 f_grtop = DECL_CHAIN (f_stack);
15420 f_vrtop = DECL_CHAIN (f_grtop);
15421 f_groff = DECL_CHAIN (f_vrtop);
15422 f_vroff = DECL_CHAIN (f_groff);
15423
15424 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
15425 f_stack, NULL_TREE);
15426 size = int_size_in_bytes (type);
15427
15428 bool abi_break;
15429 align
15430 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
15431
15432 dw_align = false;
15433 adjust = 0;
15434 if (aarch64_vfp_is_call_or_return_candidate (mode,
15435 type,
15436 &ag_mode,
15437 &nregs,
15438 &is_ha))
15439 {
15440 /* No frontends can create types with variable-sized modes, so we
15441 shouldn't be asked to pass or return them. */
15442 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
15443
15444 /* TYPE passed in fp/simd registers. */
15445 if (!TARGET_FLOAT)
15446 aarch64_err_no_fpadvsimd (mode);
15447
15448 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
15449 unshare_expr (valist), f_vrtop, NULL_TREE);
15450 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
15451 unshare_expr (valist), f_vroff, NULL_TREE);
15452
15453 rsize = nregs * UNITS_PER_VREG;
15454
15455 if (is_ha)
15456 {
15457 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
15458 adjust = UNITS_PER_VREG - ag_size;
15459 }
15460 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15461 && size < UNITS_PER_VREG)
15462 {
15463 adjust = UNITS_PER_VREG - size;
15464 }
15465 }
15466 else
15467 {
15468 /* TYPE passed in general registers. */
15469 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
15470 unshare_expr (valist), f_grtop, NULL_TREE);
15471 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
15472 unshare_expr (valist), f_groff, NULL_TREE);
15473 rsize = ROUND_UP (size, UNITS_PER_WORD);
15474 nregs = rsize / UNITS_PER_WORD;
15475
15476 if (align > 8)
15477 {
15478 if (abi_break && warn_psabi)
15479 inform (input_location, "parameter passing for argument of type "
15480 "%qT changed in GCC 9.1", type);
15481 dw_align = true;
15482 }
15483
15484 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15485 && size < UNITS_PER_WORD)
15486 {
15487 adjust = UNITS_PER_WORD - size;
15488 }
15489 }
15490
15491 /* Get a local temporary for the field value. */
15492 off = get_initialized_tmp_var (f_off, pre_p, NULL);
15493
15494 /* Emit code to branch if off >= 0. */
15495 t = build2 (GE_EXPR, boolean_type_node, off,
15496 build_int_cst (TREE_TYPE (off), 0));
15497 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
15498
15499 if (dw_align)
15500 {
15501 /* Emit: offs = (offs + 15) & -16. */
15502 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15503 build_int_cst (TREE_TYPE (off), 15));
15504 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
15505 build_int_cst (TREE_TYPE (off), -16));
15506 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
15507 }
15508 else
15509 roundup = NULL;
15510
15511 /* Update ap.__[g|v]r_offs */
15512 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
15513 build_int_cst (TREE_TYPE (off), rsize));
15514 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
15515
15516 /* String up. */
15517 if (roundup)
15518 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15519
15520 /* [cond2] if (ap.__[g|v]r_offs > 0) */
15521 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
15522 build_int_cst (TREE_TYPE (f_off), 0));
15523 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
15524
15525 /* String up: make sure the assignment happens before the use. */
15526 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
15527 COND_EXPR_ELSE (cond1) = t;
15528
15529 /* Prepare the trees handling the argument that is passed on the stack;
15530 the top level node will store in ON_STACK. */
15531 arg = get_initialized_tmp_var (stack, pre_p, NULL);
15532 if (align > 8)
15533 {
15534 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
15535 t = fold_build_pointer_plus_hwi (arg, 15);
15536 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15537 build_int_cst (TREE_TYPE (t), -16));
15538 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
15539 }
15540 else
15541 roundup = NULL;
15542 /* Advance ap.__stack */
15543 t = fold_build_pointer_plus_hwi (arg, size + 7);
15544 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
15545 build_int_cst (TREE_TYPE (t), -8));
15546 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
15547 /* String up roundup and advance. */
15548 if (roundup)
15549 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
15550 /* String up with arg */
15551 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
15552 /* Big-endianness related address adjustment. */
15553 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
15554 && size < UNITS_PER_WORD)
15555 {
15556 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
15557 size_int (UNITS_PER_WORD - size));
15558 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
15559 }
15560
15561 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
15562 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
15563
15564 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
15565 t = off;
15566 if (adjust)
15567 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
15568 build_int_cst (TREE_TYPE (off), adjust));
15569
15570 t = fold_convert (sizetype, t);
15571 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
15572
15573 if (is_ha)
15574 {
15575 /* type ha; // treat as "struct {ftype field[n];}"
15576 ... [computing offs]
15577 for (i = 0; i < nregs; ++i, offs += 16)
15578 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
15579 return ha; */
15580 int i;
15581 tree tmp_ha, field_t, field_ptr_t;
15582
15583 /* Declare a local variable. */
15584 tmp_ha = create_tmp_var_raw (type, "ha");
15585 gimple_add_tmp_var (tmp_ha);
15586
15587 /* Establish the base type. */
15588 switch (ag_mode)
15589 {
15590 case E_SFmode:
15591 field_t = float_type_node;
15592 field_ptr_t = float_ptr_type_node;
15593 break;
15594 case E_DFmode:
15595 field_t = double_type_node;
15596 field_ptr_t = double_ptr_type_node;
15597 break;
15598 case E_TFmode:
15599 field_t = long_double_type_node;
15600 field_ptr_t = long_double_ptr_type_node;
15601 break;
15602 case E_HFmode:
15603 field_t = aarch64_fp16_type_node;
15604 field_ptr_t = aarch64_fp16_ptr_type_node;
15605 break;
15606 case E_V2SImode:
15607 case E_V4SImode:
15608 {
15609 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
15610 field_t = build_vector_type_for_mode (innertype, ag_mode);
15611 field_ptr_t = build_pointer_type (field_t);
15612 }
15613 break;
15614 default:
15615 gcc_assert (0);
15616 }
15617
15618 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
15619 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
15620 addr = t;
15621 t = fold_convert (field_ptr_t, addr);
15622 t = build2 (MODIFY_EXPR, field_t,
15623 build1 (INDIRECT_REF, field_t, tmp_ha),
15624 build1 (INDIRECT_REF, field_t, t));
15625
15626 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
15627 for (i = 1; i < nregs; ++i)
15628 {
15629 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
15630 u = fold_convert (field_ptr_t, addr);
15631 u = build2 (MODIFY_EXPR, field_t,
15632 build2 (MEM_REF, field_t, tmp_ha,
15633 build_int_cst (field_ptr_t,
15634 (i *
15635 int_size_in_bytes (field_t)))),
15636 build1 (INDIRECT_REF, field_t, u));
15637 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
15638 }
15639
15640 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
15641 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
15642 }
15643
15644 COND_EXPR_ELSE (cond2) = t;
15645 addr = fold_convert (build_pointer_type (type), cond1);
15646 addr = build_va_arg_indirect_ref (addr);
15647
15648 if (indirect_p)
15649 addr = build_va_arg_indirect_ref (addr);
15650
15651 return addr;
15652 }
15653
15654 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
15655
15656 static void
15657 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
15658 const function_arg_info &arg,
15659 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
15660 {
15661 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
15662 CUMULATIVE_ARGS local_cum;
15663 int gr_saved = cfun->va_list_gpr_size;
15664 int vr_saved = cfun->va_list_fpr_size;
15665
15666 /* The caller has advanced CUM up to, but not beyond, the last named
15667 argument. Advance a local copy of CUM past the last "real" named
15668 argument, to find out how many registers are left over. */
15669 local_cum = *cum;
15670 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
15671
15672 /* Find out how many registers we need to save.
15673 Honor the results of the tree-stdarg analysis. */
15674 if (cfun->va_list_gpr_size)
15675 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
15676 cfun->va_list_gpr_size / UNITS_PER_WORD);
15677 if (cfun->va_list_fpr_size)
15678 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
15679 cfun->va_list_fpr_size / UNITS_PER_VREG);
15680
15681 if (!TARGET_FLOAT)
15682 {
15683 gcc_assert (local_cum.aapcs_nvrn == 0);
15684 vr_saved = 0;
15685 }
15686
15687 if (!no_rtl)
15688 {
15689 if (gr_saved > 0)
15690 {
15691 rtx ptr, mem;
15692
15693 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
15694 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
15695 - gr_saved * UNITS_PER_WORD);
15696 mem = gen_frame_mem (BLKmode, ptr);
15697 set_mem_alias_set (mem, get_varargs_alias_set ());
15698
15699 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
15700 mem, gr_saved);
15701 }
15702 if (vr_saved > 0)
15703 {
15704 /* We can't use move_block_from_reg, because it will use
15705 the wrong mode, storing D regs only. */
15706 machine_mode mode = TImode;
15707 int off, i, vr_start;
15708
15709 /* Set OFF to the offset from virtual_incoming_args_rtx of
15710 the first vector register. The VR save area lies below
15711 the GR one, and is aligned to 16 bytes. */
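/* For example (an illustrative layout, not emitted code): with
gr_saved == 3 and vr_saved == 2, offsets in bytes relative to
virtual_incoming_args_rtx are:

[-64, -32)  Q-register save area (2 * UNITS_PER_VREG)
[-32, -24)  padding from rounding the GR area up to 16 bytes
[-24,   0)  X-register save area (3 * UNITS_PER_WORD)
[  0, ...)  arguments passed on the stack  */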
15712 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
15713 STACK_BOUNDARY / BITS_PER_UNIT);
15714 off -= vr_saved * UNITS_PER_VREG;
15715
15716 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
15717 for (i = 0; i < vr_saved; ++i)
15718 {
15719 rtx ptr, mem;
15720
15721 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
15722 mem = gen_frame_mem (mode, ptr);
15723 set_mem_alias_set (mem, get_varargs_alias_set ());
15724 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
15725 off += UNITS_PER_VREG;
15726 }
15727 }
15728 }
15729
15730 /* We don't save the size into *PRETEND_SIZE because we want to avoid
15731 any complication of having crtl->args.pretend_args_size changed. */
15732 cfun->machine->frame.saved_varargs_size
15733 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
15734 STACK_BOUNDARY / BITS_PER_UNIT)
15735 + vr_saved * UNITS_PER_VREG);
15736 }
15737
15738 static void
15739 aarch64_conditional_register_usage (void)
15740 {
15741 int i;
15742 if (!TARGET_FLOAT)
15743 {
15744 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
15745 {
15746 fixed_regs[i] = 1;
15747 call_used_regs[i] = 1;
15748 }
15749 }
15750 if (!TARGET_SVE)
15751 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
15752 {
15753 fixed_regs[i] = 1;
15754 call_used_regs[i] = 1;
15755 }
15756
15757 /* Only allow the FFR and FFRT to be accessed via special patterns. */
15758 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
15759 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
15760
15761 /* When tracking speculation, we need a couple of call-clobbered registers
15762 to track the speculation state. It would be nice to just use
15763 IP0 and IP1, but currently there are numerous places that just
15764 assume these registers are free for other uses (e.g. pointer
15765 authentication). */
15766 if (aarch64_track_speculation)
15767 {
15768 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
15769 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
15770 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15771 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
15772 }
15773 }
15774
15775 /* Walk down the type tree of TYPE counting consecutive base elements.
15776 If *MODEP is VOIDmode, then set it to the first valid floating point
15777 type. If a non-floating point type is found, or if a floating point
15778 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
15779 otherwise return the count in the sub-tree. */
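/* For example (using hypothetical type names), struct { double x, y; }
yields a count of 2 with *MODEP set to DFmode, whereas
struct { double d; float f; } returns -1 because the SFmode member
does not match the DFmode already recorded in *MODEP.  */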
15780 static int
15781 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
15782 {
15783 machine_mode mode;
15784 HOST_WIDE_INT size;
15785
15786 /* SVE types (and types containing SVE types) must be handled
15787 before calling this function. */
15788 gcc_assert (!aarch64_sve::builtin_type_p (type));
15789
15790 switch (TREE_CODE (type))
15791 {
15792 case REAL_TYPE:
15793 mode = TYPE_MODE (type);
15794 if (mode != DFmode && mode != SFmode
15795 && mode != TFmode && mode != HFmode)
15796 return -1;
15797
15798 if (*modep == VOIDmode)
15799 *modep = mode;
15800
15801 if (*modep == mode)
15802 return 1;
15803
15804 break;
15805
15806 case COMPLEX_TYPE:
15807 mode = TYPE_MODE (TREE_TYPE (type));
15808 if (mode != DFmode && mode != SFmode
15809 && mode != TFmode && mode != HFmode)
15810 return -1;
15811
15812 if (*modep == VOIDmode)
15813 *modep = mode;
15814
15815 if (*modep == mode)
15816 return 2;
15817
15818 break;
15819
15820 case VECTOR_TYPE:
15821 /* Use V2SImode and V4SImode as representatives of all 64-bit
15822 and 128-bit vector types. */
15823 size = int_size_in_bytes (type);
15824 switch (size)
15825 {
15826 case 8:
15827 mode = V2SImode;
15828 break;
15829 case 16:
15830 mode = V4SImode;
15831 break;
15832 default:
15833 return -1;
15834 }
15835
15836 if (*modep == VOIDmode)
15837 *modep = mode;
15838
15839 /* Vector modes are considered to be opaque: two vectors are
15840 equivalent for the purposes of being homogeneous aggregates
15841 if they are the same size. */
15842 if (*modep == mode)
15843 return 1;
15844
15845 break;
15846
15847 case ARRAY_TYPE:
15848 {
15849 int count;
15850 tree index = TYPE_DOMAIN (type);
15851
15852 /* Can't handle incomplete types nor sizes that are not
15853 fixed. */
15854 if (!COMPLETE_TYPE_P (type)
15855 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15856 return -1;
15857
15858 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
15859 if (count == -1
15860 || !index
15861 || !TYPE_MAX_VALUE (index)
15862 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
15863 || !TYPE_MIN_VALUE (index)
15864 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
15865 || count < 0)
15866 return -1;
15867
15868 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
15869 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
15870
15871 /* There must be no padding. */
15872 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15873 count * GET_MODE_BITSIZE (*modep)))
15874 return -1;
15875
15876 return count;
15877 }
15878
15879 case RECORD_TYPE:
15880 {
15881 int count = 0;
15882 int sub_count;
15883 tree field;
15884
15885 /* Can't handle incomplete types nor sizes that are not
15886 fixed. */
15887 if (!COMPLETE_TYPE_P (type)
15888 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15889 return -1;
15890
15891 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15892 {
15893 if (TREE_CODE (field) != FIELD_DECL)
15894 continue;
15895
15896 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15897 if (sub_count < 0)
15898 return -1;
15899 count += sub_count;
15900 }
15901
15902 /* There must be no padding. */
15903 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15904 count * GET_MODE_BITSIZE (*modep)))
15905 return -1;
15906
15907 return count;
15908 }
15909
15910 case UNION_TYPE:
15911 case QUAL_UNION_TYPE:
15912 {
15913 /* These aren't very interesting except in a degenerate case. */
15914 int count = 0;
15915 int sub_count;
15916 tree field;
15917
15918 /* Can't handle incomplete types nor sizes that are not
15919 fixed. */
15920 if (!COMPLETE_TYPE_P (type)
15921 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
15922 return -1;
15923
15924 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
15925 {
15926 if (TREE_CODE (field) != FIELD_DECL)
15927 continue;
15928
15929 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
15930 if (sub_count < 0)
15931 return -1;
15932 count = count > sub_count ? count : sub_count;
15933 }
15934
15935 /* There must be no padding. */
15936 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
15937 count * GET_MODE_BITSIZE (*modep)))
15938 return -1;
15939
15940 return count;
15941 }
15942
15943 default:
15944 break;
15945 }
15946
15947 return -1;
15948 }
15949
15950 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
15951 type as described in AAPCS64 \S 4.1.2.
15952
15953 See the comment above aarch64_composite_type_p for the notes on MODE. */
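/* For example, a 16-byte Advanced SIMD vector such as int32x4_t is a
short vector, whereas SVE vector types and GNU vectors wider than
16 bytes are not.  */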
15954
15955 static bool
15956 aarch64_short_vector_p (const_tree type,
15957 machine_mode mode)
15958 {
15959 poly_int64 size = -1;
15960
15961 if (type && aarch64_sve::builtin_type_p (type))
15962 return false;
15963
15964 if (type && TREE_CODE (type) == VECTOR_TYPE)
15965 size = int_size_in_bytes (type);
15966 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
15967 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
15968 size = GET_MODE_SIZE (mode);
15969
15970 return known_eq (size, 8) || known_eq (size, 16);
15971 }
15972
15973 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
15974 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
15975 array types. The C99 floating-point complex types are also considered
15976 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
15977 types, which are GCC extensions and out of the scope of AAPCS64, are
15978 treated as composite types here as well.
15979
15980 Note that MODE itself is not sufficient in determining whether a type
15981 is such a composite type or not. This is because
15982 stor-layout.c:compute_record_mode may have already changed the MODE
15983 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
15984 structure with only one field may have its MODE set to the mode of the
15985 field. Also an integer mode whose size matches the size of the
15986 RECORD_TYPE type may be used to substitute the original mode
15987 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
15988 solely relied on. */
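/* For example, struct { int i; } may be given SImode by
compute_record_mode but is still a composite type, and _Complex double
is composite even though its mode is not BLKmode.  */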
15989
15990 static bool
15991 aarch64_composite_type_p (const_tree type,
15992 machine_mode mode)
15993 {
15994 if (aarch64_short_vector_p (type, mode))
15995 return false;
15996
15997 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
15998 return true;
15999
16000 if (mode == BLKmode
16001 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16002 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16003 return true;
16004
16005 return false;
16006 }
16007
16008 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16009 shall be passed or returned in simd/fp register(s) (providing these
16010 parameter passing registers are available).
16011
16012 Upon successful return, *COUNT returns the number of needed registers,
16013 *BASE_MODE returns the mode of the individual register and when IS_HA
16014 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
16015 floating-point aggregate or a homogeneous short-vector aggregate. */
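/* For example, struct { float x, y, z; } is a homogeneous floating-point
aggregate: *BASE_MODE becomes SFmode, *COUNT becomes 3 and *IS_HA is
set, so the argument can use three consecutive SIMD/FP registers.
Aggregates with more than HA_MAX_NUM_FLDS elements are rejected.  */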
16016
16017 static bool
16018 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
16019 const_tree type,
16020 machine_mode *base_mode,
16021 int *count,
16022 bool *is_ha)
16023 {
16024 if (is_ha != NULL) *is_ha = false;
16025
16026 if (type && aarch64_sve::builtin_type_p (type))
16027 return false;
16028
16029 machine_mode new_mode = VOIDmode;
16030 bool composite_p = aarch64_composite_type_p (type, mode);
16031
16032 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
16033 || aarch64_short_vector_p (type, mode))
16034 {
16035 *count = 1;
16036 new_mode = mode;
16037 }
16038 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
16039 {
16040 if (is_ha != NULL) *is_ha = true;
16041 *count = 2;
16042 new_mode = GET_MODE_INNER (mode);
16043 }
16044 else if (type && composite_p)
16045 {
16046 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
16047
16048 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
16049 {
16050 if (is_ha != NULL) *is_ha = true;
16051 *count = ag_count;
16052 }
16053 else
16054 return false;
16055 }
16056 else
16057 return false;
16058
16059 *base_mode = new_mode;
16060 return true;
16061 }
16062
16063 /* Implement TARGET_STRUCT_VALUE_RTX. */
16064
16065 static rtx
16066 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
16067 int incoming ATTRIBUTE_UNUSED)
16068 {
16069 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
16070 }
16071
16072 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
16073 static bool
16074 aarch64_vector_mode_supported_p (machine_mode mode)
16075 {
16076 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16077 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
16078 }
16079
16080 /* Return the full-width SVE vector mode for element mode MODE, if one
16081 exists. */
16082 opt_machine_mode
16083 aarch64_full_sve_mode (scalar_mode mode)
16084 {
16085 switch (mode)
16086 {
16087 case E_DFmode:
16088 return VNx2DFmode;
16089 case E_SFmode:
16090 return VNx4SFmode;
16091 case E_HFmode:
16092 return VNx8HFmode;
16093 case E_DImode:
16094 return VNx2DImode;
16095 case E_SImode:
16096 return VNx4SImode;
16097 case E_HImode:
16098 return VNx8HImode;
16099 case E_QImode:
16100 return VNx16QImode;
16101 default:
16102 return opt_machine_mode ();
16103 }
16104 }
16105
16106 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
16107 if it exists. */
16108 opt_machine_mode
16109 aarch64_vq_mode (scalar_mode mode)
16110 {
16111 switch (mode)
16112 {
16113 case E_DFmode:
16114 return V2DFmode;
16115 case E_SFmode:
16116 return V4SFmode;
16117 case E_HFmode:
16118 return V8HFmode;
16119 case E_SImode:
16120 return V4SImode;
16121 case E_HImode:
16122 return V8HImode;
16123 case E_QImode:
16124 return V16QImode;
16125 case E_DImode:
16126 return V2DImode;
16127 default:
16128 return opt_machine_mode ();
16129 }
16130 }
16131
16132 /* Return appropriate SIMD container
16133 for MODE within a vector of WIDTH bits. */
16134 static machine_mode
16135 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
16136 {
16137 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
16138 return aarch64_full_sve_mode (mode).else_mode (word_mode);
16139
16140 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
16141 if (TARGET_SIMD)
16142 {
16143 if (known_eq (width, 128))
16144 return aarch64_vq_mode (mode).else_mode (word_mode);
16145 else
16146 switch (mode)
16147 {
16148 case E_SFmode:
16149 return V2SFmode;
16150 case E_HFmode:
16151 return V4HFmode;
16152 case E_SImode:
16153 return V2SImode;
16154 case E_HImode:
16155 return V4HImode;
16156 case E_QImode:
16157 return V8QImode;
16158 default:
16159 break;
16160 }
16161 }
16162 return word_mode;
16163 }
16164
16165 /* Return the preferred SIMD mode for MODE: a full SVE vector if SVE is
16166 enabled, otherwise the 128-bit Advanced SIMD container. */
16166 static machine_mode
16167 aarch64_preferred_simd_mode (scalar_mode mode)
16168 {
16169 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
16170 return aarch64_simd_container_mode (mode, bits);
16171 }
16172
16173 /* Return a list of possible vector sizes for the vectorizer
16174 to iterate over. */
16175 static unsigned int
16176 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
16177 {
16178 static const machine_mode sve_modes[] = {
16179 /* Try using full vectors for all element types. */
16180 VNx16QImode,
16181
16182 /* Try using 16-bit containers for 8-bit elements and full vectors
16183 for wider elements. */
16184 VNx8QImode,
16185
16186 /* Try using 32-bit containers for 8-bit and 16-bit elements and
16187 full vectors for wider elements. */
16188 VNx4QImode,
16189
16190 /* Try using 64-bit containers for all element types. */
16191 VNx2QImode
16192 };
16193
16194 static const machine_mode advsimd_modes[] = {
16195 /* Try using 128-bit vectors for all element types. */
16196 V16QImode,
16197
16198 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
16199 for wider elements. */
16200 V8QImode,
16201
16202 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
16203 for wider elements.
16204
16205 TODO: We could support a limited form of V4QImode too, so that
16206 we use 32-bit vectors for 8-bit elements. */
16207 V4HImode,
16208
16209 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
16210 for 64-bit elements.
16211
16212 TODO: We could similarly support limited forms of V2QImode and V2HImode
16213 for this case. */
16214 V2SImode
16215 };
16216
16217 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
16218 This is because:
16219
16220 - If we can't use N-byte Advanced SIMD vectors then the placement
16221 doesn't matter; we'll just continue as though the Advanced SIMD
16222 entry didn't exist.
16223
16224 - If an SVE main loop with N bytes ends up being cheaper than an
16225 Advanced SIMD main loop with N bytes then by default we'll replace
16226 the Advanced SIMD version with the SVE one.
16227
16228 - If an Advanced SIMD main loop with N bytes ends up being cheaper
16229 than an SVE main loop with N bytes then by default we'll try to
16230 use the SVE loop to vectorize the epilogue instead. */
16231 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
16232 unsigned int advsimd_i = 0;
16233 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
16234 {
16235 if (sve_i < ARRAY_SIZE (sve_modes)
16236 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
16237 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
16238 modes->safe_push (sve_modes[sve_i++]);
16239 else
16240 modes->safe_push (advsimd_modes[advsimd_i++]);
16241 }
16242 while (sve_i < ARRAY_SIZE (sve_modes))
16243 modes->safe_push (sve_modes[sve_i++]);
16244
16245 unsigned int flags = 0;
16246 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
16247 can compare SVE against Advanced SIMD and so that we can compare
16248 multiple SVE vectorization approaches against each other. There's
16249 not really any point doing this for Advanced SIMD only, since the
16250 first mode that works should always be the best. */
16251 if (TARGET_SVE && aarch64_sve_compare_costs)
16252 flags |= VECT_COMPARE_COSTS;
16253 return flags;
16254 }
16255
16256 /* Implement TARGET_MANGLE_TYPE. */
16257
16258 static const char *
16259 aarch64_mangle_type (const_tree type)
16260 {
16261 /* The AArch64 ABI documents say that "__va_list" has to be
16262 mangled as if it is in the "std" namespace. */
16263 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
16264 return "St9__va_list";
16265
16266 /* Half-precision float. */
16267 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
16268 return "Dh";
16269
16270 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
16271 builtin types. */
16272 if (TYPE_NAME (type) != NULL)
16273 {
16274 const char *res;
16275 if ((res = aarch64_general_mangle_builtin_type (type))
16276 || (res = aarch64_sve::mangle_builtin_type (type)))
16277 return res;
16278 }
16279
16280 /* Use the default mangling. */
16281 return NULL;
16282 }
16283
16284 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
16285
16286 static bool
16287 aarch64_verify_type_context (location_t loc, type_context_kind context,
16288 const_tree type, bool silent_p)
16289 {
16290 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
16291 }
16292
16293 /* Find the first rtx_insn before insn that will generate an assembly
16294 instruction. */
16295
16296 static rtx_insn *
16297 aarch64_prev_real_insn (rtx_insn *insn)
16298 {
16299 if (!insn)
16300 return NULL;
16301
16302 do
16303 {
16304 insn = prev_real_insn (insn);
16305 }
16306 while (insn && recog_memoized (insn) < 0);
16307
16308 return insn;
16309 }
16310
16311 static bool
16312 is_madd_op (enum attr_type t1)
16313 {
16314 unsigned int i;
16315 /* A number of these may be AArch32 only. */
16316 enum attr_type mlatypes[] = {
16317 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
16318 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
16319 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
16320 };
16321
16322 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
16323 {
16324 if (t1 == mlatypes[i])
16325 return true;
16326 }
16327
16328 return false;
16329 }
16330
16331 /* Check if there is a register dependency between a load and the insn
16332 for which we hold recog_data. */
16333
16334 static bool
16335 dep_between_memop_and_curr (rtx memop)
16336 {
16337 rtx load_reg;
16338 int opno;
16339
16340 gcc_assert (GET_CODE (memop) == SET);
16341
16342 if (!REG_P (SET_DEST (memop)))
16343 return false;
16344
16345 load_reg = SET_DEST (memop);
16346 for (opno = 1; opno < recog_data.n_operands; opno++)
16347 {
16348 rtx operand = recog_data.operand[opno];
16349 if (REG_P (operand)
16350 && reg_overlap_mentioned_p (load_reg, operand))
16351 return true;
16352
16353 }
16354 return false;
16355 }
16356
16357
16358 /* When working around the Cortex-A53 erratum 835769,
16359 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
16360 instruction and has a preceding memory instruction such that a NOP
16361 should be inserted between them. */
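/* For example (an illustrative sketch of the emitted assembly), with
-mfix-cortex-a53-835769 the sequence

ldr  x1, [x2]
madd x0, x3, x4, x5

is output with a NOP between the two instructions; see
aarch64_final_prescan_insn below.  */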
16362
16363 bool
16364 aarch64_madd_needs_nop (rtx_insn* insn)
16365 {
16366 enum attr_type attr_type;
16367 rtx_insn *prev;
16368 rtx body;
16369
16370 if (!TARGET_FIX_ERR_A53_835769)
16371 return false;
16372
16373 if (!INSN_P (insn) || recog_memoized (insn) < 0)
16374 return false;
16375
16376 attr_type = get_attr_type (insn);
16377 if (!is_madd_op (attr_type))
16378 return false;
16379
16380 prev = aarch64_prev_real_insn (insn);
16381 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
16382 Restore recog state to INSN to avoid state corruption. */
16383 extract_constrain_insn_cached (insn);
16384
16385 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
16386 return false;
16387
16388 body = single_set (prev);
16389
16390 /* If the previous insn is a memory op and there is no dependency between
16391 it and the DImode madd, emit a NOP between them. If body is NULL then we
16392 have a complex memory operation, probably a load/store pair.
16393 Be conservative for now and emit a NOP. */
16394 if (GET_MODE (recog_data.operand[0]) == DImode
16395 && (!body || !dep_between_memop_and_curr (body)))
16396 return true;
16397
16398 return false;
16399
16400 }
16401
16402
16403 /* Implement FINAL_PRESCAN_INSN. */
16404
16405 void
16406 aarch64_final_prescan_insn (rtx_insn *insn)
16407 {
16408 if (aarch64_madd_needs_nop (insn))
16409 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
16410 }
16411
16412
16413 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
16414 instruction. */
16415
16416 bool
16417 aarch64_sve_index_immediate_p (rtx base_or_step)
16418 {
16419 return (CONST_INT_P (base_or_step)
16420 && IN_RANGE (INTVAL (base_or_step), -16, 15));
16421 }
16422
16423 /* Return true if X is a valid immediate for the SVE ADD and SUB
16424 instructions. Negate X first if NEGATE_P is true. */
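/* The immediate must be an unsigned 8-bit value, optionally shifted left
by 8: for example 0x07 and 0x0700 are accepted, but 0x0107 is not,
matching the #imm{, shift} form of the SVE ADD/SUB immediates.  */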
16425
16426 bool
16427 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
16428 {
16429 rtx elt;
16430
16431 if (!const_vec_duplicate_p (x, &elt)
16432 || !CONST_INT_P (elt))
16433 return false;
16434
16435 HOST_WIDE_INT val = INTVAL (elt);
16436 if (negate_p)
16437 val = -val;
16438 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
16439
16440 if (val & 0xff)
16441 return IN_RANGE (val, 0, 0xff);
16442 return IN_RANGE (val, 0, 0xff00);
16443 }
16444
16445 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
16446 instructions. Negate X first if NEGATE_P is true. */
16447
16448 bool
16449 aarch64_sve_sqadd_sqsub_immediate_p (rtx x, bool negate_p)
16450 {
16451 rtx elt;
16452
16453 if (!const_vec_duplicate_p (x, &elt)
16454 || !CONST_INT_P (elt))
16455 return false;
16456
16457 if (!aarch64_sve_arith_immediate_p (x, negate_p))
16458 return false;
16459
16460 /* After the optional negation, the immediate must be nonnegative.
16461 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
16462 instead of SQADD Zn.B, Zn.B, #129. */
16463 return negate_p == (INTVAL (elt) < 0);
16464 }
16465
16466 /* Return true if X is a valid immediate operand for an SVE logical
16467 instruction such as AND. */
16468
16469 bool
16470 aarch64_sve_bitmask_immediate_p (rtx x)
16471 {
16472 rtx elt;
16473
16474 return (const_vec_duplicate_p (x, &elt)
16475 && CONST_INT_P (elt)
16476 && aarch64_bitmask_imm (INTVAL (elt),
16477 GET_MODE_INNER (GET_MODE (x))));
16478 }
16479
16480 /* Return true if X is a valid immediate for the SVE DUP and CPY
16481 instructions. */
16482
16483 bool
16484 aarch64_sve_dup_immediate_p (rtx x)
16485 {
16486 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
16487 if (!CONST_INT_P (x))
16488 return false;
16489
16490 HOST_WIDE_INT val = INTVAL (x);
16491 if (val & 0xff)
16492 return IN_RANGE (val, -0x80, 0x7f);
16493 return IN_RANGE (val, -0x8000, 0x7f00);
16494 }
16495
16496 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
16497 SIGNED_P says whether the operand is signed rather than unsigned. */
16498
16499 bool
16500 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
16501 {
16502 x = unwrap_const_vec_duplicate (x);
16503 return (CONST_INT_P (x)
16504 && (signed_p
16505 ? IN_RANGE (INTVAL (x), -16, 15)
16506 : IN_RANGE (INTVAL (x), 0, 127)));
16507 }
16508
16509 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
16510 instruction. Negate X first if NEGATE_P is true. */
16511
16512 bool
16513 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
16514 {
16515 rtx elt;
16516 REAL_VALUE_TYPE r;
16517
16518 if (!const_vec_duplicate_p (x, &elt)
16519 || GET_CODE (elt) != CONST_DOUBLE)
16520 return false;
16521
16522 r = *CONST_DOUBLE_REAL_VALUE (elt);
16523
16524 if (negate_p)
16525 r = real_value_negate (&r);
16526
16527 if (real_equal (&r, &dconst1))
16528 return true;
16529 if (real_equal (&r, &dconsthalf))
16530 return true;
16531 return false;
16532 }
16533
16534 /* Return true if X is a valid immediate operand for an SVE FMUL
16535 instruction. */
16536
16537 bool
16538 aarch64_sve_float_mul_immediate_p (rtx x)
16539 {
16540 rtx elt;
16541
16542 return (const_vec_duplicate_p (x, &elt)
16543 && GET_CODE (elt) == CONST_DOUBLE
16544 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
16545 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
16546 }
16547
16548 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
16549 for the Advanced SIMD operation described by WHICH and INSN. If INFO
16550 is nonnull, use it to describe valid immediates. */
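/* For example, replicating 0x00ab0000 is handled as the byte 0xab with
LSL #16, while 0x0000abff (a byte with ones filled in below it) uses
the MSL #8 form, which is only tried for AARCH64_CHECK_MOV.  */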
16551 static bool
16552 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
16553 simd_immediate_info *info,
16554 enum simd_immediate_check which,
16555 simd_immediate_info::insn_type insn)
16556 {
16557 /* Try a 4-byte immediate with LSL. */
16558 for (unsigned int shift = 0; shift < 32; shift += 8)
16559 if ((val32 & (0xff << shift)) == val32)
16560 {
16561 if (info)
16562 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16563 simd_immediate_info::LSL, shift);
16564 return true;
16565 }
16566
16567 /* Try a 2-byte immediate with LSL. */
16568 unsigned int imm16 = val32 & 0xffff;
16569 if (imm16 == (val32 >> 16))
16570 for (unsigned int shift = 0; shift < 16; shift += 8)
16571 if ((imm16 & (0xff << shift)) == imm16)
16572 {
16573 if (info)
16574 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
16575 simd_immediate_info::LSL, shift);
16576 return true;
16577 }
16578
16579 /* Try a 4-byte immediate with MSL, except for cases that MVN
16580 can handle. */
16581 if (which == AARCH64_CHECK_MOV)
16582 for (unsigned int shift = 8; shift < 24; shift += 8)
16583 {
16584 unsigned int low = (1 << shift) - 1;
16585 if (((val32 & (0xff << shift)) | low) == val32)
16586 {
16587 if (info)
16588 *info = simd_immediate_info (SImode, val32 >> shift, insn,
16589 simd_immediate_info::MSL, shift);
16590 return true;
16591 }
16592 }
16593
16594 return false;
16595 }
16596
16597 /* Return true if replicating VAL64 is a valid immediate for the
16598 Advanced SIMD operation described by WHICH. If INFO is nonnull,
16599 use it to describe valid immediates. */
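/* For example, when checking MOV immediates, 0x0101010101010101 is
accepted as a replicated byte and 0x00ff00ff00ff00ff is accepted by
the bit-to-bytemask test, since every byte is 0x00 or 0xff.  */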
16600 static bool
16601 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
16602 simd_immediate_info *info,
16603 enum simd_immediate_check which)
16604 {
16605 unsigned int val32 = val64 & 0xffffffff;
16606 unsigned int val16 = val64 & 0xffff;
16607 unsigned int val8 = val64 & 0xff;
16608
16609 if (val32 == (val64 >> 32))
16610 {
16611 if ((which & AARCH64_CHECK_ORR) != 0
16612 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
16613 simd_immediate_info::MOV))
16614 return true;
16615
16616 if ((which & AARCH64_CHECK_BIC) != 0
16617 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
16618 simd_immediate_info::MVN))
16619 return true;
16620
16621 /* Try using a replicated byte. */
16622 if (which == AARCH64_CHECK_MOV
16623 && val16 == (val32 >> 16)
16624 && val8 == (val16 >> 8))
16625 {
16626 if (info)
16627 *info = simd_immediate_info (QImode, val8);
16628 return true;
16629 }
16630 }
16631
16632 /* Try using a bit-to-bytemask. */
16633 if (which == AARCH64_CHECK_MOV)
16634 {
16635 unsigned int i;
16636 for (i = 0; i < 64; i += 8)
16637 {
16638 unsigned char byte = (val64 >> i) & 0xff;
16639 if (byte != 0 && byte != 0xff)
16640 break;
16641 }
16642 if (i == 64)
16643 {
16644 if (info)
16645 *info = simd_immediate_info (DImode, val64);
16646 return true;
16647 }
16648 }
16649 return false;
16650 }
16651
16652 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
16653 instruction. If INFO is nonnull, use it to describe valid immediates. */
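/* For example, 0x0101010101010101 narrows to the QImode value 1 and is a
DUP with no shift, 0x0100010001000100 narrows to HImode and is a DUP
with LSL #8, and 0x00ff00ff00ff00ff is handled as a DUPM bitmask
immediate.  */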
16654
16655 static bool
16656 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
16657 simd_immediate_info *info)
16658 {
16659 scalar_int_mode mode = DImode;
16660 unsigned int val32 = val64 & 0xffffffff;
16661 if (val32 == (val64 >> 32))
16662 {
16663 mode = SImode;
16664 unsigned int val16 = val32 & 0xffff;
16665 if (val16 == (val32 >> 16))
16666 {
16667 mode = HImode;
16668 unsigned int val8 = val16 & 0xff;
16669 if (val8 == (val16 >> 8))
16670 mode = QImode;
16671 }
16672 }
16673 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
16674 if (IN_RANGE (val, -0x80, 0x7f))
16675 {
16676 /* DUP with no shift. */
16677 if (info)
16678 *info = simd_immediate_info (mode, val);
16679 return true;
16680 }
16681 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
16682 {
16683 /* DUP with LSL #8. */
16684 if (info)
16685 *info = simd_immediate_info (mode, val);
16686 return true;
16687 }
16688 if (aarch64_bitmask_imm (val64, mode))
16689 {
16690 /* DUPM. */
16691 if (info)
16692 *info = simd_immediate_info (mode, val);
16693 return true;
16694 }
16695 return false;
16696 }
16697
16698 /* Return true if X is an UNSPEC_PTRUE constant of the form:
16699
16700 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
16701
16702 where PATTERN is the svpattern as a CONST_INT and where ZERO
16703 is a zero constant of the required PTRUE mode (which can have
16704 fewer elements than X's mode, if zero bits are significant).
16705
16706 If so, and if INFO is nonnull, describe the immediate in INFO. */
16707 bool
16708 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
16709 {
16710 if (GET_CODE (x) != CONST)
16711 return false;
16712
16713 x = XEXP (x, 0);
16714 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
16715 return false;
16716
16717 if (info)
16718 {
16719 aarch64_svpattern pattern
16720 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
16721 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
16722 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
16723 *info = simd_immediate_info (int_mode, pattern);
16724 }
16725 return true;
16726 }
16727
16728 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
16729 it to describe valid immediates. */
16730
16731 static bool
16732 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
16733 {
16734 if (aarch64_sve_ptrue_svpattern_p (x, info))
16735 return true;
16736
16737 if (x == CONST0_RTX (GET_MODE (x)))
16738 {
16739 if (info)
16740 *info = simd_immediate_info (DImode, 0);
16741 return true;
16742 }
16743
16744 /* Analyze the value as a VNx16BImode. This should be relatively
16745 efficient, since rtx_vector_builder has enough built-in capacity
16746 to store all VLA predicate constants without needing the heap. */
16747 rtx_vector_builder builder;
16748 if (!aarch64_get_sve_pred_bits (builder, x))
16749 return false;
16750
16751 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
16752 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
16753 {
16754 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
16755 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
16756 if (pattern != AARCH64_NUM_SVPATTERNS)
16757 {
16758 if (info)
16759 {
16760 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
16761 *info = simd_immediate_info (int_mode, pattern);
16762 }
16763 return true;
16764 }
16765 }
16766 return false;
16767 }
16768
16769 /* Return true if OP is a valid SIMD immediate for the operation
16770 described by WHICH. If INFO is nonnull, use it to describe valid
16771 immediates. */
16772 bool
16773 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
16774 enum simd_immediate_check which)
16775 {
16776 machine_mode mode = GET_MODE (op);
16777 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16778 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16779 return false;
16780
16781 if (vec_flags & VEC_SVE_PRED)
16782 return aarch64_sve_pred_valid_immediate (op, info);
16783
16784 scalar_mode elt_mode = GET_MODE_INNER (mode);
16785 rtx base, step;
16786 unsigned int n_elts;
16787 if (GET_CODE (op) == CONST_VECTOR
16788 && CONST_VECTOR_DUPLICATE_P (op))
16789 n_elts = CONST_VECTOR_NPATTERNS (op);
16790 else if ((vec_flags & VEC_SVE_DATA)
16791 && const_vec_series_p (op, &base, &step))
16792 {
16793 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
16794 if (!aarch64_sve_index_immediate_p (base)
16795 || !aarch64_sve_index_immediate_p (step))
16796 return false;
16797
16798 if (info)
16799 {
16800 /* Get the corresponding container mode. E.g. an INDEX on V2SI
16801 should yield two integer values per 128-bit block, meaning
16802 that we need to treat it in the same way as V2DI and then
16803 ignore the upper 32 bits of each element. */
16804 elt_mode = aarch64_sve_container_int_mode (mode);
16805 *info = simd_immediate_info (elt_mode, base, step);
16806 }
16807 return true;
16808 }
16809 else if (GET_CODE (op) == CONST_VECTOR
16810 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
16811 /* N_ELTS set above. */;
16812 else
16813 return false;
16814
16815 scalar_float_mode elt_float_mode;
16816 if (n_elts == 1
16817 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
16818 {
16819 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
16820 if (aarch64_float_const_zero_rtx_p (elt)
16821 || aarch64_float_const_representable_p (elt))
16822 {
16823 if (info)
16824 *info = simd_immediate_info (elt_float_mode, elt);
16825 return true;
16826 }
16827 }
16828
16829 /* If all elements in an SVE vector have the same value, we have a free
16830 choice between using the element mode and using the container mode.
16831 Using the element mode means that unused parts of the vector are
16832 duplicates of the used elements, while using the container mode means
16833 that the unused parts are an extension of the used elements. Using the
16834 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
16835 for its container mode VNx4SI while 0x00000101 isn't.
16836
16837 If not all elements in an SVE vector have the same value, we need the
16838 transition from one element to the next to occur at container boundaries.
16839 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
16840 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
16841 scalar_int_mode elt_int_mode;
16842 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
16843 elt_int_mode = aarch64_sve_container_int_mode (mode);
16844 else
16845 elt_int_mode = int_mode_for_mode (elt_mode).require ();
16846
16847 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
16848 if (elt_size > 8)
16849 return false;
16850
16851 /* Expand the vector constant out into a byte vector, with the least
16852 significant byte of the register first. */
16853 auto_vec<unsigned char, 16> bytes;
16854 bytes.reserve (n_elts * elt_size);
16855 for (unsigned int i = 0; i < n_elts; i++)
16856 {
16857 /* The vector is provided in gcc endian-neutral fashion.
16858 For aarch64_be Advanced SIMD, it must be laid out in the vector
16859 register in reverse order. */
16860 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
16861 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
16862
16863 if (elt_mode != elt_int_mode)
16864 elt = gen_lowpart (elt_int_mode, elt);
16865
16866 if (!CONST_INT_P (elt))
16867 return false;
16868
16869 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
16870 for (unsigned int byte = 0; byte < elt_size; byte++)
16871 {
16872 bytes.quick_push (elt_val & 0xff);
16873 elt_val >>= BITS_PER_UNIT;
16874 }
16875 }
16876
16877 /* The immediate must repeat every eight bytes. */
16878 unsigned int nbytes = bytes.length ();
16879 for (unsigned i = 8; i < nbytes; ++i)
16880 if (bytes[i] != bytes[i - 8])
16881 return false;
16882
16883 /* Get the repeating 8-byte value as an integer. No endian correction
16884 is needed here because bytes is already in lsb-first order. */
16885 unsigned HOST_WIDE_INT val64 = 0;
16886 for (unsigned int i = 0; i < 8; i++)
16887 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
16888 << (i * BITS_PER_UNIT));
16889
16890 if (vec_flags & VEC_SVE_DATA)
16891 return aarch64_sve_valid_immediate (val64, info);
16892 else
16893 return aarch64_advsimd_valid_immediate (val64, info, which);
16894 }
16895
16896 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
16897 has a step in the range accepted by the SVE INDEX instruction.
16898 Return the step if so, otherwise return null. */
16899 rtx
16900 aarch64_check_zero_based_sve_index_immediate (rtx x)
16901 {
16902 rtx base, step;
16903 if (const_vec_series_p (x, &base, &step)
16904 && base == const0_rtx
16905 && aarch64_sve_index_immediate_p (step))
16906 return step;
16907 return NULL_RTX;
16908 }
16909
16910 /* Check if immediate shift constants are within range. */
16911 bool
16912 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
16913 {
16914 x = unwrap_const_vec_duplicate (x);
16915 if (!CONST_INT_P (x))
16916 return false;
16917 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
16918 if (left)
16919 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
16920 else
16921 return IN_RANGE (INTVAL (x), 1, bit_width);
16922 }
16923
16924 /* Return the bitmask CONST_INT to select the bits required by a zero extract
16925 operation of width WIDTH at bit position POS. */
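/* For example, WIDTH == 4 and POS == 8 give the mask 0xf00.  */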
16926
16927 rtx
16928 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
16929 {
16930 gcc_assert (CONST_INT_P (width));
16931 gcc_assert (CONST_INT_P (pos));
16932
16933 unsigned HOST_WIDE_INT mask
16934 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
16935 return GEN_INT (mask << UINTVAL (pos));
16936 }
16937
16938 bool
16939 aarch64_mov_operand_p (rtx x, machine_mode mode)
16940 {
16941 if (GET_CODE (x) == HIGH
16942 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
16943 return true;
16944
16945 if (CONST_INT_P (x))
16946 return true;
16947
16948 if (VECTOR_MODE_P (GET_MODE (x)))
16949 {
16950 /* Require predicate constants to be VNx16BI before RA, so that we
16951 force everything to have a canonical form. */
16952 if (!lra_in_progress
16953 && !reload_completed
16954 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
16955 && GET_MODE (x) != VNx16BImode)
16956 return false;
16957
16958 return aarch64_simd_valid_immediate (x, NULL);
16959 }
16960
16961 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
16962 return true;
16963
16964 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
16965 return true;
16966
16967 return aarch64_classify_symbolic_expression (x)
16968 == SYMBOL_TINY_ABSOLUTE;
16969 }
16970
16971 /* Return a const_int vector of VAL. */
16972 rtx
16973 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
16974 {
16975 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
16976 return gen_const_vec_duplicate (mode, c);
16977 }
16978
16979 /* Check OP is a legal scalar immediate for the MOVI instruction. */
16980
16981 bool
16982 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
16983 {
16984 machine_mode vmode;
16985
16986 vmode = aarch64_simd_container_mode (mode, 64);
16987 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
16988 return aarch64_simd_valid_immediate (op_v, NULL);
16989 }
16990
16991 /* Construct and return a PARALLEL RTX vector with elements numbering the
16992 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
16993 the vector - from the perspective of the architecture. This does not
16994 line up with GCC's perspective on lane numbers, so we end up with
16995 different masks depending on our target endian-ness. The diagram
16996 below may help. We must draw the distinction when building masks
16997 which select one half of the vector. An instruction selecting
16998 architectural low-lanes for a big-endian target, must be described using
16999 a mask selecting GCC high-lanes.
17000
17001 Big-Endian Little-Endian
17002
17003 GCC 0 1 2 3 3 2 1 0
17004 | x | x | x | x | | x | x | x | x |
17005 Architecture 3 2 1 0 3 2 1 0
17006
17007 Low Mask: { 2, 3 } { 0, 1 }
17008 High Mask: { 0, 1 } { 2, 3 }
17009
17010 MODE Is the mode of the vector and NUNITS is the number of units in it. */
17011
17012 rtx
17013 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
17014 {
17015 rtvec v = rtvec_alloc (nunits / 2);
17016 int high_base = nunits / 2;
17017 int low_base = 0;
17018 int base;
17019 rtx t1;
17020 int i;
17021
17022 if (BYTES_BIG_ENDIAN)
17023 base = high ? low_base : high_base;
17024 else
17025 base = high ? high_base : low_base;
17026
17027 for (i = 0; i < nunits / 2; i++)
17028 RTVEC_ELT (v, i) = GEN_INT (base + i);
17029
17030 t1 = gen_rtx_PARALLEL (mode, v);
17031 return t1;
17032 }
17033
17034 /* Check OP for validity as a PARALLEL RTX vector with elements
17035 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
17036 from the perspective of the architecture. See the diagram above
17037 aarch64_simd_vect_par_cnst_half for more details. */
17038
17039 bool
17040 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
17041 bool high)
17042 {
17043 int nelts;
17044 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
17045 return false;
17046
17047 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
17048 HOST_WIDE_INT count_op = XVECLEN (op, 0);
17049 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
17050 int i = 0;
17051
17052 if (count_op != count_ideal)
17053 return false;
17054
17055 for (i = 0; i < count_ideal; i++)
17056 {
17057 rtx elt_op = XVECEXP (op, 0, i);
17058 rtx elt_ideal = XVECEXP (ideal, 0, i);
17059
17060 if (!CONST_INT_P (elt_op)
17061 || INTVAL (elt_ideal) != INTVAL (elt_op))
17062 return false;
17063 }
17064 return true;
17065 }
17066
17067 /* Return a PARALLEL containing NELTS elements, with element I equal
17068 to BASE + I * STEP. */
17069
17070 rtx
17071 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
17072 {
17073 rtvec vec = rtvec_alloc (nelts);
17074 for (unsigned int i = 0; i < nelts; ++i)
17075 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
17076 return gen_rtx_PARALLEL (VOIDmode, vec);
17077 }
17078
17079 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
17080 series with step STEP. */
17081
17082 bool
17083 aarch64_stepped_int_parallel_p (rtx op, int step)
17084 {
17085 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
17086 return false;
17087
17088 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
17089 for (int i = 1; i < XVECLEN (op, 0); ++i)
17090 if (!CONST_INT_P (XVECEXP (op, 0, i))
17091 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
17092 return false;
17093
17094 return true;
17095 }
17096
17097 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
17098 HIGH (exclusive). */
17099 void
17100 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
17101 const_tree exp)
17102 {
17103 HOST_WIDE_INT lane;
17104 gcc_assert (CONST_INT_P (operand));
17105 lane = INTVAL (operand);
17106
17107 if (lane < low || lane >= high)
17108 {
17109 if (exp)
17110 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
17111 else
17112 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
17113 }
17114 }
17115
17116 /* Perform endian correction on lane number N, which indexes a vector
17117 of mode MODE, and return the result as an SImode rtx. */
17118
17119 rtx
17120 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
17121 {
17122 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
17123 }
17124
17125 /* Return TRUE if OP is a valid vector addressing mode. */
17126
17127 bool
17128 aarch64_simd_mem_operand_p (rtx op)
17129 {
17130 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
17131 || REG_P (XEXP (op, 0)));
17132 }
17133
17134 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
17135
17136 bool
17137 aarch64_sve_ld1r_operand_p (rtx op)
17138 {
17139 struct aarch64_address_info addr;
17140 scalar_mode mode;
17141
17142 return (MEM_P (op)
17143 && is_a <scalar_mode> (GET_MODE (op), &mode)
17144 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
17145 && addr.type == ADDRESS_REG_IMM
17146 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
17147 }
17148
17149 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
17150 bool
17151 aarch64_sve_ld1rq_operand_p (rtx op)
17152 {
17153 struct aarch64_address_info addr;
17154 scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
17155 if (!MEM_P (op)
17156 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
17157 return false;
17158
17159 if (addr.type == ADDRESS_REG_IMM)
17160 return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
17161
17162 if (addr.type == ADDRESS_REG_REG)
17163 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
17164
17165 return false;
17166 }
17167
17168 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
17169 bool
17170 aarch64_sve_ldff1_operand_p (rtx op)
17171 {
17172 if (!MEM_P (op))
17173 return false;
17174
17175 struct aarch64_address_info addr;
17176 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
17177 return false;
17178
17179 if (addr.type == ADDRESS_REG_IMM)
17180 return known_eq (addr.const_offset, 0);
17181
17182 return addr.type == ADDRESS_REG_REG;
17183 }
17184
17185 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
17186 bool
17187 aarch64_sve_ldnf1_operand_p (rtx op)
17188 {
17189 struct aarch64_address_info addr;
17190
17191 return (MEM_P (op)
17192 && aarch64_classify_address (&addr, XEXP (op, 0),
17193 GET_MODE (op), false)
17194 && addr.type == ADDRESS_REG_IMM);
17195 }
17196
17197 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
17198 The conditions for STR are the same. */
17199 bool
17200 aarch64_sve_ldr_operand_p (rtx op)
17201 {
17202 struct aarch64_address_info addr;
17203
17204 return (MEM_P (op)
17205 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
17206 false, ADDR_QUERY_ANY)
17207 && addr.type == ADDRESS_REG_IMM);
17208 }
17209
17210 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
17211 addressing memory of mode MODE. */
17212 bool
17213 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
17214 {
17215 struct aarch64_address_info addr;
17216 if (!aarch64_classify_address (&addr, op, mode, false))
17217 return false;
17218
17219 if (addr.type == ADDRESS_REG_IMM)
17220 return known_eq (addr.const_offset, 0);
17221
17222 return addr.type == ADDRESS_REG_REG;
17223 }
17224
17225 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
17226 We need to be able to access the individual pieces, so the range
17227 is different from LD[234] and ST[234]. */
17228 bool
17229 aarch64_sve_struct_memory_operand_p (rtx op)
17230 {
17231 if (!MEM_P (op))
17232 return false;
17233
17234 machine_mode mode = GET_MODE (op);
17235 struct aarch64_address_info addr;
17236 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
17237 ADDR_QUERY_ANY)
17238 || addr.type != ADDRESS_REG_IMM)
17239 return false;
17240
17241 poly_int64 first = addr.const_offset;
17242 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
17243 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
17244 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
17245 }
17246
17247 /* Emit a register copy from operand to operand, taking care not to
17248 early-clobber source registers in the process.
17249
17250 COUNT is the number of components into which the copy needs to be
17251 decomposed. */
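/* For example, copying an OImode value from v1:v2 to v2:v3 must move v2
into v3 before moving v1 into v2; the reversed loop below handles this
overlapping case.  */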
17252 void
17253 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
17254 unsigned int count)
17255 {
17256 unsigned int i;
17257 int rdest = REGNO (operands[0]);
17258 int rsrc = REGNO (operands[1]);
17259
17260 if (!reg_overlap_mentioned_p (operands[0], operands[1])
17261 || rdest < rsrc)
17262 for (i = 0; i < count; i++)
17263 emit_move_insn (gen_rtx_REG (mode, rdest + i),
17264 gen_rtx_REG (mode, rsrc + i));
17265 else
17266 for (i = 0; i < count; i++)
17267 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
17268 gen_rtx_REG (mode, rsrc + count - i - 1));
17269 }
17270
17271 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
17272 one of VSTRUCT modes: OI, CI, or XI. */
17273 int
17274 aarch64_simd_attr_length_rglist (machine_mode mode)
17275 {
17276 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
17277 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
17278 }
17279
17280 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
17281 alignment of a vector to 128 bits. SVE predicates have an alignment of
17282 16 bits. */
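/* For example, int32x4_t keeps its natural 128-bit alignment, a 256-bit
GNU vector type is capped at 128 bits, and SVE predicate types such as
svbool_t get 16-bit alignment.  */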
17283 static HOST_WIDE_INT
17284 aarch64_simd_vector_alignment (const_tree type)
17285 {
17286 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
17287 be set for non-predicate vectors of booleans. Modes are the most
17288 direct way we have of identifying real SVE predicate types. */
17289 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
17290 return 16;
17291 widest_int min_size
17292 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
17293 return wi::umin (min_size, 128).to_uhwi ();
17294 }
17295
17296 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
17297 static poly_uint64
17298 aarch64_vectorize_preferred_vector_alignment (const_tree type)
17299 {
17300 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
17301 {
17302 /* If the length of the vector is fixed, try to align to that length,
17303 otherwise don't try to align at all. */
17304 HOST_WIDE_INT result;
17305 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
17306 result = TYPE_ALIGN (TREE_TYPE (type));
17307 return result;
17308 }
17309 return TYPE_ALIGN (type);
17310 }
17311
17312 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
17313 static bool
17314 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
17315 {
17316 if (is_packed)
17317 return false;
17318
17319 /* For fixed-length vectors, check that the vectorizer will aim for
17320 full-vector alignment. This isn't true for generic GCC vectors
17321 that are wider than the ABI maximum of 128 bits. */
17322 poly_uint64 preferred_alignment =
17323 aarch64_vectorize_preferred_vector_alignment (type);
17324 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
17325 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
17326 preferred_alignment))
17327 return false;
17328
17329 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
17330 return true;
17331 }
17332
17333 /* Return true if the vector misalignment factor is supported by the
17334 target. */
17335 static bool
17336 aarch64_builtin_support_vector_misalignment (machine_mode mode,
17337 const_tree type, int misalignment,
17338 bool is_packed)
17339 {
17340 if (TARGET_SIMD && STRICT_ALIGNMENT)
17341 {
17342 /* Return false if the movmisalign pattern is not supported for this mode. */
17343 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
17344 return false;
17345
17346 /* Misalignment factor is unknown at compile time. */
17347 if (misalignment == -1)
17348 return false;
17349 }
17350 return default_builtin_support_vector_misalignment (mode, type, misalignment,
17351 is_packed);
17352 }
17353
17354 /* If VALS is a vector constant that can be loaded into a register
17355 using DUP, generate instructions to do so and return an RTX to
17356 assign to the register. Otherwise return NULL_RTX. */
17357 static rtx
17358 aarch64_simd_dup_constant (rtx vals)
17359 {
17360 machine_mode mode = GET_MODE (vals);
17361 machine_mode inner_mode = GET_MODE_INNER (mode);
17362 rtx x;
17363
17364 if (!const_vec_duplicate_p (vals, &x))
17365 return NULL_RTX;
17366
17367 /* We can load this constant by using DUP and a constant in a
17368 single general-purpose register. This will be cheaper than a vector
17369 load. */
17370 x = copy_to_mode_reg (inner_mode, x);
17371 return gen_vec_duplicate (mode, x);
17372 }
17373
17374
17375 /* Generate code to load VALS, which is a PARALLEL containing only
17376 constants (for vec_init) or CONST_VECTOR, efficiently into a
17377 register. Returns an RTX to copy into the register, or NULL_RTX
17378 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
17379 static rtx
17380 aarch64_simd_make_constant (rtx vals)
17381 {
17382 machine_mode mode = GET_MODE (vals);
17383 rtx const_dup;
17384 rtx const_vec = NULL_RTX;
17385 int n_const = 0;
17386 int i;
17387
17388 if (GET_CODE (vals) == CONST_VECTOR)
17389 const_vec = vals;
17390 else if (GET_CODE (vals) == PARALLEL)
17391 {
17392 /* A CONST_VECTOR must contain only CONST_INTs and
17393 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
17394 Only store valid constants in a CONST_VECTOR. */
17395 int n_elts = XVECLEN (vals, 0);
17396 for (i = 0; i < n_elts; ++i)
17397 {
17398 rtx x = XVECEXP (vals, 0, i);
17399 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17400 n_const++;
17401 }
17402 if (n_const == n_elts)
17403 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
17404 }
17405 else
17406 gcc_unreachable ();
17407
17408 if (const_vec != NULL_RTX
17409 && aarch64_simd_valid_immediate (const_vec, NULL))
17410 /* Load using MOVI/MVNI. */
17411 return const_vec;
17412 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
17413 /* Loaded using DUP. */
17414 return const_dup;
17415 else if (const_vec != NULL_RTX)
17416 /* Load from constant pool. We cannot take advantage of single-cycle
17417 LD1 because we need a PC-relative addressing mode. */
17418 return const_vec;
17419 else
17420 /* A PARALLEL containing something not valid inside CONST_VECTOR.
17421 We cannot construct an initializer. */
17422 return NULL_RTX;
17423 }
17424
17425 /* Expand a vector initialisation sequence, such that TARGET is
17426 initialised to contain VALS. */
17427
17428 void
17429 aarch64_expand_vector_init (rtx target, rtx vals)
17430 {
17431 machine_mode mode = GET_MODE (target);
17432 scalar_mode inner_mode = GET_MODE_INNER (mode);
17433 /* The number of vector elements. */
17434 int n_elts = XVECLEN (vals, 0);
17435 /* The number of vector elements which are not constant. */
17436 int n_var = 0;
17437 rtx any_const = NULL_RTX;
17438 /* The first element of vals. */
17439 rtx v0 = XVECEXP (vals, 0, 0);
17440 bool all_same = true;
17441
17442 /* This is a special vec_init<M><N> where N is not an element mode but a
17443 vector mode with half the elements of M. We expect to find two entries
17444 of mode N in VALS and we must put their concatenation into TARGET. */
17445 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
17446 {
17447 gcc_assert (known_eq (GET_MODE_SIZE (mode),
17448 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
17449 rtx lo = XVECEXP (vals, 0, 0);
17450 rtx hi = XVECEXP (vals, 0, 1);
17451 machine_mode narrow_mode = GET_MODE (lo);
17452 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
17453 gcc_assert (narrow_mode == GET_MODE (hi));
17454
17455 /* When we want to concatenate a half-width vector with zeroes we can
17456 use the aarch64_combinez[_be] patterns. Just make sure that the
17457 zeroes are in the right half. */
17458 if (BYTES_BIG_ENDIAN
17459 && aarch64_simd_imm_zero (lo, narrow_mode)
17460 && general_operand (hi, narrow_mode))
17461 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
17462 else if (!BYTES_BIG_ENDIAN
17463 && aarch64_simd_imm_zero (hi, narrow_mode)
17464 && general_operand (lo, narrow_mode))
17465 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
17466 else
17467 {
17468 /* Else create the two half-width registers and combine them. */
17469 if (!REG_P (lo))
17470 lo = force_reg (GET_MODE (lo), lo);
17471 if (!REG_P (hi))
17472 hi = force_reg (GET_MODE (hi), hi);
17473
17474 if (BYTES_BIG_ENDIAN)
17475 std::swap (lo, hi);
17476 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
17477 }
17478 return;
17479 }
17480
17481 /* Count the number of variable elements to initialise. */
17482 for (int i = 0; i < n_elts; ++i)
17483 {
17484 rtx x = XVECEXP (vals, 0, i);
17485 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
17486 ++n_var;
17487 else
17488 any_const = x;
17489
17490 all_same &= rtx_equal_p (x, v0);
17491 }
17492
17493 /* No variable elements, hand off to aarch64_simd_make_constant which knows
17494 how best to handle this. */
17495 if (n_var == 0)
17496 {
17497 rtx constant = aarch64_simd_make_constant (vals);
17498 if (constant != NULL_RTX)
17499 {
17500 emit_move_insn (target, constant);
17501 return;
17502 }
17503 }
17504
17505 /* Splat a single non-constant element if we can. */
17506 if (all_same)
17507 {
17508 rtx x = copy_to_mode_reg (inner_mode, v0);
17509 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17510 return;
17511 }
17512
17513 enum insn_code icode = optab_handler (vec_set_optab, mode);
17514 gcc_assert (icode != CODE_FOR_nothing);
17515
17516 /* If there are only variable elements, try to optimize
17517 the insertion using dup for the most common element
17518 followed by insertions. */
17519
17520 /* The algorithm will fill matches[*][0] with the earliest matching element,
17521 and matches[X][1] with the count of duplicate elements (if X is the
17522 earliest element which has duplicates). */
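/* For example, for lanes {a, b, a, a} this gives matches[0] = {0, 3},
   matches[1] = {1, 1}, matches[2] = {0, 0} and matches[3] = {0, 0}, so
   lane 0 (value a) is chosen as the element to duplicate.  */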
17523
17524 if (n_var == n_elts && n_elts <= 16)
17525 {
17526 int matches[16][2] = {0};
17527 for (int i = 0; i < n_elts; i++)
17528 {
17529 for (int j = 0; j <= i; j++)
17530 {
17531 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
17532 {
17533 matches[i][0] = j;
17534 matches[j][1]++;
17535 break;
17536 }
17537 }
17538 }
17539 int maxelement = 0;
17540 int maxv = 0;
17541 for (int i = 0; i < n_elts; i++)
17542 if (matches[i][1] > maxv)
17543 {
17544 maxelement = i;
17545 maxv = matches[i][1];
17546 }
17547
17548 /* Create a duplicate of the most common element, unless all elements
17549 are equally useless to us, in which case just immediately set the
17550 vector register using the first element. */
17551
17552 if (maxv == 1)
17553 {
17554 /* For vectors of two 64-bit elements, we can do even better. */
17555 if (n_elts == 2
17556 && (inner_mode == E_DImode
17557 || inner_mode == E_DFmode))
17558
17559 {
17560 rtx x0 = XVECEXP (vals, 0, 0);
17561 rtx x1 = XVECEXP (vals, 0, 1);
17562 /* Combine can pick up this case, but handling it directly
17563 here leaves clearer RTL.
17564
17565 This is load_pair_lanes<mode>, and also gives us a clean-up
17566 for store_pair_lanes<mode>. */
17567 if (memory_operand (x0, inner_mode)
17568 && memory_operand (x1, inner_mode)
17569 && !STRICT_ALIGNMENT
17570 && rtx_equal_p (XEXP (x1, 0),
17571 plus_constant (Pmode,
17572 XEXP (x0, 0),
17573 GET_MODE_SIZE (inner_mode))))
17574 {
17575 rtx t;
17576 if (inner_mode == DFmode)
17577 t = gen_load_pair_lanesdf (target, x0, x1);
17578 else
17579 t = gen_load_pair_lanesdi (target, x0, x1);
17580 emit_insn (t);
17581 return;
17582 }
17583 }
17584 /* The subreg-move sequence below will move into lane zero of the
17585 vector register. For big-endian we want that position to hold
17586 the last element of VALS. */
17587 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
17588 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17589 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
17590 }
17591 else
17592 {
17593 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
17594 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
17595 }
17596
17597 /* Insert the rest. */
17598 for (int i = 0; i < n_elts; i++)
17599 {
17600 rtx x = XVECEXP (vals, 0, i);
17601 if (matches[i][0] == maxelement)
17602 continue;
17603 x = copy_to_mode_reg (inner_mode, x);
17604 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17605 }
17606 return;
17607 }
17608
17609 /* Initialise a vector which is part-variable. We want to first try
17610 to build those lanes which are constant in the most efficient way we
17611 can. */
17612 if (n_var != n_elts)
17613 {
17614 rtx copy = copy_rtx (vals);
17615
17616 /* Load constant part of vector. We really don't care what goes into the
17617 parts we will overwrite, but we're more likely to be able to load the
17618 constant efficiently if it has fewer, larger, repeating parts
17619 (see aarch64_simd_valid_immediate). */
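/* For example, for { x, 1, x, 1 } the loop below substitutes the constant 1
   for both variable lanes, so the constant part is loaded as { 1, 1, 1, 1 }
   (a single repeating constant) before x is inserted into lanes 0 and 2.  */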
17620 for (int i = 0; i < n_elts; i++)
17621 {
17622 rtx x = XVECEXP (vals, 0, i);
17623 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17624 continue;
17625 rtx subst = any_const;
17626 for (int bit = n_elts / 2; bit > 0; bit /= 2)
17627 {
17628 /* Look in the copied vector, as more elements are const. */
17629 rtx test = XVECEXP (copy, 0, i ^ bit);
17630 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
17631 {
17632 subst = test;
17633 break;
17634 }
17635 }
17636 XVECEXP (copy, 0, i) = subst;
17637 }
17638 aarch64_expand_vector_init (target, copy);
17639 }
17640
17641 /* Insert the variable lanes directly. */
17642 for (int i = 0; i < n_elts; i++)
17643 {
17644 rtx x = XVECEXP (vals, 0, i);
17645 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
17646 continue;
17647 x = copy_to_mode_reg (inner_mode, x);
17648 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
17649 }
17650 }
17651
17652 /* Emit RTL corresponding to:
17653 insr TARGET, ELEM. */
17654
17655 static void
17656 emit_insr (rtx target, rtx elem)
17657 {
17658 machine_mode mode = GET_MODE (target);
17659 scalar_mode elem_mode = GET_MODE_INNER (mode);
17660 elem = force_reg (elem_mode, elem);
17661
17662 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
17663 gcc_assert (icode != CODE_FOR_nothing);
17664 emit_insn (GEN_FCN (icode) (target, target, elem));
17665 }
17666
17667 /* Subroutine of aarch64_sve_expand_vector_init for handling
17668 trailing constants.
17669 This function works as follows:
17670 (a) Create a new vector consisting of trailing constants.
17671 (b) Initialize TARGET with the constant vector using emit_move_insn.
17672 (c) Insert remaining elements in TARGET using insr.
17673 NELTS is the total number of elements in the original vector, while
17674 NELTS_REQD is the number of elements that are actually
17675 significant.
17676
17677 ??? The heuristic used is to do the above only if the number of constants
17678 is at least half the total number of elements. May need fine tuning. */
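/* For example, if the significant lanes are { x, y, 1, 2 }, TARGET is first
   set from the constant vector built out of the trailing { 1, 2 }, after
   which INSR inserts y and then x at element 0, giving { x, y, 1, 2, ... }.  */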
17679
17680 static bool
17681 aarch64_sve_expand_vector_init_handle_trailing_constants
17682 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
17683 {
17684 machine_mode mode = GET_MODE (target);
17685 scalar_mode elem_mode = GET_MODE_INNER (mode);
17686 int n_trailing_constants = 0;
17687
17688 for (int i = nelts_reqd - 1;
17689 i >= 0 && aarch64_legitimate_constant_p (elem_mode, builder.elt (i));
17690 i--)
17691 n_trailing_constants++;
17692
17693 if (n_trailing_constants >= nelts_reqd / 2)
17694 {
17695 rtx_vector_builder v (mode, 1, nelts);
17696 for (int i = 0; i < nelts; i++)
17697 v.quick_push (builder.elt (i + nelts_reqd - n_trailing_constants));
17698 rtx const_vec = v.build ();
17699 emit_move_insn (target, const_vec);
17700
17701 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
17702 emit_insr (target, builder.elt (i));
17703
17704 return true;
17705 }
17706
17707 return false;
17708 }
17709
17710 /* Subroutine of aarch64_sve_expand_vector_init.
17711 Works as follows:
17712 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
17713 (b) Skip trailing elements from BUILDER, which are the same as
17714 element NELTS_REQD - 1.
17715 (c) Insert earlier elements in reverse order in TARGET using insr. */
17716
17717 static void
17718 aarch64_sve_expand_vector_init_insert_elems (rtx target,
17719 const rtx_vector_builder &builder,
17720 int nelts_reqd)
17721 {
17722 machine_mode mode = GET_MODE (target);
17723 scalar_mode elem_mode = GET_MODE_INNER (mode);
17724
17725 struct expand_operand ops[2];
17726 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
17727 gcc_assert (icode != CODE_FOR_nothing);
17728
17729 create_output_operand (&ops[0], target, mode);
17730 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
17731 expand_insn (icode, 2, ops);
17732
17733 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17734 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
17735 emit_insr (target, builder.elt (i));
17736 }
17737
17738 /* Subroutine of aarch64_sve_expand_vector_init to handle case
17739 when all trailing elements of builder are same.
17740 This works as follows:
17741 (a) Use expand_insn interface to broadcast last vector element in TARGET.
17742 (b) Insert remaining elements in TARGET using insr.
17743
17744 ??? The heuristic used is to do the above if the number of identical trailing
17745 elements is at least 3/4 of the total number of elements, loosely based on
17746 heuristic from mostly_zeros_p. May need fine-tuning. */
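/* For example, for significant lanes { a, b, b, b }, b is broadcast into
   TARGET and a single "insr target, a" then gives { a, b, b, b, ... }.  */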
17747
17748 static bool
17749 aarch64_sve_expand_vector_init_handle_trailing_same_elem
17750 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
17751 {
17752 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
17753 if (ndups >= (3 * nelts_reqd) / 4)
17754 {
17755 aarch64_sve_expand_vector_init_insert_elems (target, builder,
17756 nelts_reqd - ndups + 1);
17757 return true;
17758 }
17759
17760 return false;
17761 }
17762
17763 /* Initialize register TARGET from BUILDER. NELTS is the constant number
17764 of elements in BUILDER.
17765
17766 The function tries to initialize TARGET from BUILDER if it fits one
17767 of the special cases outlined below.
17768
17769 Failing that, the function divides BUILDER into two sub-vectors:
17770 v_even = even elements of BUILDER;
17771 v_odd = odd elements of BUILDER;
17772
17773 and recursively calls itself with v_even and v_odd.
17774
17775 if (recursive call succeeded for v_even or v_odd)
17776 TARGET = zip (v_even, v_odd)
17777
17778 The function returns true if it managed to build TARGET from BUILDER
17779 with one of the special cases, false otherwise.
17780
17781 Example: {a, 1, b, 2, c, 3, d, 4}
17782
17783 The vector gets divided into:
17784 v_even = {a, b, c, d}
17785 v_odd = {1, 2, 3, 4}
17786
17787 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
17788 initializes tmp2 from the constant vector v_odd using emit_move_insn.
17789
17790 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
17791 4 elements, so we construct tmp1 from v_even using insr:
17792 tmp1 = dup(d)
17793 insr tmp1, c
17794 insr tmp1, b
17795 insr tmp1, a
17796
17797 And finally:
17798 TARGET = zip (tmp1, tmp2)
17799 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
17800
17801 static bool
17802 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
17803 int nelts, int nelts_reqd)
17804 {
17805 machine_mode mode = GET_MODE (target);
17806
17807 /* Case 1: Vector contains trailing constants. */
17808
17809 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17810 (target, builder, nelts, nelts_reqd))
17811 return true;
17812
17813 /* Case 2: Vector contains leading constants. */
17814
17815 rtx_vector_builder rev_builder (mode, 1, nelts_reqd);
17816 for (int i = 0; i < nelts_reqd; i++)
17817 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
17818 rev_builder.finalize ();
17819
17820 if (aarch64_sve_expand_vector_init_handle_trailing_constants
17821 (target, rev_builder, nelts, nelts_reqd))
17822 {
17823 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17824 return true;
17825 }
17826
17827 /* Case 3: Vector contains trailing same element. */
17828
17829 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17830 (target, builder, nelts_reqd))
17831 return true;
17832
17833 /* Case 4: Vector contains leading same element. */
17834
17835 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
17836 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
17837 {
17838 emit_insn (gen_aarch64_sve_rev (mode, target, target));
17839 return true;
17840 }
17841
17842 /* Avoid recursing below 4 elements.
17843 ??? The threshold 4 may need fine-tuning. */
17844
17845 if (nelts_reqd <= 4)
17846 return false;
17847
17848 rtx_vector_builder v_even (mode, 1, nelts);
17849 rtx_vector_builder v_odd (mode, 1, nelts);
17850
17851 for (int i = 0; i < nelts * 2; i += 2)
17852 {
17853 v_even.quick_push (builder.elt (i));
17854 v_odd.quick_push (builder.elt (i + 1));
17855 }
17856
17857 v_even.finalize ();
17858 v_odd.finalize ();
17859
17860 rtx tmp1 = gen_reg_rtx (mode);
17861 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
17862 nelts, nelts_reqd / 2);
17863
17864 rtx tmp2 = gen_reg_rtx (mode);
17865 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
17866 nelts, nelts_reqd / 2);
17867
17868 if (!did_even_p && !did_odd_p)
17869 return false;
17870
17871 /* Initialize v_even and v_odd using INSR if it didn't match any of the
17872 special cases and zip v_even, v_odd. */
17873
17874 if (!did_even_p)
17875 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
17876
17877 if (!did_odd_p)
17878 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
17879
17880 rtvec v = gen_rtvec (2, tmp1, tmp2);
17881 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
17882 return true;
17883 }
17884
17885 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
17886
17887 void
17888 aarch64_sve_expand_vector_init (rtx target, rtx vals)
17889 {
17890 machine_mode mode = GET_MODE (target);
17891 int nelts = XVECLEN (vals, 0);
17892
17893 rtx_vector_builder v (mode, 1, nelts);
17894 for (int i = 0; i < nelts; i++)
17895 v.quick_push (XVECEXP (vals, 0, i));
17896 v.finalize ();
17897
17898 /* If neither sub-vectors of v could be initialized specially,
17899 then use INSR to insert all elements from v into TARGET.
17900 ??? This might not be optimal for vectors with large
17901 initializers like 16-element or above.
17902 For nelts < 4, it probably isn't useful to handle specially. */
17903
17904 if (nelts < 4
17905 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
17906 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
17907 }
17908
17909 /* Check whether VALUE is a vector constant in which every element
17910 is either a power of 2 or a negated power of 2. If so, return
17911 a constant vector of log2s, and flip CODE between PLUS and MINUS
17912 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
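/* For example, { 4, 4, 4, 4 } becomes { 2, 2, 2, 2 } with CODE unchanged,
   while { -8, -8, -8, -8 } becomes { 3, 3, 3, 3 } with CODE flipped between
   PLUS and MINUS.  */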
17913
17914 static rtx
17915 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
17916 {
17917 if (GET_CODE (value) != CONST_VECTOR)
17918 return NULL_RTX;
17919
17920 rtx_vector_builder builder;
17921 if (!builder.new_unary_operation (GET_MODE (value), value, false))
17922 return NULL_RTX;
17923
17924 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
17925 /* 1 if the result of the multiplication must be negated,
17926 0 if it mustn't, or -1 if we don't yet care. */
17927 int negate = -1;
17928 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
17929 for (unsigned int i = 0; i < encoded_nelts; ++i)
17930 {
17931 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
17932 if (!CONST_SCALAR_INT_P (elt))
17933 return NULL_RTX;
17934 rtx_mode_t val (elt, int_mode);
17935 wide_int pow2 = wi::neg (val);
17936 if (val != pow2)
17937 {
17938 /* It matters whether we negate or not. Make that choice,
17939 and make sure that it's consistent with previous elements. */
17940 if (negate == !wi::neg_p (val))
17941 return NULL_RTX;
17942 negate = wi::neg_p (val);
17943 if (!negate)
17944 pow2 = val;
17945 }
17946 /* POW2 is now the value that we want to be a power of 2. */
17947 int shift = wi::exact_log2 (pow2);
17948 if (shift < 0)
17949 return NULL_RTX;
17950 builder.quick_push (gen_int_mode (shift, int_mode));
17951 }
17952 if (negate == -1)
17953 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
17954 code = PLUS;
17955 else if (negate == 1)
17956 code = code == PLUS ? MINUS : PLUS;
17957 return builder.build ();
17958 }
17959
17960 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
17961 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
17962 operands array, in the same order as for fma_optab. Return true if
17963 the function emitted all the necessary instructions, false if the caller
17964 should generate the pattern normally with the new OPERANDS array. */
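/* For example, if the multiplier operands[2] is the constant vector
   { 4, 4, ... }, the operation is emitted as a vector shift left by 2
   followed by an add (or a subtract, if the multiplier had been
   { -4, -4, ... }).  */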
17965
17966 bool
17967 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
17968 {
17969 machine_mode mode = GET_MODE (operands[0]);
17970 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
17971 {
17972 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
17973 NULL_RTX, true, OPTAB_DIRECT);
17974 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
17975 operands[3], product, operands[0], true,
17976 OPTAB_DIRECT);
17977 return true;
17978 }
17979 operands[2] = force_reg (mode, operands[2]);
17980 return false;
17981 }
17982
17983 /* Likewise, but for a conditional pattern. */
17984
17985 bool
17986 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
17987 {
17988 machine_mode mode = GET_MODE (operands[0]);
17989 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
17990 {
17991 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
17992 NULL_RTX, true, OPTAB_DIRECT);
17993 emit_insn (gen_cond (code, mode, operands[0], operands[1],
17994 operands[4], product, operands[5]));
17995 return true;
17996 }
17997 operands[3] = force_reg (mode, operands[3]);
17998 return false;
17999 }
18000
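/* Return the mask to apply to shift counts of MODE: zero when shift counts
   are not truncated (!SHIFT_COUNT_TRUNCATED or a vector data mode), otherwise
   one less than the element width in bits.  */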
18001 static unsigned HOST_WIDE_INT
18002 aarch64_shift_truncation_mask (machine_mode mode)
18003 {
18004 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
18005 return 0;
18006 return GET_MODE_UNIT_BITSIZE (mode) - 1;
18007 }
18008
18009 /* Select a format to encode pointers in exception handling data. */
18010 int
18011 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
18012 {
18013 int type;
18014 switch (aarch64_cmodel)
18015 {
18016 case AARCH64_CMODEL_TINY:
18017 case AARCH64_CMODEL_TINY_PIC:
18018 case AARCH64_CMODEL_SMALL:
18019 case AARCH64_CMODEL_SMALL_PIC:
18020 case AARCH64_CMODEL_SMALL_SPIC:
18021 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
18022 for everything. */
18023 type = DW_EH_PE_sdata4;
18024 break;
18025 default:
18026 /* No assumptions here. 8-byte relocs required. */
18027 type = DW_EH_PE_sdata8;
18028 break;
18029 }
18030 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
18031 }
18032
18033 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
18034
18035 static void
18036 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
18037 {
18038 if (TREE_CODE (decl) == FUNCTION_DECL)
18039 {
18040 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
18041 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
18042 {
18043 fprintf (stream, "\t.variant_pcs\t");
18044 assemble_name (stream, name);
18045 fprintf (stream, "\n");
18046 }
18047 }
18048 }
18049
18050 /* The last .arch and .tune assembly strings that we printed. */
18051 static std::string aarch64_last_printed_arch_string;
18052 static std::string aarch64_last_printed_tune_string;
18053
18054 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
18055 by the function fndecl. */
18056
18057 void
18058 aarch64_declare_function_name (FILE *stream, const char* name,
18059 tree fndecl)
18060 {
18061 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18062
18063 struct cl_target_option *targ_options;
18064 if (target_parts)
18065 targ_options = TREE_TARGET_OPTION (target_parts);
18066 else
18067 targ_options = TREE_TARGET_OPTION (target_option_current_node);
18068 gcc_assert (targ_options);
18069
18070 const struct processor *this_arch
18071 = aarch64_get_arch (targ_options->x_explicit_arch);
18072
18073 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
18074 std::string extension
18075 = aarch64_get_extension_string_for_isa_flags (isa_flags,
18076 this_arch->flags);
18077 /* Only update the assembler .arch string if it is distinct from the last
18078 such string we printed. */
18079 std::string to_print = this_arch->name + extension;
18080 if (to_print != aarch64_last_printed_arch_string)
18081 {
18082 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
18083 aarch64_last_printed_arch_string = to_print;
18084 }
18085
18086 /* Print the cpu name we're tuning for in the comments; it might be
18087 useful to readers of the generated asm. Do it only when it changes
18088 from function to function and verbose assembly is requested. */
18089 const struct processor *this_tune
18090 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
18091
18092 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
18093 {
18094 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
18095 this_tune->name);
18096 aarch64_last_printed_tune_string = this_tune->name;
18097 }
18098
18099 aarch64_asm_output_variant_pcs (stream, fndecl, name);
18100
18101 /* Don't forget the type directive for ELF. */
18102 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
18103 ASM_OUTPUT_LABEL (stream, name);
18104 }
18105
18106 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
18107
18108 void
18109 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
18110 {
18111 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
18112 const char *value = IDENTIFIER_POINTER (target);
18113 aarch64_asm_output_variant_pcs (stream, decl, name);
18114 ASM_OUTPUT_DEF (stream, name, value);
18115 }
18116
18117 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
18118 function symbol references. */
18119
18120 void
18121 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
18122 {
18123 default_elf_asm_output_external (stream, decl, name);
18124 aarch64_asm_output_variant_pcs (stream, decl, name);
18125 }
18126
18127 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
18128 Used to output the .cfi_b_key_frame directive when signing the current
18129 function with the B key. */
18130
18131 void
18132 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
18133 {
18134 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
18135 && aarch64_ra_sign_key == AARCH64_KEY_B)
18136 asm_fprintf (f, "\t.cfi_b_key_frame\n");
18137 }
18138
18139 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
18140
18141 static void
18142 aarch64_start_file (void)
18143 {
18144 struct cl_target_option *default_options
18145 = TREE_TARGET_OPTION (target_option_default_node);
18146
18147 const struct processor *default_arch
18148 = aarch64_get_arch (default_options->x_explicit_arch);
18149 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
18150 std::string extension
18151 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
18152 default_arch->flags);
18153
18154 aarch64_last_printed_arch_string = default_arch->name + extension;
18155 aarch64_last_printed_tune_string = "";
18156 asm_fprintf (asm_out_file, "\t.arch %s\n",
18157 aarch64_last_printed_arch_string.c_str ());
18158
18159 default_file_start ();
18160 }
18161
18162 /* Emit load exclusive. */
18163
18164 static void
18165 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
18166 rtx mem, rtx model_rtx)
18167 {
18168 if (mode == TImode)
18169 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
18170 gen_highpart (DImode, rval),
18171 mem, model_rtx));
18172 else
18173 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
18174 }
18175
18176 /* Emit store exclusive. */
18177
18178 static void
18179 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
18180 rtx mem, rtx rval, rtx model_rtx)
18181 {
18182 if (mode == TImode)
18183 emit_insn (gen_aarch64_store_exclusive_pair
18184 (bval, mem, operand_subword (rval, 0, 0, TImode),
18185 operand_subword (rval, 1, 0, TImode), model_rtx));
18186 else
18187 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
18188 }
18189
18190 /* Mark the previous jump instruction as unlikely. */
18191
18192 static void
18193 aarch64_emit_unlikely_jump (rtx insn)
18194 {
18195 rtx_insn *jump = emit_jump_insn (insn);
18196 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
18197 }
18198
18199 /* We store the names of the various atomic helpers in a 5x4 array.
18200 Return the libcall function given MODE, MODEL and NAMES. */
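/* For example, an SImode compare-and-swap with MEMMODEL_SEQ_CST resolves to
   the "__aarch64_cas4_acq_rel" helper.  */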
18201
18202 rtx
18203 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
18204 const atomic_ool_names *names)
18205 {
18206 memmodel model = memmodel_base (INTVAL (model_rtx));
18207 int mode_idx, model_idx;
18208
18209 switch (mode)
18210 {
18211 case E_QImode:
18212 mode_idx = 0;
18213 break;
18214 case E_HImode:
18215 mode_idx = 1;
18216 break;
18217 case E_SImode:
18218 mode_idx = 2;
18219 break;
18220 case E_DImode:
18221 mode_idx = 3;
18222 break;
18223 case E_TImode:
18224 mode_idx = 4;
18225 break;
18226 default:
18227 gcc_unreachable ();
18228 }
18229
18230 switch (model)
18231 {
18232 case MEMMODEL_RELAXED:
18233 model_idx = 0;
18234 break;
18235 case MEMMODEL_CONSUME:
18236 case MEMMODEL_ACQUIRE:
18237 model_idx = 1;
18238 break;
18239 case MEMMODEL_RELEASE:
18240 model_idx = 2;
18241 break;
18242 case MEMMODEL_ACQ_REL:
18243 case MEMMODEL_SEQ_CST:
18244 model_idx = 3;
18245 break;
18246 default:
18247 gcc_unreachable ();
18248 }
18249
18250 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
18251 VISIBILITY_HIDDEN);
18252 }
18253
18254 #define DEF0(B, N) \
18255 { "__aarch64_" #B #N "_relax", \
18256 "__aarch64_" #B #N "_acq", \
18257 "__aarch64_" #B #N "_rel", \
18258 "__aarch64_" #B #N "_acq_rel" }
18259
18260 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
18261 { NULL, NULL, NULL, NULL }
18262 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
18263
18264 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
18265 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
18266 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
18267 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
18268 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
18269 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
18270
18271 #undef DEF0
18272 #undef DEF4
18273 #undef DEF5
18274
18275 /* Expand a compare and swap pattern. */
18276
18277 void
18278 aarch64_expand_compare_and_swap (rtx operands[])
18279 {
18280 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
18281 machine_mode mode, r_mode;
18282
18283 bval = operands[0];
18284 rval = operands[1];
18285 mem = operands[2];
18286 oldval = operands[3];
18287 newval = operands[4];
18288 is_weak = operands[5];
18289 mod_s = operands[6];
18290 mod_f = operands[7];
18291 mode = GET_MODE (mem);
18292
18293 /* Normally the succ memory model must be stronger than fail, but in the
18294 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
18295 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
18296 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
18297 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
18298 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
18299
18300 r_mode = mode;
18301 if (mode == QImode || mode == HImode)
18302 {
18303 r_mode = SImode;
18304 rval = gen_reg_rtx (r_mode);
18305 }
18306
18307 if (TARGET_LSE)
18308 {
18309 /* The CAS insn requires oldval and rval overlap, but we need to
18310 have a copy of oldval saved across the operation to tell if
18311 the operation is successful. */
18312 if (reg_overlap_mentioned_p (rval, oldval))
18313 rval = copy_to_mode_reg (r_mode, oldval);
18314 else
18315 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
18316
18317 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
18318 newval, mod_s));
18319 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18320 }
18321 else if (TARGET_OUTLINE_ATOMICS)
18322 {
18323 /* Oldval must satisfy compare afterward. */
18324 if (!aarch64_plus_operand (oldval, mode))
18325 oldval = force_reg (mode, oldval);
18326 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
18327 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
18328 oldval, mode, newval, mode,
18329 XEXP (mem, 0), Pmode);
18330 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18331 }
18332 else
18333 {
18334 /* The oldval predicate varies by mode. Test it and force to reg. */
18335 insn_code code = code_for_aarch64_compare_and_swap (mode);
18336 if (!insn_data[code].operand[2].predicate (oldval, mode))
18337 oldval = force_reg (mode, oldval);
18338
18339 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
18340 is_weak, mod_s, mod_f));
18341 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
18342 }
18343
18344 if (r_mode != mode)
18345 rval = gen_lowpart (mode, rval);
18346 emit_move_insn (operands[1], rval);
18347
18348 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
18349 emit_insn (gen_rtx_SET (bval, x));
18350 }
18351
18352 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
18353 sequence implementing an atomic operation. */
18354
18355 static void
18356 aarch64_emit_post_barrier (enum memmodel model)
18357 {
18358 const enum memmodel base_model = memmodel_base (model);
18359
18360 if (is_mm_sync (model)
18361 && (base_model == MEMMODEL_ACQUIRE
18362 || base_model == MEMMODEL_ACQ_REL
18363 || base_model == MEMMODEL_SEQ_CST))
18364 {
18365 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
18366 }
18367 }
18368
18369 /* Split a compare and swap pattern. */
18370
18371 void
18372 aarch64_split_compare_and_swap (rtx operands[])
18373 {
18374 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
18375 machine_mode mode;
18376 bool is_weak;
18377 rtx_code_label *label1, *label2;
18378 enum memmodel model;
18379
18380 rval = operands[0];
18381 mem = operands[1];
18382 oldval = operands[2];
18383 newval = operands[3];
18384 is_weak = (operands[4] != const0_rtx);
18385 model_rtx = operands[5];
18386 scratch = operands[7];
18387 mode = GET_MODE (mem);
18388 model = memmodel_from_int (INTVAL (model_rtx));
18389
18390 /* When OLDVAL is zero and we want the strong version we can emit a tighter
18391 loop:
18392 .label1:
18393 LD[A]XR rval, [mem]
18394 CBNZ rval, .label2
18395 ST[L]XR scratch, newval, [mem]
18396 CBNZ scratch, .label1
18397 .label2:
18398 CMP rval, 0. */
18399 bool strong_zero_p = (!is_weak && !aarch64_track_speculation
18400 && oldval == const0_rtx && mode != TImode);
18401
18402 label1 = NULL;
18403 if (!is_weak)
18404 {
18405 label1 = gen_label_rtx ();
18406 emit_label (label1);
18407 }
18408 label2 = gen_label_rtx ();
18409
18410 /* The initial load can be relaxed for a __sync operation since a final
18411 barrier will be emitted to stop code hoisting. */
18412 if (is_mm_sync (model))
18413 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
18414 else
18415 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
18416
18417 if (strong_zero_p)
18418 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
18419 else
18420 {
18421 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
18422 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
18423 }
18424 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18425 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
18426 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18427
18428 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
18429
18430 if (!is_weak)
18431 {
18432 if (aarch64_track_speculation)
18433 {
18434 /* Emit an explicit compare instruction, so that we can correctly
18435 track the condition codes. */
18436 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18437 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18438 }
18439 else
18440 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
18441
18442 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18443 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
18444 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18445 }
18446 else
18447 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
18448
18449 emit_label (label2);
18450
18451 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
18452 to set the condition flags. If this is not used it will be removed by
18453 later passes. */
18454 if (strong_zero_p)
18455 aarch64_gen_compare_reg (NE, rval, const0_rtx);
18456
18457 /* Emit any final barrier needed for a __sync operation. */
18458 if (is_mm_sync (model))
18459 aarch64_emit_post_barrier (model);
18460 }
18461
18462 /* Split an atomic operation. */
18463
18464 void
18465 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
18466 rtx value, rtx model_rtx, rtx cond)
18467 {
18468 machine_mode mode = GET_MODE (mem);
18469 machine_mode wmode = (mode == DImode ? DImode : SImode);
18470 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
18471 const bool is_sync = is_mm_sync (model);
18472 rtx_code_label *label;
18473 rtx x;
18474
18475 /* Split the atomic operation into a sequence. */
18476 label = gen_label_rtx ();
18477 emit_label (label);
18478
18479 if (new_out)
18480 new_out = gen_lowpart (wmode, new_out);
18481 if (old_out)
18482 old_out = gen_lowpart (wmode, old_out);
18483 else
18484 old_out = new_out;
18485 value = simplify_gen_subreg (wmode, value, mode, 0);
18486
18487 /* The initial load can be relaxed for a __sync operation since a final
18488 barrier will be emitted to stop code hoisting. */
18489 if (is_sync)
18490 aarch64_emit_load_exclusive (mode, old_out, mem,
18491 GEN_INT (MEMMODEL_RELAXED));
18492 else
18493 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
18494
18495 switch (code)
18496 {
18497 case SET:
18498 new_out = value;
18499 break;
18500
18501 case NOT:
18502 x = gen_rtx_AND (wmode, old_out, value);
18503 emit_insn (gen_rtx_SET (new_out, x));
18504 x = gen_rtx_NOT (wmode, new_out);
18505 emit_insn (gen_rtx_SET (new_out, x));
18506 break;
18507
18508 case MINUS:
18509 if (CONST_INT_P (value))
18510 {
18511 value = GEN_INT (-INTVAL (value));
18512 code = PLUS;
18513 }
18514 /* Fall through. */
18515
18516 default:
18517 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
18518 emit_insn (gen_rtx_SET (new_out, x));
18519 break;
18520 }
18521
18522 aarch64_emit_store_exclusive (mode, cond, mem,
18523 gen_lowpart (mode, new_out), model_rtx);
18524
18525 if (aarch64_track_speculation)
18526 {
18527 /* Emit an explicit compare instruction, so that we can correctly
18528 track the condition codes. */
18529 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
18530 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
18531 }
18532 else
18533 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
18534
18535 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
18536 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
18537 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
18538
18539 /* Emit any final barrier needed for a __sync operation. */
18540 if (is_sync)
18541 aarch64_emit_post_barrier (model);
18542 }
18543
18544 static void
18545 aarch64_init_libfuncs (void)
18546 {
18547 /* Half-precision float operations. The compiler handles all operations
18548 with NULL libfuncs by converting to SFmode. */
18549
18550 /* Conversions. */
18551 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
18552 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
18553
18554 /* Arithmetic. */
18555 set_optab_libfunc (add_optab, HFmode, NULL);
18556 set_optab_libfunc (sdiv_optab, HFmode, NULL);
18557 set_optab_libfunc (smul_optab, HFmode, NULL);
18558 set_optab_libfunc (neg_optab, HFmode, NULL);
18559 set_optab_libfunc (sub_optab, HFmode, NULL);
18560
18561 /* Comparisons. */
18562 set_optab_libfunc (eq_optab, HFmode, NULL);
18563 set_optab_libfunc (ne_optab, HFmode, NULL);
18564 set_optab_libfunc (lt_optab, HFmode, NULL);
18565 set_optab_libfunc (le_optab, HFmode, NULL);
18566 set_optab_libfunc (ge_optab, HFmode, NULL);
18567 set_optab_libfunc (gt_optab, HFmode, NULL);
18568 set_optab_libfunc (unord_optab, HFmode, NULL);
18569 }
18570
18571 /* Target hook for c_mode_for_suffix. */
18572 static machine_mode
18573 aarch64_c_mode_for_suffix (char suffix)
18574 {
18575 if (suffix == 'q')
18576 return TFmode;
18577
18578 return VOIDmode;
18579 }
18580
18581 /* We can only represent floating point constants which will fit in
18582 "quarter-precision" values. These values are characterised by
18583 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
18584 by:
18585
18586 (-1)^s * (n/16) * 2^r
18587
18588 Where:
18589 's' is the sign bit.
18590 'n' is an integer in the range 16 <= n <= 31.
18591 'r' is an integer in the range -3 <= r <= 4. */
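/* For example, the smallest positive representable value is
   16/16 * 2^-3 = 0.125 and the largest is 31/16 * 2^4 = 31.0.  */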
18592
18593 /* Return true iff X can be represented by a quarter-precision
18594 floating point immediate operand. Note, we cannot represent 0.0. */
18595 bool
18596 aarch64_float_const_representable_p (rtx x)
18597 {
18598 /* This represents our current view of how many bits
18599 make up the mantissa. */
18600 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
18601 int exponent;
18602 unsigned HOST_WIDE_INT mantissa, mask;
18603 REAL_VALUE_TYPE r, m;
18604 bool fail;
18605
18606 x = unwrap_const_vec_duplicate (x);
18607 if (!CONST_DOUBLE_P (x))
18608 return false;
18609
18610 if (GET_MODE (x) == VOIDmode
18611 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
18612 return false;
18613
18614 r = *CONST_DOUBLE_REAL_VALUE (x);
18615
18616 /* We cannot represent infinities, NaNs or +/-zero. We won't
18617 know if we have +zero until we analyse the mantissa, but we
18618 can reject the other invalid values. */
18619 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
18620 || REAL_VALUE_MINUS_ZERO (r))
18621 return false;
18622
18623 /* Extract exponent. */
18624 r = real_value_abs (&r);
18625 exponent = REAL_EXP (&r);
18626
18627 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
18628 highest (sign) bit, with a fixed binary point at bit point_pos.
18629 m1 holds the low part of the mantissa, m2 the high part.
18630 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
18631 bits for the mantissa, this can fail (low bits will be lost). */
18632 real_ldexp (&m, &r, point_pos - exponent);
18633 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
18634
18635 /* If the low part of the mantissa has bits set we cannot represent
18636 the value. */
18637 if (w.ulow () != 0)
18638 return false;
18639 /* We have rejected the lower HOST_WIDE_INT, so update our
18640 understanding of how many bits lie in the mantissa and
18641 look only at the high HOST_WIDE_INT. */
18642 mantissa = w.elt (1);
18643 point_pos -= HOST_BITS_PER_WIDE_INT;
18644
18645 /* We can only represent values with a mantissa of the form 1.xxxx. */
18646 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
18647 if ((mantissa & mask) != 0)
18648 return false;
18649
18650 /* Having filtered unrepresentable values, we may now remove all
18651 but the highest 5 bits. */
18652 mantissa >>= point_pos - 5;
18653
18654 /* We cannot represent the value 0.0, so reject it. This is handled
18655 elsewhere. */
18656 if (mantissa == 0)
18657 return false;
18658
18659 /* Then, as bit 4 is always set, we can mask it off, leaving
18660 the mantissa in the range [0, 15]. */
18661 mantissa &= ~(1 << 4);
18662 gcc_assert (mantissa <= 15);
18663
18664 /* GCC internally does not use IEEE754-like encoding (where normalized
18665 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
18666 Our mantissa values are shifted 4 places to the left relative to
18667 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
18668 by 5 places to correct for GCC's representation. */
18669 exponent = 5 - exponent;
18670
18671 return (exponent >= 0 && exponent <= 7);
18672 }
18673
18674 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
18675 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
18676 output MOVI/MVNI, ORR or BIC immediate. */
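/* For example, a V8HImode vector with every lane equal to 0x1200 is output
   as "movi\t%0.8h, 0x12, lsl 8".  */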
18677 char*
18678 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
18679 enum simd_immediate_check which)
18680 {
18681 bool is_valid;
18682 static char templ[40];
18683 const char *mnemonic;
18684 const char *shift_op;
18685 unsigned int lane_count = 0;
18686 char element_char;
18687
18688 struct simd_immediate_info info;
18689
18690 /* This will return true to show const_vector is legal for use as either
18691 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
18692 It will also update INFO to show how the immediate should be generated.
18693 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
18694 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
18695 gcc_assert (is_valid);
18696
18697 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18698 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
18699
18700 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18701 {
18702 gcc_assert (info.insn == simd_immediate_info::MOV
18703 && info.u.mov.shift == 0);
18704 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
18705 move immediate path. */
18706 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18707 info.u.mov.value = GEN_INT (0);
18708 else
18709 {
18710 const unsigned int buf_size = 20;
18711 char float_buf[buf_size] = {'\0'};
18712 real_to_decimal_for_mode (float_buf,
18713 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18714 buf_size, buf_size, 1, info.elt_mode);
18715
18716 if (lane_count == 1)
18717 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
18718 else
18719 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
18720 lane_count, element_char, float_buf);
18721 return templ;
18722 }
18723 }
18724
18725 gcc_assert (CONST_INT_P (info.u.mov.value));
18726
18727 if (which == AARCH64_CHECK_MOV)
18728 {
18729 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
18730 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
18731 ? "msl" : "lsl");
18732 if (lane_count == 1)
18733 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
18734 mnemonic, UINTVAL (info.u.mov.value));
18735 else if (info.u.mov.shift)
18736 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18737 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
18738 element_char, UINTVAL (info.u.mov.value), shift_op,
18739 info.u.mov.shift);
18740 else
18741 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
18742 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
18743 element_char, UINTVAL (info.u.mov.value));
18744 }
18745 else
18746 {
18747 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
18748 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
18749 if (info.u.mov.shift)
18750 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18751 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
18752 element_char, UINTVAL (info.u.mov.value), "lsl",
18753 info.u.mov.shift);
18754 else
18755 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
18756 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
18757 element_char, UINTVAL (info.u.mov.value));
18758 }
18759 return templ;
18760 }
18761
18762 char*
18763 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
18764 {
18765
18766 /* If a floating point number was passed and we want to use it in an
18767 integer mode, do the conversion to integer. */
18768 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
18769 {
18770 unsigned HOST_WIDE_INT ival;
18771 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
18772 gcc_unreachable ();
18773 immediate = gen_int_mode (ival, mode);
18774 }
18775
18776 machine_mode vmode;
18777 /* Use a 64-bit mode for everything except DI/DF mode, where we use
18778 a 128-bit vector mode. */
18779 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
18780
18781 vmode = aarch64_simd_container_mode (mode, width);
18782 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
18783 return aarch64_output_simd_mov_immediate (v_op, width);
18784 }
18785
18786 /* Return the output string to use for moving immediate CONST_VECTOR
18787 into an SVE register. */
18788
18789 char *
18790 aarch64_output_sve_mov_immediate (rtx const_vector)
18791 {
18792 static char templ[40];
18793 struct simd_immediate_info info;
18794 char element_char;
18795
18796 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
18797 gcc_assert (is_valid);
18798
18799 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18800
18801 machine_mode vec_mode = GET_MODE (const_vector);
18802 if (aarch64_sve_pred_mode_p (vec_mode))
18803 {
18804 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
18805 if (info.insn == simd_immediate_info::MOV)
18806 {
18807 gcc_assert (info.u.mov.value == const0_rtx);
18808 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
18809 }
18810 else
18811 {
18812 gcc_assert (info.insn == simd_immediate_info::PTRUE);
18813 unsigned int total_bytes;
18814 if (info.u.pattern == AARCH64_SV_ALL
18815 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
18816 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
18817 total_bytes / GET_MODE_SIZE (info.elt_mode));
18818 else
18819 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
18820 svpattern_token (info.u.pattern));
18821 }
18822 return buf;
18823 }
18824
18825 if (info.insn == simd_immediate_info::INDEX)
18826 {
18827 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
18828 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
18829 element_char, INTVAL (info.u.index.base),
18830 INTVAL (info.u.index.step));
18831 return templ;
18832 }
18833
18834 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
18835 {
18836 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
18837 info.u.mov.value = GEN_INT (0);
18838 else
18839 {
18840 const int buf_size = 20;
18841 char float_buf[buf_size] = {};
18842 real_to_decimal_for_mode (float_buf,
18843 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
18844 buf_size, buf_size, 1, info.elt_mode);
18845
18846 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
18847 element_char, float_buf);
18848 return templ;
18849 }
18850 }
18851
18852 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
18853 element_char, INTVAL (info.u.mov.value));
18854 return templ;
18855 }
18856
18857 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
18858 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
18859 pattern. */
18860
18861 char *
18862 aarch64_output_sve_ptrues (rtx const_unspec)
18863 {
18864 static char templ[40];
18865
18866 struct simd_immediate_info info;
18867 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
18868 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
18869
18870 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
18871 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
18872 svpattern_token (info.u.pattern));
18873 return templ;
18874 }
18875
18876 /* Split operands into moves from op[1] + op[2] into op[0]. */
18877
18878 void
18879 aarch64_split_combinev16qi (rtx operands[3])
18880 {
18881 unsigned int dest = REGNO (operands[0]);
18882 unsigned int src1 = REGNO (operands[1]);
18883 unsigned int src2 = REGNO (operands[2]);
18884 machine_mode halfmode = GET_MODE (operands[1]);
18885 unsigned int halfregs = REG_NREGS (operands[1]);
18886 rtx destlo, desthi;
18887
18888 gcc_assert (halfmode == V16QImode);
18889
18890 if (src1 == dest && src2 == dest + halfregs)
18891 {
18892 /* No-op move. Can't split to nothing; emit something. */
18893 emit_note (NOTE_INSN_DELETED);
18894 return;
18895 }
18896
18897 /* Preserve register attributes for variable tracking. */
18898 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
18899 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
18900 GET_MODE_SIZE (halfmode));
18901
18902 /* Special case of reversed high/low parts. */
18903 if (reg_overlap_mentioned_p (operands[2], destlo)
18904 && reg_overlap_mentioned_p (operands[1], desthi))
18905 {
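/* Swap the two source registers in place using the classic three-XOR
   trick, so no scratch register is needed.  */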
18906 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18907 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
18908 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
18909 }
18910 else if (!reg_overlap_mentioned_p (operands[2], destlo))
18911 {
18912 /* Try to avoid unnecessary moves if part of the result
18913 is in the right place already. */
18914 if (src1 != dest)
18915 emit_move_insn (destlo, operands[1]);
18916 if (src2 != dest + halfregs)
18917 emit_move_insn (desthi, operands[2]);
18918 }
18919 else
18920 {
18921 if (src2 != dest + halfregs)
18922 emit_move_insn (desthi, operands[2]);
18923 if (src1 != dest)
18924 emit_move_insn (destlo, operands[1]);
18925 }
18926 }
18927
18928 /* vec_perm support. */
18929
18930 struct expand_vec_perm_d
18931 {
18932 rtx target, op0, op1;
18933 vec_perm_indices perm;
18934 machine_mode vmode;
18935 unsigned int vec_flags;
18936 bool one_vector_p;
18937 bool testing_p;
18938 };
18939
18940 /* Generate a variable permutation. */
18941
18942 static void
18943 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
18944 {
18945 machine_mode vmode = GET_MODE (target);
18946 bool one_vector_p = rtx_equal_p (op0, op1);
18947
18948 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
18949 gcc_checking_assert (GET_MODE (op0) == vmode);
18950 gcc_checking_assert (GET_MODE (op1) == vmode);
18951 gcc_checking_assert (GET_MODE (sel) == vmode);
18952 gcc_checking_assert (TARGET_SIMD);
18953
18954 if (one_vector_p)
18955 {
18956 if (vmode == V8QImode)
18957 {
18958 /* Expand the argument to a V16QI mode by duplicating it. */
18959 rtx pair = gen_reg_rtx (V16QImode);
18960 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
18961 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18962 }
18963 else
18964 {
18965 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
18966 }
18967 }
18968 else
18969 {
18970 rtx pair;
18971
18972 if (vmode == V8QImode)
18973 {
18974 pair = gen_reg_rtx (V16QImode);
18975 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
18976 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
18977 }
18978 else
18979 {
18980 pair = gen_reg_rtx (OImode);
18981 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
18982 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
18983 }
18984 }
18985 }
18986
18987 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
18988 NELT is the number of elements in the vector. */
18989
18990 void
18991 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
18992 unsigned int nelt)
18993 {
18994 machine_mode vmode = GET_MODE (target);
18995 bool one_vector_p = rtx_equal_p (op0, op1);
18996 rtx mask;
18997
18998 /* The TBL instruction does not use a modulo index, so we must take care
18999 of that ourselves. */
19000 mask = aarch64_simd_gen_const_vector_dup (vmode,
19001 one_vector_p ? nelt - 1 : 2 * nelt - 1);
19002 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
19003
19004 /* For big-endian, we also need to reverse the index within the vector
19005 (but not which vector). */
19006 if (BYTES_BIG_ENDIAN)
19007 {
19008 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
19009 if (!one_vector_p)
19010 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
19011 sel = expand_simple_binop (vmode, XOR, sel, mask,
19012 NULL, 0, OPTAB_LIB_WIDEN);
19013 }
19014 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
19015 }
19016
19017 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
19018
19019 static void
19020 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
19021 {
19022 emit_insn (gen_rtx_SET (target,
19023 gen_rtx_UNSPEC (GET_MODE (target),
19024 gen_rtvec (2, op0, op1), code)));
19025 }
19026
19027 /* Expand an SVE vec_perm with the given operands. */
19028
19029 void
19030 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
19031 {
19032 machine_mode data_mode = GET_MODE (target);
19033 machine_mode sel_mode = GET_MODE (sel);
19034 /* Enforced by the pattern condition. */
19035 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
19036
19037 /* Note: vec_perm indices are supposed to wrap when they go beyond the
19038 size of the two value vectors, i.e. the upper bits of the indices
19039 are effectively ignored. SVE TBL instead produces 0 for any
19040 out-of-range indices, so we need to modulo all the vec_perm indices
19041 to ensure they are all in range. */
19042 rtx sel_reg = force_reg (sel_mode, sel);
19043
19044 /* Check if the sel only references the first values vector. */
19045 if (GET_CODE (sel) == CONST_VECTOR
19046 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
19047 {
19048 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
19049 return;
19050 }
19051
19052 /* Check if the two values vectors are the same. */
19053 if (rtx_equal_p (op0, op1))
19054 {
19055 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
19056 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19057 NULL, 0, OPTAB_DIRECT);
19058 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
19059 return;
19060 }
19061
19062 /* Run TBL for each value vector and combine the results. */
19063
19064 rtx res0 = gen_reg_rtx (data_mode);
19065 rtx res1 = gen_reg_rtx (data_mode);
19066 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
19067 if (GET_CODE (sel) != CONST_VECTOR
19068 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
19069 {
19070 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
19071 2 * nunits - 1);
19072 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
19073 NULL, 0, OPTAB_DIRECT);
19074 }
19075 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
19076 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
19077 NULL, 0, OPTAB_DIRECT);
19078 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
19079 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
19080 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
19081 else
19082 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
19083 }
19084
19085 /* Recognize patterns suitable for the TRN instructions. */
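/* For example, on a 4-element vector TRN1 corresponds to the permutation
   { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */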
19086 static bool
19087 aarch64_evpc_trn (struct expand_vec_perm_d *d)
19088 {
19089 HOST_WIDE_INT odd;
19090 poly_uint64 nelt = d->perm.length ();
19091 rtx out, in0, in1, x;
19092 machine_mode vmode = d->vmode;
19093
19094 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19095 return false;
19096
19097 /* Note that these are little-endian tests.
19098 We correct for big-endian later. */
19099 if (!d->perm[0].is_constant (&odd)
19100 || (odd != 0 && odd != 1)
19101 || !d->perm.series_p (0, 2, odd, 2)
19102 || !d->perm.series_p (1, 2, nelt + odd, 2))
19103 return false;
19104
19105 /* Success! */
19106 if (d->testing_p)
19107 return true;
19108
19109 in0 = d->op0;
19110 in1 = d->op1;
19111 /* We don't need a big-endian lane correction for SVE; see the comment
19112 at the head of aarch64-sve.md for details. */
19113 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19114 {
19115 x = in0, in0 = in1, in1 = x;
19116 odd = !odd;
19117 }
19118 out = d->target;
19119
19120 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19121 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
19122 return true;
19123 }
19124
19125 /* Recognize patterns suitable for the UZP instructions. */
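/* For example, with two V4SI inputs { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, UZP1 produces { a0, a2, b0, b2 } (selector
   { 0, 2, 4, 6 }) and UZP2 produces { a1, a3, b1, b3 } (selector
   { 1, 3, 5, 7 }).  */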
19126 static bool
19127 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
19128 {
19129 HOST_WIDE_INT odd;
19130 rtx out, in0, in1, x;
19131 machine_mode vmode = d->vmode;
19132
19133 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19134 return false;
19135
19136 /* Note that these are little-endian tests.
19137 We correct for big-endian later. */
19138 if (!d->perm[0].is_constant (&odd)
19139 || (odd != 0 && odd != 1)
19140 || !d->perm.series_p (0, 1, odd, 2))
19141 return false;
19142
19143 /* Success! */
19144 if (d->testing_p)
19145 return true;
19146
19147 in0 = d->op0;
19148 in1 = d->op1;
19149 /* We don't need a big-endian lane correction for SVE; see the comment
19150 at the head of aarch64-sve.md for details. */
19151 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19152 {
19153 x = in0, in0 = in1, in1 = x;
19154 odd = !odd;
19155 }
19156 out = d->target;
19157
19158 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19159 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
19160 return true;
19161 }
19162
19163 /* Recognize patterns suitable for the ZIP instructions. */
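/* For example, with two V4SI inputs { a0, a1, a2, a3 } and
   { b0, b1, b2, b3 }, ZIP1 produces { a0, b0, a1, b1 } (selector
   { 0, 4, 1, 5 }) and ZIP2 produces { a2, b2, a3, b3 } (selector
   { 2, 6, 3, 7 }).  */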
19164 static bool
19165 aarch64_evpc_zip (struct expand_vec_perm_d *d)
19166 {
19167 unsigned int high;
19168 poly_uint64 nelt = d->perm.length ();
19169 rtx out, in0, in1, x;
19170 machine_mode vmode = d->vmode;
19171
19172 if (GET_MODE_UNIT_SIZE (vmode) > 8)
19173 return false;
19174
19175 /* Note that these are little-endian tests.
19176 We correct for big-endian later. */
19177 poly_uint64 first = d->perm[0];
19178 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
19179 || !d->perm.series_p (0, 2, first, 1)
19180 || !d->perm.series_p (1, 2, first + nelt, 1))
19181 return false;
19182 high = maybe_ne (first, 0U);
19183
19184 /* Success! */
19185 if (d->testing_p)
19186 return true;
19187
19188 in0 = d->op0;
19189 in1 = d->op1;
19190 /* We don't need a big-endian lane correction for SVE; see the comment
19191 at the head of aarch64-sve.md for details. */
19192 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
19193 {
19194 x = in0, in0 = in1, in1 = x;
19195 high = !high;
19196 }
19197 out = d->target;
19198
19199 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
19200 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
19201 return true;
19202 }
19203
19204 /* Recognize patterns for the EXT insn. */
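/* EXT concatenates the two input vectors and extracts a contiguous run
   of elements starting at a given position, so the selector must be
   { LOCATION, LOCATION + 1, ... }.  For example, with two V4SI inputs
   { a0, a1, a2, a3 } and { b0, b1, b2, b3 } and LOCATION == 3, the
   result is { a3, b0, b1, b2 }.  */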
19205
19206 static bool
19207 aarch64_evpc_ext (struct expand_vec_perm_d *d)
19208 {
19209 HOST_WIDE_INT location;
19210 rtx offset;
19211
19212 /* The first element always refers to the first vector.
19213 Check if the extracted indices are increasing by one. */
19214 if (d->vec_flags == VEC_SVE_PRED
19215 || !d->perm[0].is_constant (&location)
19216 || !d->perm.series_p (0, 1, location, 1))
19217 return false;
19218
19219 /* Success! */
19220 if (d->testing_p)
19221 return true;
19222
19223 /* The case where (location == 0) is a no-op for both big- and little-endian,
19224 and is removed by the mid-end at optimization levels -O1 and higher.
19225
19226 We don't need a big-endian lane correction for SVE; see the comment
19227 at the head of aarch64-sve.md for details. */
19228 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
19229 {
19230 /* After setup, we want the high elements of the first vector (stored
19231 at the LSB end of the register), and the low elements of the second
19232 vector (stored at the MSB end of the register). So swap. */
19233 std::swap (d->op0, d->op1);
19234 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
19235 to_constant () is safe since this is restricted to Advanced SIMD
19236 vectors. */
19237 location = d->perm.length ().to_constant () - location;
19238 }
19239
19240 offset = GEN_INT (location);
19241 emit_set_insn (d->target,
19242 gen_rtx_UNSPEC (d->vmode,
19243 gen_rtvec (3, d->op0, d->op1, offset),
19244 UNSPEC_EXT));
19245 return true;
19246 }
19247
19248 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
19249 within each 64-bit, 32-bit or 16-bit granule. */
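/* For example, REV32 on a V8HI vector { a0, a1, a2, a3, a4, a5, a6, a7 }
   swaps the 16-bit elements within each 32-bit word, giving
   { a1, a0, a3, a2, a5, a4, a7, a6 } (selector { 1, 0, 3, 2, 5, 4, 7, 6 }).  */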
19250
19251 static bool
19252 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
19253 {
19254 HOST_WIDE_INT diff;
19255 unsigned int i, size, unspec;
19256 machine_mode pred_mode;
19257
19258 if (d->vec_flags == VEC_SVE_PRED
19259 || !d->one_vector_p
19260 || !d->perm[0].is_constant (&diff))
19261 return false;
19262
19263 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
19264 if (size == 8)
19265 {
19266 unspec = UNSPEC_REV64;
19267 pred_mode = VNx2BImode;
19268 }
19269 else if (size == 4)
19270 {
19271 unspec = UNSPEC_REV32;
19272 pred_mode = VNx4BImode;
19273 }
19274 else if (size == 2)
19275 {
19276 unspec = UNSPEC_REV16;
19277 pred_mode = VNx8BImode;
19278 }
19279 else
19280 return false;
19281
19282 unsigned int step = diff + 1;
19283 for (i = 0; i < step; ++i)
19284 if (!d->perm.series_p (i, step, diff - i, step))
19285 return false;
19286
19287 /* Success! */
19288 if (d->testing_p)
19289 return true;
19290
19291 if (d->vec_flags == VEC_SVE_DATA)
19292 {
19293 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
19294 rtx target = gen_reg_rtx (int_mode);
19295 if (BYTES_BIG_ENDIAN)
19296 /* The act of taking a subreg between INT_MODE and d->vmode
19297 is itself a reversing operation on big-endian targets;
19298 see the comment at the head of aarch64-sve.md for details.
19299 First reinterpret OP0 as INT_MODE without using a subreg
19300 and without changing the contents. */
19301 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
19302 else
19303 {
19304 /* For SVE we use REV[BHW] unspecs derived from the element size
19305 of d->vmode and vector modes whose elements have SIZE bytes.
19306 This ensures that the vector modes match the predicate modes. */
19307 int unspec = aarch64_sve_rev_unspec (d->vmode);
19308 rtx pred = aarch64_ptrue_reg (pred_mode);
19309 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
19310 gen_lowpart (int_mode, d->op0)));
19311 }
19312 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
19313 return true;
19314 }
19315 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
19316 emit_set_insn (d->target, src);
19317 return true;
19318 }
19319
19320 /* Recognize patterns for the REV insn, which reverses elements within
19321 a full vector. */
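/* For example, on a vector of N elements the selector must be
   { N-1, N-2, ..., 1, 0 }.  This is only used for SVE, where the REV
   instruction reverses a full (variable-length) vector.  */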
19322
19323 static bool
19324 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
19325 {
19326 poly_uint64 nelt = d->perm.length ();
19327
19328 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
19329 return false;
19330
19331 if (!d->perm.series_p (0, 1, nelt - 1, -1))
19332 return false;
19333
19334 /* Success! */
19335 if (d->testing_p)
19336 return true;
19337
19338 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
19339 emit_set_insn (d->target, src);
19340 return true;
19341 }
19342
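/* Recognize broadcast patterns, in which every element of the result is
   a copy of a single element of the first input vector; these map to the
   DUP (element) instructions.  */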
19343 static bool
19344 aarch64_evpc_dup (struct expand_vec_perm_d *d)
19345 {
19346 rtx out = d->target;
19347 rtx in0;
19348 HOST_WIDE_INT elt;
19349 machine_mode vmode = d->vmode;
19350 rtx lane;
19351
19352 if (d->vec_flags == VEC_SVE_PRED
19353 || d->perm.encoding ().encoded_nelts () != 1
19354 || !d->perm[0].is_constant (&elt))
19355 return false;
19356
19357 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
19358 return false;
19359
19360 /* Success! */
19361 if (d->testing_p)
19362 return true;
19363
19364 /* The generic preparation in aarch64_expand_vec_perm_const_1
19365 swaps the operand order and the permute indices if it finds
19366 d->perm[0] to be in the second operand. Thus, we can always
19367 use d->op0 and need not do any extra arithmetic to get the
19368 correct lane number. */
19369 in0 = d->op0;
19370 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
19371
19372 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
19373 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
19374 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
19375 return true;
19376 }
19377
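/* Fall back to a general permute using a single Advanced SIMD TBL
   instruction, with a constant selector vector of byte indices into the
   concatenation of the two input vectors.  */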
19378 static bool
19379 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
19380 {
19381 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
19382 machine_mode vmode = d->vmode;
19383
19384 /* Make sure that the indices are constant. */
19385 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
19386 for (unsigned int i = 0; i < encoded_nelts; ++i)
19387 if (!d->perm[i].is_constant ())
19388 return false;
19389
19390 if (d->testing_p)
19391 return true;
19392
19393 /* Generic code will try constant permutation twice: once with the
19394 original mode and again with the elements lowered to QImode.
19395 So wait and don't do the selector expansion ourselves. */
19396 if (vmode != V8QImode && vmode != V16QImode)
19397 return false;
19398
19399 /* to_constant is safe since this routine is specific to Advanced SIMD
19400 vectors. */
19401 unsigned int nelt = d->perm.length ().to_constant ();
19402 for (unsigned int i = 0; i < nelt; ++i)
19403 /* If big-endian and two vectors we end up with a weird mixed-endian
19404 mode on NEON. Reverse the index within each word but not the word
19405 itself. to_constant is safe because we checked is_constant above. */
19406 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
19407 ? d->perm[i].to_constant () ^ (nelt - 1)
19408 : d->perm[i].to_constant ());
19409
19410 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
19411 sel = force_reg (vmode, sel);
19412
19413 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
19414 return true;
19415 }
19416
19417 /* Try to implement D using an SVE TBL instruction. */
19418
19419 static bool
19420 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
19421 {
19422 unsigned HOST_WIDE_INT nelt;
19423
19424 /* Permuting two variable-length vectors could overflow the
19425 index range. */
19426 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
19427 return false;
19428
19429 if (d->testing_p)
19430 return true;
19431
19432 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
19433 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
19434 if (d->one_vector_p)
19435 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
19436 else
19437 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
19438 return true;
19439 }
19440
19441 /* Try to implement D using the SVE SEL instruction. */
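/* This handles permutations in which every element keeps its own lane
   but can come from either input, e.g. selector { 0, 5, 2, 7 } for
   4-element vectors, which takes lanes 0 and 2 from the first input and
   lanes 1 and 3 from the second.  Such a permutation is a simple
   predicated select.  */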
19442
19443 static bool
19444 aarch64_evpc_sel (struct expand_vec_perm_d *d)
19445 {
19446 machine_mode vmode = d->vmode;
19447 int unit_size = GET_MODE_UNIT_SIZE (vmode);
19448
19449 if (d->vec_flags != VEC_SVE_DATA
19450 || unit_size > 8)
19451 return false;
19452
19453 int n_patterns = d->perm.encoding ().npatterns ();
19454 poly_int64 vec_len = d->perm.length ();
19455
19456 for (int i = 0; i < n_patterns; ++i)
19457 if (!known_eq (d->perm[i], i)
19458 && !known_eq (d->perm[i], vec_len + i))
19459 return false;
19460
19461 for (int i = n_patterns; i < n_patterns * 2; i++)
19462 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
19463 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
19464 return false;
19465
19466 if (d->testing_p)
19467 return true;
19468
19469 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
19470
19471 rtx_vector_builder builder (pred_mode, n_patterns, 2);
19472 for (int i = 0; i < n_patterns * 2; i++)
19473 {
19474 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
19475 : CONST0_RTX (BImode);
19476 builder.quick_push (elem);
19477 }
19478
19479 rtx const_vec = builder.build ();
19480 rtx pred = force_reg (pred_mode, const_vec);
19481 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op1, d->op0, pred));
19482 return true;
19483 }
19484
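/* Try to expand the constant permutation described by D, trying each of
   the special-case recognizers above before falling back to a general
   TBL sequence.  Return true on success.  */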
19485 static bool
19486 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
19487 {
19488 /* The pattern matching functions above are written to look for a small
19489 number to begin the sequence (0, 1, N/2). If we begin with an index
19490 from the second operand, we can swap the operands. */
19491 poly_int64 nelt = d->perm.length ();
19492 if (known_ge (d->perm[0], nelt))
19493 {
19494 d->perm.rotate_inputs (1);
19495 std::swap (d->op0, d->op1);
19496 }
19497
19498 if ((d->vec_flags == VEC_ADVSIMD
19499 || d->vec_flags == VEC_SVE_DATA
19500 || d->vec_flags == VEC_SVE_PRED)
19501 && known_gt (nelt, 1))
19502 {
19503 if (aarch64_evpc_rev_local (d))
19504 return true;
19505 else if (aarch64_evpc_rev_global (d))
19506 return true;
19507 else if (aarch64_evpc_ext (d))
19508 return true;
19509 else if (aarch64_evpc_dup (d))
19510 return true;
19511 else if (aarch64_evpc_zip (d))
19512 return true;
19513 else if (aarch64_evpc_uzp (d))
19514 return true;
19515 else if (aarch64_evpc_trn (d))
19516 return true;
19517 else if (aarch64_evpc_sel (d))
19518 return true;
19519 if (d->vec_flags == VEC_SVE_DATA)
19520 return aarch64_evpc_sve_tbl (d);
19521 else if (d->vec_flags == VEC_ADVSIMD)
19522 return aarch64_evpc_tbl (d);
19523 }
19524 return false;
19525 }
19526
19527 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
19528
19529 static bool
19530 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
19531 rtx op1, const vec_perm_indices &sel)
19532 {
19533 struct expand_vec_perm_d d;
19534
19535 /* Check whether the mask can be applied to a single vector. */
19536 if (sel.ninputs () == 1
19537 || (op0 && rtx_equal_p (op0, op1)))
19538 d.one_vector_p = true;
19539 else if (sel.all_from_input_p (0))
19540 {
19541 d.one_vector_p = true;
19542 op1 = op0;
19543 }
19544 else if (sel.all_from_input_p (1))
19545 {
19546 d.one_vector_p = true;
19547 op0 = op1;
19548 }
19549 else
19550 d.one_vector_p = false;
19551
19552 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
19553 sel.nelts_per_input ());
19554 d.vmode = vmode;
19555 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
19556 d.target = target;
19557 d.op0 = op0;
19558 d.op1 = op1;
19559 d.testing_p = !target;
19560
19561 if (!d.testing_p)
19562 return aarch64_expand_vec_perm_const_1 (&d);
19563
19564 rtx_insn *last = get_last_insn ();
19565 bool ret = aarch64_expand_vec_perm_const_1 (&d);
19566 gcc_assert (last == get_last_insn ());
19567
19568 return ret;
19569 }
19570
19571 /* Generate a byte permute mask for a register of mode MODE,
19572 which has NUNITS units. */
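/* For example, for V4SImode (NUNITS == 4, 4-byte units) the mask is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. the
   bytes within each element are reversed.  */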
19573
19574 rtx
19575 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
19576 {
19577 /* We have to reverse each vector because we don't have
19578 a permuted load that can reverse-load according to ABI rules. */
19579 rtx mask;
19580 rtvec v = rtvec_alloc (16);
19581 unsigned int i, j;
19582 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
19583
19584 gcc_assert (BYTES_BIG_ENDIAN);
19585 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
19586
19587 for (i = 0; i < nunits; i++)
19588 for (j = 0; j < usize; j++)
19589 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
19590 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
19591 return force_reg (V16QImode, mask);
19592 }
19593
19594 /* Expand an SVE integer comparison using the SVE equivalent of:
19595
19596 (set TARGET (CODE OP0 OP1)). */
19597
19598 void
19599 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
19600 {
19601 machine_mode pred_mode = GET_MODE (target);
19602 machine_mode data_mode = GET_MODE (op0);
19603 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
19604 op0, op1);
19605 if (!rtx_equal_p (target, res))
19606 emit_move_insn (target, res);
19607 }
19608
19609 /* Return the UNSPEC_COND_* code for comparison CODE. */
19610
19611 static unsigned int
19612 aarch64_unspec_cond_code (rtx_code code)
19613 {
19614 switch (code)
19615 {
19616 case NE:
19617 return UNSPEC_COND_FCMNE;
19618 case EQ:
19619 return UNSPEC_COND_FCMEQ;
19620 case LT:
19621 return UNSPEC_COND_FCMLT;
19622 case GT:
19623 return UNSPEC_COND_FCMGT;
19624 case LE:
19625 return UNSPEC_COND_FCMLE;
19626 case GE:
19627 return UNSPEC_COND_FCMGE;
19628 case UNORDERED:
19629 return UNSPEC_COND_FCMUO;
19630 default:
19631 gcc_unreachable ();
19632 }
19633 }
19634
19635 /* Emit:
19636
19637 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19638
19639 where <X> is the operation associated with comparison CODE.
19640 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19641
19642 static void
19643 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
19644 bool known_ptrue_p, rtx op0, rtx op1)
19645 {
19646 rtx flag = gen_int_mode (known_ptrue_p, SImode);
19647 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
19648 gen_rtvec (4, pred, flag, op0, op1),
19649 aarch64_unspec_cond_code (code));
19650 emit_set_insn (target, unspec);
19651 }
19652
19653 /* Emit the SVE equivalent of:
19654
19655 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
19656 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
19657 (set TARGET (ior:PRED_MODE TMP1 TMP2))
19658
19659 where <Xi> is the operation associated with comparison CODEi.
19660 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19661
19662 static void
19663 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
19664 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
19665 {
19666 machine_mode pred_mode = GET_MODE (pred);
19667 rtx tmp1 = gen_reg_rtx (pred_mode);
19668 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
19669 rtx tmp2 = gen_reg_rtx (pred_mode);
19670 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
19671 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
19672 }
19673
19674 /* Emit the SVE equivalent of:
19675
19676 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
19677 (set TARGET (not TMP))
19678
19679 where <X> is the operation associated with comparison CODE.
19680 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
19681
19682 static void
19683 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
19684 bool known_ptrue_p, rtx op0, rtx op1)
19685 {
19686 machine_mode pred_mode = GET_MODE (pred);
19687 rtx tmp = gen_reg_rtx (pred_mode);
19688 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
19689 aarch64_emit_unop (target, one_cmpl_optab, tmp);
19690 }
19691
19692 /* Expand an SVE floating-point comparison using the SVE equivalent of:
19693
19694 (set TARGET (CODE OP0 OP1))
19695
19696 If CAN_INVERT_P is true, the caller can also handle inverted results;
19697 return true if the result is in fact inverted. */
19698
19699 bool
19700 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
19701 rtx op0, rtx op1, bool can_invert_p)
19702 {
19703 machine_mode pred_mode = GET_MODE (target);
19704 machine_mode data_mode = GET_MODE (op0);
19705
19706 rtx ptrue = aarch64_ptrue_reg (pred_mode);
19707 switch (code)
19708 {
19709 case UNORDERED:
19710 /* UNORDERED has no immediate form. */
19711 op1 = force_reg (data_mode, op1);
19712 /* fall through */
19713 case LT:
19714 case LE:
19715 case GT:
19716 case GE:
19717 case EQ:
19718 case NE:
19719 {
19720 /* There is native support for the comparison. */
19721 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19722 return false;
19723 }
19724
19725 case LTGT:
19726 /* This is a trapping operation (LT or GT). */
19727 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
19728 return false;
19729
19730 case UNEQ:
19731 if (!flag_trapping_math)
19732 {
19733 /* This would trap for signaling NaNs. */
19734 op1 = force_reg (data_mode, op1);
19735 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
19736 ptrue, true, op0, op1);
19737 return false;
19738 }
19739 /* fall through */
19740 case UNLT:
19741 case UNLE:
19742 case UNGT:
19743 case UNGE:
19744 if (flag_trapping_math)
19745 {
19746 /* Work out which elements are ordered. */
19747 rtx ordered = gen_reg_rtx (pred_mode);
19748 op1 = force_reg (data_mode, op1);
19749 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
19750 ptrue, true, op0, op1);
19751
19752 /* Test the opposite condition for the ordered elements,
19753 then invert the result. */
19754 if (code == UNEQ)
19755 code = NE;
19756 else
19757 code = reverse_condition_maybe_unordered (code);
19758 if (can_invert_p)
19759 {
19760 aarch64_emit_sve_fp_cond (target, code,
19761 ordered, false, op0, op1);
19762 return true;
19763 }
19764 aarch64_emit_sve_invert_fp_cond (target, code,
19765 ordered, false, op0, op1);
19766 return false;
19767 }
19768 break;
19769
19770 case ORDERED:
19771 /* ORDERED has no immediate form. */
19772 op1 = force_reg (data_mode, op1);
19773 break;
19774
19775 default:
19776 gcc_unreachable ();
19777 }
19778
19779 /* There is native support for the inverse comparison. */
19780 code = reverse_condition_maybe_unordered (code);
19781 if (can_invert_p)
19782 {
19783 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
19784 return true;
19785 }
19786 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
19787 return false;
19788 }
19789
19790 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
19791 of the data being selected and CMP_MODE is the mode of the values being
19792 compared. */
19793
19794 void
19795 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
19796 rtx *ops)
19797 {
19798 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
19799 rtx pred = gen_reg_rtx (pred_mode);
19800 if (FLOAT_MODE_P (cmp_mode))
19801 {
19802 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
19803 ops[4], ops[5], true))
19804 std::swap (ops[1], ops[2]);
19805 }
19806 else
19807 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
19808
19809 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
19810 ops[1] = force_reg (data_mode, ops[1]);
19811 /* The "false" value can only be zero if the "true" value is a constant. */
19812 if (register_operand (ops[1], data_mode)
19813 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
19814 ops[2] = force_reg (data_mode, ops[2]);
19815
19816 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
19817 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
19818 }
19819
19820 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
19821 true. However, due to issues with register allocation it is preferable
19822 to avoid tying integer scalar and FP scalar modes. Executing integer
19823 operations in general registers is better than treating them as scalar
19824 vector operations. This reduces latency and avoids redundant int<->FP
19825 moves. So tie modes if they are either in the same class, or are vector
19826 modes paired with other vector modes, vector structs or any scalar mode. */
19827
19828 static bool
19829 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
19830 {
19831 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
19832 return true;
19833
19834 /* We specifically want to allow elements of "structure" modes to
19835 be tieable to the structure. This more general condition allows
19836 other rarer situations too. The reason we don't extend this to
19837 predicate modes is that there are no predicate structure modes
19838 nor any specific instructions for extracting part of a predicate
19839 register. */
19840 if (aarch64_vector_data_mode_p (mode1)
19841 && aarch64_vector_data_mode_p (mode2))
19842 return true;
19843
19844 /* Also allow any scalar modes with vectors. */
19845 if (aarch64_vector_mode_supported_p (mode1)
19846 || aarch64_vector_mode_supported_p (mode2))
19847 return true;
19848
19849 return false;
19850 }
19851
19852 /* Return a new RTX holding the result of moving POINTER forward by
19853 AMOUNT bytes. */
19854
19855 static rtx
19856 aarch64_move_pointer (rtx pointer, poly_int64 amount)
19857 {
19858 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
19859
19860 return adjust_automodify_address (pointer, GET_MODE (pointer),
19861 next, amount);
19862 }
19863
19864 /* Return a new RTX holding the result of moving POINTER forward by the
19865 size of the mode it points to. */
19866
19867 static rtx
19868 aarch64_progress_pointer (rtx pointer)
19869 {
19870 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
19871 }
19872
19873 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
19874 MODE bytes. */
19875
19876 static void
19877 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
19878 machine_mode mode)
19879 {
19880 rtx reg = gen_reg_rtx (mode);
19881
19882 /* "Cast" the pointers to the correct mode. */
19883 *src = adjust_address (*src, mode, 0);
19884 *dst = adjust_address (*dst, mode, 0);
19885 /* Emit the memcpy. */
19886 emit_move_insn (reg, *src);
19887 emit_move_insn (*dst, reg);
19888 /* Move the pointers forward. */
19889 *src = aarch64_progress_pointer (*src);
19890 *dst = aarch64_progress_pointer (*dst);
19891 }
19892
19893 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
19894 we succeed, otherwise return false. */
19895
19896 bool
19897 aarch64_expand_cpymem (rtx *operands)
19898 {
19899 int n, mode_bits;
19900 rtx dst = operands[0];
19901 rtx src = operands[1];
19902 rtx base;
19903 machine_mode cur_mode = BLKmode, next_mode;
19904 bool speed_p = !optimize_function_for_size_p (cfun);
19905
19906 /* When optimizing for size, give a better estimate of the length of a
19907 memcpy call, but use the default otherwise. Moves larger than 8 bytes
19908 will always require an even number of instructions to perform. And each
19909 operation requires both a load and a store, so divide the max number by 2. */
19910 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
19911
19912 /* We can't do anything smart if the amount to copy is not constant. */
19913 if (!CONST_INT_P (operands[2]))
19914 return false;
19915
19916 n = INTVAL (operands[2]);
19917
19918 /* Try to keep the number of instructions low. For all cases we will do at
19919 most two moves for the residual amount, since we'll always overlap the
19920 remainder. */
19921 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
19922 return false;
19923
19924 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
19925 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
19926
19927 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
19928 src = adjust_automodify_address (src, VOIDmode, base, 0);
19929
19930 /* Convert n to bits to make the rest of the code simpler. */
19931 n = n * BITS_PER_UNIT;
19932
19933 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
19934 larger than TImode, but we should not use them for loads/stores here. */
19935 const int copy_limit = GET_MODE_BITSIZE (TImode);
19936
19937 while (n > 0)
19938 {
19939 /* Find the largest mode in which to do the copy without over-reading
19940 or over-writing. */
19941 opt_scalar_int_mode mode_iter;
19942 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
19943 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
19944 cur_mode = mode_iter.require ();
19945
19946 gcc_assert (cur_mode != BLKmode);
19947
19948 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
19949 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
19950
19951 n -= mode_bits;
19952
19953 /* Do certain trailing copies as overlapping if it's going to be
19954 cheaper, i.e. fewer instructions to do so. For instance, for a 15
19955 byte copy it is more efficient to do two overlapping 8 byte copies than
19956 copies of 8 + 6 + 1 bytes. */
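/* Concretely, for a 15-byte copy the loop first emits an 8-byte (DImode)
   copy, leaving 7 bytes.  The remainder is then rounded back up to DImode
   and the pointers are moved back by one byte, so the second 8-byte copy
   overlaps the first by one byte.  */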
19957 if (n > 0 && n <= 8 * BITS_PER_UNIT)
19958 {
19959 next_mode = smallest_mode_for_size (n, MODE_INT);
19960 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
19961 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
19962 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
19963 n = n_bits;
19964 }
19965 }
19966
19967 return true;
19968 }
19969
19970 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
19971 SImode stores. Handle the case when the constant has identical
19972 bottom and top halves. This is beneficial when the two stores can be
19973 merged into an STP and we avoid synthesising potentially expensive
19974 immediates twice. Return true if such a split is possible. */
19975
19976 bool
19977 aarch64_split_dimode_const_store (rtx dst, rtx src)
19978 {
19979 rtx lo = gen_lowpart (SImode, src);
19980 rtx hi = gen_highpart_mode (SImode, DImode, src);
19981
19982 bool size_p = optimize_function_for_size_p (cfun);
19983
19984 if (!rtx_equal_p (lo, hi))
19985 return false;
19986
19987 unsigned int orig_cost
19988 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
19989 unsigned int lo_cost
19990 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
19991
19992 /* We want to transform:
19993 MOV x1, 49370
19994 MOVK x1, 0x140, lsl 16
19995 MOVK x1, 0xc0da, lsl 32
19996 MOVK x1, 0x140, lsl 48
19997 STR x1, [x0]
19998 into:
19999 MOV w1, 49370
20000 MOVK w1, 0x140, lsl 16
20001 STP w1, w1, [x0]
20002 So we want to perform this only when we save two instructions
20003 or more. When optimizing for size, however, accept any code size
20004 savings we can. */
20005 if (size_p && orig_cost <= lo_cost)
20006 return false;
20007
20008 if (!size_p
20009 && (orig_cost <= lo_cost + 1))
20010 return false;
20011
20012 rtx mem_lo = adjust_address (dst, SImode, 0);
20013 if (!aarch64_mem_pair_operand (mem_lo, SImode))
20014 return false;
20015
20016 rtx tmp_reg = gen_reg_rtx (SImode);
20017 aarch64_expand_mov_immediate (tmp_reg, lo);
20018 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
20019 /* Don't emit an explicit store pair as this may not always be profitable.
20020 Let the sched-fusion logic decide whether to merge them. */
20021 emit_move_insn (mem_lo, tmp_reg);
20022 emit_move_insn (mem_hi, tmp_reg);
20023
20024 return true;
20025 }
20026
20027 /* Generate RTL for a conditional branch with rtx comparison CODE in
20028 mode CC_MODE. The destination of the unlikely conditional branch
20029 is LABEL_REF. */
20030
20031 void
20032 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
20033 rtx label_ref)
20034 {
20035 rtx x;
20036 x = gen_rtx_fmt_ee (code, VOIDmode,
20037 gen_rtx_REG (cc_mode, CC_REGNUM),
20038 const0_rtx);
20039
20040 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
20041 gen_rtx_LABEL_REF (VOIDmode, label_ref),
20042 pc_rtx);
20043 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
20044 }
20045
20046 /* Generate DImode scratch registers for 128-bit (TImode) addition.
20047
20048 OP1 represents the TImode destination operand 1
20049 OP2 represents the TImode destination operand 2
20050 LOW_DEST represents the low half (DImode) of TImode operand 0
20051 LOW_IN1 represents the low half (DImode) of TImode operand 1
20052 LOW_IN2 represents the low half (DImode) of TImode operand 2
20053 HIGH_DEST represents the high half (DImode) of TImode operand 0
20054 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20055 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20056
20057 void
20058 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20059 rtx *low_in1, rtx *low_in2,
20060 rtx *high_dest, rtx *high_in1,
20061 rtx *high_in2)
20062 {
20063 *low_dest = gen_reg_rtx (DImode);
20064 *low_in1 = gen_lowpart (DImode, op1);
20065 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20066 subreg_lowpart_offset (DImode, TImode));
20067 *high_dest = gen_reg_rtx (DImode);
20068 *high_in1 = gen_highpart (DImode, op1);
20069 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20070 subreg_highpart_offset (DImode, TImode));
20071 }
20072
20073 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
20074
20075 This function differs from 'aarch64_addti_scratch_regs' in that
20076 OP1 can be an immediate constant (zero). We must call
20077 subreg_highpart_offset with DImode and TImode arguments, otherwise
20078 VOIDmode will be used for the const_int, which generates an internal
20079 error from subreg_size_highpart_offset, which does not expect a size of zero.
20080
20081 OP1 represents the TImode destination operand 1
20082 OP2 represents the TImode destination operand 2
20083 LOW_DEST represents the low half (DImode) of TImode operand 0
20084 LOW_IN1 represents the low half (DImode) of TImode operand 1
20085 LOW_IN2 represents the low half (DImode) of TImode operand 2
20086 HIGH_DEST represents the high half (DImode) of TImode operand 0
20087 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20088 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
20089
20090
20091 void
20092 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
20093 rtx *low_in1, rtx *low_in2,
20094 rtx *high_dest, rtx *high_in1,
20095 rtx *high_in2)
20096 {
20097 *low_dest = gen_reg_rtx (DImode);
20098 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
20099 subreg_lowpart_offset (DImode, TImode));
20100
20101 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
20102 subreg_lowpart_offset (DImode, TImode));
20103 *high_dest = gen_reg_rtx (DImode);
20104
20105 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
20106 subreg_highpart_offset (DImode, TImode));
20107 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
20108 subreg_highpart_offset (DImode, TImode));
20109 }
20110
20111 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
20112
20113 OP0 represents the TImode destination operand 0
20114 LOW_DEST represents the low half (DImode) of TImode operand 0
20115 LOW_IN1 represents the low half (DImode) of TImode operand 1
20116 LOW_IN2 represents the low half (DImode) of TImode operand 2
20117 HIGH_DEST represents the high half (DImode) of TImode operand 0
20118 HIGH_IN1 represents the high half (DImode) of TImode operand 1
20119 HIGH_IN2 represents the high half (DImode) of TImode operand 2
20120 UNSIGNED_P is true if the operation is being performed on unsigned
20121 values. */
20122 void
20123 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
20124 rtx low_in2, rtx high_dest, rtx high_in1,
20125 rtx high_in2, bool unsigned_p)
20126 {
20127 if (low_in2 == const0_rtx)
20128 {
20129 low_dest = low_in1;
20130 high_in2 = force_reg (DImode, high_in2);
20131 if (unsigned_p)
20132 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
20133 else
20134 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
20135 }
20136 else
20137 {
20138 if (CONST_INT_P (low_in2))
20139 {
20140 high_in2 = force_reg (DImode, high_in2);
20141 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
20142 GEN_INT (-INTVAL (low_in2))));
20143 }
20144 else
20145 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
20146
20147 if (unsigned_p)
20148 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
20149 else
20150 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
20151 }
20152
20153 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
20154 emit_move_insn (gen_highpart (DImode, op0), high_dest);
20155
20156 }
20157
20158 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
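/* AddressSanitizer computes the shadow address of a byte as
   (address >> 3) plus this offset, so the value returned here determines
   where the shadow memory region is placed for each ABI.  */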
20159
20160 static unsigned HOST_WIDE_INT
20161 aarch64_asan_shadow_offset (void)
20162 {
20163 if (TARGET_ILP32)
20164 return (HOST_WIDE_INT_1 << 29);
20165 else
20166 return (HOST_WIDE_INT_1 << 36);
20167 }
20168
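/* Implement TARGET_GEN_CCMP_FIRST.  Emit the first compare of a
   conditional-compare (CCMP) sequence for comparing TREEOP0 and TREEOP1
   with code CODE.  On success, store the preparation and compare
   sequences in *PREP_SEQ and *GEN_SEQ and return a comparison against
   the CC register; return NULL_RTX if the comparison cannot be
   handled.  */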
20169 static rtx
20170 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
20171 int code, tree treeop0, tree treeop1)
20172 {
20173 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20174 rtx op0, op1;
20175 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20176 insn_code icode;
20177 struct expand_operand ops[4];
20178
20179 start_sequence ();
20180 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20181
20182 op_mode = GET_MODE (op0);
20183 if (op_mode == VOIDmode)
20184 op_mode = GET_MODE (op1);
20185
20186 switch (op_mode)
20187 {
20188 case E_QImode:
20189 case E_HImode:
20190 case E_SImode:
20191 cmp_mode = SImode;
20192 icode = CODE_FOR_cmpsi;
20193 break;
20194
20195 case E_DImode:
20196 cmp_mode = DImode;
20197 icode = CODE_FOR_cmpdi;
20198 break;
20199
20200 case E_SFmode:
20201 cmp_mode = SFmode;
20202 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20203 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
20204 break;
20205
20206 case E_DFmode:
20207 cmp_mode = DFmode;
20208 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
20209 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
20210 break;
20211
20212 default:
20213 end_sequence ();
20214 return NULL_RTX;
20215 }
20216
20217 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
20218 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
20219 if (!op0 || !op1)
20220 {
20221 end_sequence ();
20222 return NULL_RTX;
20223 }
20224 *prep_seq = get_insns ();
20225 end_sequence ();
20226
20227 create_fixed_operand (&ops[0], op0);
20228 create_fixed_operand (&ops[1], op1);
20229
20230 start_sequence ();
20231 if (!maybe_expand_insn (icode, 2, ops))
20232 {
20233 end_sequence ();
20234 return NULL_RTX;
20235 }
20236 *gen_seq = get_insns ();
20237 end_sequence ();
20238
20239 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
20240 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
20241 }
20242
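/* Implement TARGET_GEN_CCMP_NEXT.  Emit a conditional compare that
   chains onto PREV, the comparison produced by an earlier
   TARGET_GEN_CCMP_FIRST/NEXT call, combining it with the comparison of
   TREEOP0 and TREEOP1 under BIT_CODE (AND or IOR).  Return the new
   comparison against the CC register, or NULL_RTX on failure.  */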
20243 static rtx
20244 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
20245 int cmp_code, tree treeop0, tree treeop1, int bit_code)
20246 {
20247 rtx op0, op1, target;
20248 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
20249 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
20250 insn_code icode;
20251 struct expand_operand ops[6];
20252 int aarch64_cond;
20253
20254 push_to_sequence (*prep_seq);
20255 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
20256
20257 op_mode = GET_MODE (op0);
20258 if (op_mode == VOIDmode)
20259 op_mode = GET_MODE (op1);
20260
20261 switch (op_mode)
20262 {
20263 case E_QImode:
20264 case E_HImode:
20265 case E_SImode:
20266 cmp_mode = SImode;
20267 icode = CODE_FOR_ccmpsi;
20268 break;
20269
20270 case E_DImode:
20271 cmp_mode = DImode;
20272 icode = CODE_FOR_ccmpdi;
20273 break;
20274
20275 case E_SFmode:
20276 cmp_mode = SFmode;
20277 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20278 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
20279 break;
20280
20281 case E_DFmode:
20282 cmp_mode = DFmode;
20283 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
20284 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
20285 break;
20286
20287 default:
20288 end_sequence ();
20289 return NULL_RTX;
20290 }
20291
20292 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
20293 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
20294 if (!op0 || !op1)
20295 {
20296 end_sequence ();
20297 return NULL_RTX;
20298 }
20299 *prep_seq = get_insns ();
20300 end_sequence ();
20301
20302 target = gen_rtx_REG (cc_mode, CC_REGNUM);
20303 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
20304
20305 if (bit_code != AND)
20306 {
20307 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
20308 GET_MODE (XEXP (prev, 0))),
20309 VOIDmode, XEXP (prev, 0), const0_rtx);
20310 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
20311 }
20312
20313 create_fixed_operand (&ops[0], XEXP (prev, 0));
20314 create_fixed_operand (&ops[1], target);
20315 create_fixed_operand (&ops[2], op0);
20316 create_fixed_operand (&ops[3], op1);
20317 create_fixed_operand (&ops[4], prev);
20318 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
20319
20320 push_to_sequence (*gen_seq);
20321 if (!maybe_expand_insn (icode, 6, ops))
20322 {
20323 end_sequence ();
20324 return NULL_RTX;
20325 }
20326
20327 *gen_seq = get_insns ();
20328 end_sequence ();
20329
20330 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
20331 }
20332
20333 #undef TARGET_GEN_CCMP_FIRST
20334 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
20335
20336 #undef TARGET_GEN_CCMP_NEXT
20337 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
20338
20339 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
20340 instruction fusion of some sort. */
20341
20342 static bool
20343 aarch64_macro_fusion_p (void)
20344 {
20345 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
20346 }
20347
20348
20349 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
20350 should be kept together during scheduling. */
20351
20352 static bool
20353 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
20354 {
20355 rtx set_dest;
20356 rtx prev_set = single_set (prev);
20357 rtx curr_set = single_set (curr);
20358 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
20359 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
20360
20361 if (!aarch64_macro_fusion_p ())
20362 return false;
20363
20364 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
20365 {
20366 /* We are trying to match:
20367 prev (mov) == (set (reg r0) (const_int imm16))
20368 curr (movk) == (set (zero_extract (reg r0)
20369 (const_int 16)
20370 (const_int 16))
20371 (const_int imm16_1)) */
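/* In assembly terms this is a pair such as:
   mov  x0, #0x1234
   movk x0, #0x5678, lsl 16  */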
20372
20373 set_dest = SET_DEST (curr_set);
20374
20375 if (GET_CODE (set_dest) == ZERO_EXTRACT
20376 && CONST_INT_P (SET_SRC (curr_set))
20377 && CONST_INT_P (SET_SRC (prev_set))
20378 && CONST_INT_P (XEXP (set_dest, 2))
20379 && INTVAL (XEXP (set_dest, 2)) == 16
20380 && REG_P (XEXP (set_dest, 0))
20381 && REG_P (SET_DEST (prev_set))
20382 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
20383 {
20384 return true;
20385 }
20386 }
20387
20388 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
20389 {
20390
20391 /* We're trying to match:
20392 prev (adrp) == (set (reg r1)
20393 (high (symbol_ref ("SYM"))))
20394 curr (add) == (set (reg r0)
20395 (lo_sum (reg r1)
20396 (symbol_ref ("SYM"))))
20397 Note that r0 need not necessarily be the same as r1, especially
20398 during pre-regalloc scheduling. */
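/* In assembly terms this is a pair such as:
   adrp x1, sym
   add  x0, x1, :lo12:sym  */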
20399
20400 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20401 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20402 {
20403 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
20404 && REG_P (XEXP (SET_SRC (curr_set), 0))
20405 && REGNO (XEXP (SET_SRC (curr_set), 0))
20406 == REGNO (SET_DEST (prev_set))
20407 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
20408 XEXP (SET_SRC (curr_set), 1)))
20409 return true;
20410 }
20411 }
20412
20413 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
20414 {
20415
20416 /* We're trying to match:
20417 prev (movk) == (set (zero_extract (reg r0)
20418 (const_int 16)
20419 (const_int 32))
20420 (const_int imm16_1))
20421 curr (movk) == (set (zero_extract (reg r0)
20422 (const_int 16)
20423 (const_int 48))
20424 (const_int imm16_2)) */
20425
20426 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
20427 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
20428 && REG_P (XEXP (SET_DEST (prev_set), 0))
20429 && REG_P (XEXP (SET_DEST (curr_set), 0))
20430 && REGNO (XEXP (SET_DEST (prev_set), 0))
20431 == REGNO (XEXP (SET_DEST (curr_set), 0))
20432 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
20433 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
20434 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
20435 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
20436 && CONST_INT_P (SET_SRC (prev_set))
20437 && CONST_INT_P (SET_SRC (curr_set)))
20438 return true;
20439
20440 }
20441 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
20442 {
20443 /* We're trying to match:
20444 prev (adrp) == (set (reg r0)
20445 (high (symbol_ref ("SYM"))))
20446 curr (ldr) == (set (reg r1)
20447 (mem (lo_sum (reg r0)
20448 (symbol_ref ("SYM")))))
20449 or
20450 curr (ldr) == (set (reg r1)
20451 (zero_extend (mem
20452 (lo_sum (reg r0)
20453 (symbol_ref ("SYM")))))) */
20454 if (satisfies_constraint_Ush (SET_SRC (prev_set))
20455 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
20456 {
20457 rtx curr_src = SET_SRC (curr_set);
20458
20459 if (GET_CODE (curr_src) == ZERO_EXTEND)
20460 curr_src = XEXP (curr_src, 0);
20461
20462 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
20463 && REG_P (XEXP (XEXP (curr_src, 0), 0))
20464 && REGNO (XEXP (XEXP (curr_src, 0), 0))
20465 == REGNO (SET_DEST (prev_set))
20466 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
20467 XEXP (SET_SRC (prev_set), 0)))
20468 return true;
20469 }
20470 }
20471
20472 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
20473 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
20474 && prev_set && curr_set && any_condjump_p (curr)
20475 && GET_CODE (SET_SRC (prev_set)) == COMPARE
20476 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
20477 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
20478 return true;
20479
20480 /* Fuse flag-setting ALU instructions and conditional branch. */
20481 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
20482 && any_condjump_p (curr))
20483 {
20484 unsigned int condreg1, condreg2;
20485 rtx cc_reg_1;
20486 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
20487 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
20488
20489 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
20490 && prev
20491 && modified_in_p (cc_reg_1, prev))
20492 {
20493 enum attr_type prev_type = get_attr_type (prev);
20494
20495 /* FIXME: this misses some instructions that ThunderX considers to be
20496 simple arithmetic instructions. Simple shifts are missed here. */
20497 if (prev_type == TYPE_ALUS_SREG
20498 || prev_type == TYPE_ALUS_IMM
20499 || prev_type == TYPE_LOGICS_REG
20500 || prev_type == TYPE_LOGICS_IMM)
20501 return true;
20502 }
20503 }
20504
20505 /* Fuse ALU instructions and CBZ/CBNZ. */
20506 if (prev_set
20507 && curr_set
20508 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
20509 && any_condjump_p (curr))
20510 {
20511 /* We're trying to match:
20512 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
20513 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
20514 (const_int 0))
20515 (label_ref ("SYM"))
20516 (pc)) */
20517 if (SET_DEST (curr_set) == (pc_rtx)
20518 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
20519 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
20520 && REG_P (SET_DEST (prev_set))
20521 && REGNO (SET_DEST (prev_set))
20522 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
20523 {
20524 /* Fuse ALU operations followed by conditional branch instruction. */
20525 switch (get_attr_type (prev))
20526 {
20527 case TYPE_ALU_IMM:
20528 case TYPE_ALU_SREG:
20529 case TYPE_ADC_REG:
20530 case TYPE_ADC_IMM:
20531 case TYPE_ADCS_REG:
20532 case TYPE_ADCS_IMM:
20533 case TYPE_LOGIC_REG:
20534 case TYPE_LOGIC_IMM:
20535 case TYPE_CSEL:
20536 case TYPE_ADR:
20537 case TYPE_MOV_IMM:
20538 case TYPE_SHIFT_REG:
20539 case TYPE_SHIFT_IMM:
20540 case TYPE_BFM:
20541 case TYPE_RBIT:
20542 case TYPE_REV:
20543 case TYPE_EXTEND:
20544 return true;
20545
20546 default:;
20547 }
20548 }
20549 }
20550
20551 return false;
20552 }
20553
20554 /* Return true iff the instruction fusion described by OP is enabled. */
20555
20556 bool
20557 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
20558 {
20559 return (aarch64_tune_params.fusible_ops & op) != 0;
20560 }
20561
20562 /* If MEM is in the form of [base+offset], extract the two parts
20563 of the address and store them in BASE and OFFSET; otherwise return false
20564 after clearing BASE and OFFSET. */
20565
20566 bool
20567 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
20568 {
20569 rtx addr;
20570
20571 gcc_assert (MEM_P (mem));
20572
20573 addr = XEXP (mem, 0);
20574
20575 if (REG_P (addr))
20576 {
20577 *base = addr;
20578 *offset = const0_rtx;
20579 return true;
20580 }
20581
20582 if (GET_CODE (addr) == PLUS
20583 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
20584 {
20585 *base = XEXP (addr, 0);
20586 *offset = XEXP (addr, 1);
20587 return true;
20588 }
20589
20590 *base = NULL_RTX;
20591 *offset = NULL_RTX;
20592
20593 return false;
20594 }
20595
20596 /* Types for scheduling fusion. */
20597 enum sched_fusion_type
20598 {
20599 SCHED_FUSION_NONE = 0,
20600 SCHED_FUSION_LD_SIGN_EXTEND,
20601 SCHED_FUSION_LD_ZERO_EXTEND,
20602 SCHED_FUSION_LD,
20603 SCHED_FUSION_ST,
20604 SCHED_FUSION_NUM
20605 };
20606
20607 /* If INSN is a load or store of an address in the form of [base+offset],
20608 extract the two parts and store them in BASE and OFFSET. Return the
20609 scheduling fusion type of this INSN. */
20610
20611 static enum sched_fusion_type
20612 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
20613 {
20614 rtx x, dest, src;
20615 enum sched_fusion_type fusion = SCHED_FUSION_LD;
20616
20617 gcc_assert (INSN_P (insn));
20618 x = PATTERN (insn);
20619 if (GET_CODE (x) != SET)
20620 return SCHED_FUSION_NONE;
20621
20622 src = SET_SRC (x);
20623 dest = SET_DEST (x);
20624
20625 machine_mode dest_mode = GET_MODE (dest);
20626
20627 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
20628 return SCHED_FUSION_NONE;
20629
20630 if (GET_CODE (src) == SIGN_EXTEND)
20631 {
20632 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
20633 src = XEXP (src, 0);
20634 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20635 return SCHED_FUSION_NONE;
20636 }
20637 else if (GET_CODE (src) == ZERO_EXTEND)
20638 {
20639 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
20640 src = XEXP (src, 0);
20641 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
20642 return SCHED_FUSION_NONE;
20643 }
20644
20645 if (GET_CODE (src) == MEM && REG_P (dest))
20646 extract_base_offset_in_addr (src, base, offset);
20647 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
20648 {
20649 fusion = SCHED_FUSION_ST;
20650 extract_base_offset_in_addr (dest, base, offset);
20651 }
20652 else
20653 return SCHED_FUSION_NONE;
20654
20655 if (*base == NULL_RTX || *offset == NULL_RTX)
20656 fusion = SCHED_FUSION_NONE;
20657
20658 return fusion;
20659 }
20660
20661 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
20662
20663 Currently we only support fusing ldr or str instructions, so FUSION_PRI
20664 and PRI are only calculated for these instructions. For other instructions,
20665 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
20666 types of instruction fusion can be added by returning different priorities.
20667
20668 It's important that irrelevant instructions get the largest FUSION_PRI. */
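/* For example, two loads such as ldr w0, [x1, 8] and ldr w2, [x1, 12]
   get the same FUSION_PRI (same fusion type and base register) but
   different PRI values, so the scheduler keeps them together and orders
   them by increasing offset, ready for the ldp/stp peepholes.  */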
20669
20670 static void
20671 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
20672 int *fusion_pri, int *pri)
20673 {
20674 int tmp, off_val;
20675 rtx base, offset;
20676 enum sched_fusion_type fusion;
20677
20678 gcc_assert (INSN_P (insn));
20679
20680 tmp = max_pri - 1;
20681 fusion = fusion_load_store (insn, &base, &offset);
20682 if (fusion == SCHED_FUSION_NONE)
20683 {
20684 *pri = tmp;
20685 *fusion_pri = tmp;
20686 return;
20687 }
20688
20689 /* Set FUSION_PRI according to fusion type and base register. */
20690 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
20691
20692 /* Calculate PRI. */
20693 tmp /= 2;
20694
20695 /* INSN with smaller offset goes first. */
20696 off_val = (int)(INTVAL (offset));
20697 if (off_val >= 0)
20698 tmp -= (off_val & 0xfffff);
20699 else
20700 tmp += ((- off_val) & 0xfffff);
20701
20702 *pri = tmp;
20703 return;
20704 }
20705
20706 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
20707 Adjust priority of sha1h instructions so they are scheduled before
20708 other SHA1 instructions. */
20709
20710 static int
20711 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
20712 {
20713 rtx x = PATTERN (insn);
20714
20715 if (GET_CODE (x) == SET)
20716 {
20717 x = SET_SRC (x);
20718
20719 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
20720 return priority + 10;
20721 }
20722
20723 return priority;
20724 }
20725
20726 /* Given OPERANDS of consecutive load/store, check if we can merge
20727 them into ldp/stp. LOAD is true if they are load instructions.
20728 MODE is the mode of memory operands. */
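/* For example, ldr w0, [x2] and ldr w1, [x2, 4] can be merged into
   ldp w0, w1, [x2], provided the checks below succeed (distinct
   destination registers, non-volatile mems, matching register classes,
   and so on).  */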
20729
20730 bool
20731 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
20732 machine_mode mode)
20733 {
20734 HOST_WIDE_INT offval_1, offval_2, msize;
20735 enum reg_class rclass_1, rclass_2;
20736 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
20737
20738 if (load)
20739 {
20740 mem_1 = operands[1];
20741 mem_2 = operands[3];
20742 reg_1 = operands[0];
20743 reg_2 = operands[2];
20744 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
20745 if (REGNO (reg_1) == REGNO (reg_2))
20746 return false;
20747 }
20748 else
20749 {
20750 mem_1 = operands[0];
20751 mem_2 = operands[2];
20752 reg_1 = operands[1];
20753 reg_2 = operands[3];
20754 }
20755
20756 /* The mems cannot be volatile. */
20757 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
20758 return false;
20759
20760 /* If we have SImode and slow unaligned ldp,
20761 check that the alignment is at least 8 bytes. */
20762 if (mode == SImode
20763 && (aarch64_tune_params.extra_tuning_flags
20764 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
20765 && !optimize_size
20766 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
20767 return false;
20768
20769 /* Check if the addresses are in the form of [base+offset]. */
20770 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20771 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
20772 return false;
20773 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20774 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
20775 return false;
20776
20777 /* Check if the bases are the same. */
20778 if (!rtx_equal_p (base_1, base_2))
20779 return false;
20780
20781 /* The operands must be of the same size. */
20782 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
20783 GET_MODE_SIZE (GET_MODE (mem_2))));
20784
20785 offval_1 = INTVAL (offset_1);
20786 offval_2 = INTVAL (offset_2);
20787 /* We should only be trying this for fixed-sized modes. There is no
20788 SVE LDP/STP instruction. */
20789 msize = GET_MODE_SIZE (mode).to_constant ();
20790 /* Check if the offsets are consecutive. */
20791 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
20792 return false;
20793
20794 /* Check if the addresses are clobbered by load. */
20795 if (load)
20796 {
20797 if (reg_mentioned_p (reg_1, mem_1))
20798 return false;
20799
20800 /* In increasing order, the last load can clobber the address. */
20801 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
20802 return false;
20803 }
20804
20805 /* One of the memory accesses must be a mempair operand.
20806 If it is not the first one, they need to be swapped by the
20807 peephole. */
20808 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
20809 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
20810 return false;
20811
20812 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
20813 rclass_1 = FP_REGS;
20814 else
20815 rclass_1 = GENERAL_REGS;
20816
20817 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
20818 rclass_2 = FP_REGS;
20819 else
20820 rclass_2 = GENERAL_REGS;
20821
20822 /* Check if the registers are of the same class. */
20823 if (rclass_1 != rclass_2)
20824 return false;
20825
20826 return true;
20827 }
20828
20829 /* Given OPERANDS of consecutive load/store that can be merged,
20830 swap them if they are not in ascending order. */
20831 void
20832 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
20833 {
20834 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
20835 HOST_WIDE_INT offval_1, offval_2;
20836
20837 if (load)
20838 {
20839 mem_1 = operands[1];
20840 mem_2 = operands[3];
20841 }
20842 else
20843 {
20844 mem_1 = operands[0];
20845 mem_2 = operands[2];
20846 }
20847
20848 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
20849 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
20850
20851 offval_1 = INTVAL (offset_1);
20852 offval_2 = INTVAL (offset_2);
20853
20854 if (offval_1 > offval_2)
20855 {
20856 /* Irrespective of whether this is a load or a store,
20857 we do the same swap. */
20858 std::swap (operands[0], operands[2]);
20859 std::swap (operands[1], operands[3]);
20860 }
20861 }
20862
20863 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
20864 comparison between the two. */
20865 int
20866 aarch64_host_wide_int_compare (const void *x, const void *y)
20867 {
20868 return wi::cmps (* ((const HOST_WIDE_INT *) x),
20869 * ((const HOST_WIDE_INT *) y));
20870 }
20871
20872 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
20873 other pointing to a REG rtx containing an offset, compare the offsets
20874 of the two pairs.
20875
20876 Return:
20877
20878 1 iff offset (X) > offset (Y)
20879 0 iff offset (X) == offset (Y)
20880 -1 iff offset (X) < offset (Y) */
20881 int
20882 aarch64_ldrstr_offset_compare (const void *x, const void *y)
20883 {
20884 const rtx * operands_1 = (const rtx *) x;
20885 const rtx * operands_2 = (const rtx *) y;
20886 rtx mem_1, mem_2, base, offset_1, offset_2;
20887
20888 if (MEM_P (operands_1[0]))
20889 mem_1 = operands_1[0];
20890 else
20891 mem_1 = operands_1[1];
20892
20893 if (MEM_P (operands_2[0]))
20894 mem_2 = operands_2[0];
20895 else
20896 mem_2 = operands_2[1];
20897
20898 /* Extract the offsets. */
20899 extract_base_offset_in_addr (mem_1, &base, &offset_1);
20900 extract_base_offset_in_addr (mem_2, &base, &offset_2);
20901
20902 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
20903
20904 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
20905 }
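/* This comparator is meant to be used with qsort on an array of four
   register/memory operand pairs, as done in aarch64_gen_adjusted_ldpstp
   below, e.g.:

       qsort (temp_operands, 4, 2 * sizeof (rtx *),
	      aarch64_ldrstr_offset_compare);

   which leaves the pairs ordered by ascending memory offset.  */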
20906
20907 /* Given OPERANDS of consecutive load/store, check if we can merge
20908 them into ldp/stp by adjusting the offset. LOAD is true if they
20909 are load instructions. MODE is the mode of memory operands.
20910
20911 For example, given the following consecutive stores:
20912
20913 str w1, [xb, 0x100]
20914 str w1, [xb, 0x104]
20915 str w1, [xb, 0x108]
20916 str w1, [xb, 0x10c]
20917
20918 Though the offsets are out of the range supported by stp, we can
20919 still pair them after adjusting the offset, like:
20920
20921 add scratch, xb, 0x100
20922 stp w1, w1, [scratch]
20923 stp w1, w1, [scratch, 0x8]
20924
20925 The peephole patterns detecting this opportunity should guarantee
20926 the scratch register is available. */
20927
20928 bool
20929 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
20930 scalar_mode mode)
20931 {
20932 const int num_insns = 4;
20933 enum reg_class rclass;
20934 HOST_WIDE_INT offvals[num_insns], msize;
20935 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
20936
20937 if (load)
20938 {
20939 for (int i = 0; i < num_insns; i++)
20940 {
20941 reg[i] = operands[2 * i];
20942 mem[i] = operands[2 * i + 1];
20943
20944 gcc_assert (REG_P (reg[i]));
20945 }
20946
20947 /* Do not attempt to merge the loads if the loads clobber each other. */
20948 for (int i = 0; i < 8; i += 2)
20949 for (int j = i + 2; j < 8; j += 2)
20950 if (reg_overlap_mentioned_p (operands[i], operands[j]))
20951 return false;
20952 }
20953 else
20954 for (int i = 0; i < num_insns; i++)
20955 {
20956 mem[i] = operands[2 * i];
20957 reg[i] = operands[2 * i + 1];
20958 }
20959
20960 /* Skip if the memory operand is by itself already valid for ldp/stp. */
20961 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
20962 return false;
20963
20964 for (int i = 0; i < num_insns; i++)
20965 {
20966 /* The mems cannot be volatile. */
20967 if (MEM_VOLATILE_P (mem[i]))
20968 return false;
20969
20970 /* Check if the addresses are in the form of [base+offset]. */
20971 extract_base_offset_in_addr (mem[i], base + i, offset + i);
20972 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
20973 return false;
20974 }
20975
20976 /* Check if the registers are of the same class. */
20977 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
20978 ? FP_REGS : GENERAL_REGS;
20979
20980 for (int i = 1; i < num_insns; i++)
20981 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
20982 {
20983 if (rclass != FP_REGS)
20984 return false;
20985 }
20986 else
20987 {
20988 if (rclass != GENERAL_REGS)
20989 return false;
20990 }
20991
20992 /* Only the last register in the order in which they occur
20993 may be clobbered by the load. */
20994 if (rclass == GENERAL_REGS && load)
20995 for (int i = 0; i < num_insns - 1; i++)
20996 if (reg_mentioned_p (reg[i], mem[i]))
20997 return false;
20998
20999 /* Check if the bases are the same. */
21000 for (int i = 0; i < num_insns - 1; i++)
21001 if (!rtx_equal_p (base[i], base[i + 1]))
21002 return false;
21003
21004 for (int i = 0; i < num_insns; i++)
21005 offvals[i] = INTVAL (offset[i]);
21006
21007 msize = GET_MODE_SIZE (mode);
21008
21009 /* Check if the offsets can be put in the right order to do a ldp/stp. */
21010 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
21011 aarch64_host_wide_int_compare);
21012
21013 if (!(offvals[1] == offvals[0] + msize
21014 && offvals[3] == offvals[2] + msize))
21015 return false;
21016
21017 /* Check that the offsets are within range of each other. The ldp/stp
21018 instructions have 7-bit immediate offsets, so use 0x80. */
21019 if (offvals[2] - offvals[0] >= msize * 0x80)
21020 return false;
21021
21022 /* The offsets must be aligned with respect to each other. */
21023 if (offvals[0] % msize != offvals[2] % msize)
21024 return false;
21025
21026 /* If we have SImode and slow unaligned ldp,
21027 check that the alignment is at least 8 bytes. */
21028 if (mode == SImode
21029 && (aarch64_tune_params.extra_tuning_flags
21030 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
21031 && !optimize_size
21032 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
21033 return false;
21034
21035 return true;
21036 }
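/* Worked example of the offset checks above (hypothetical values): for
   SImode (msize == 4) with offsets 0x100, 0x104, 0x108 and 0x10c, the
   sorted offsets satisfy

       offvals[1] == offvals[0] + 4 and offvals[3] == offvals[2] + 4,
       offvals[2] - offvals[0] == 8 < 4 * 0x80,
       offvals[0] % 4 == offvals[2] % 4,

   so the four accesses are accepted for pairing.  */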
21037
21038 /* Given OPERANDS of consecutive load/store, this function pairs them
21039 into LDP/STP after adjusting the offset. It depends on the fact
21040 that the operands can be sorted so the offsets are correct for STP.
21041 MODE is the mode of memory operands. CODE is the rtl operator
21042 which should be applied to all memory operands; it is SIGN_EXTEND,
21043 ZERO_EXTEND or UNKNOWN. */
21044
21045 bool
21046 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
21047 scalar_mode mode, RTX_CODE code)
21048 {
21049 rtx base, offset_1, offset_3, t1, t2;
21050 rtx mem_1, mem_2, mem_3, mem_4;
21051 rtx temp_operands[8];
21052 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
21053 stp_off_upper_limit, stp_off_lower_limit, msize;
21054
21055 /* We make changes on a copy as we may still bail out. */
21056 for (int i = 0; i < 8; i ++)
21057 temp_operands[i] = operands[i];
21058
21059 /* Sort the operands. */
21060 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
21061
21062 /* Copy the memory operands so that if we have to bail for some
21063 reason the original addresses are unchanged. */
21064 if (load)
21065 {
21066 mem_1 = copy_rtx (temp_operands[1]);
21067 mem_2 = copy_rtx (temp_operands[3]);
21068 mem_3 = copy_rtx (temp_operands[5]);
21069 mem_4 = copy_rtx (temp_operands[7]);
21070 }
21071 else
21072 {
21073 mem_1 = copy_rtx (temp_operands[0]);
21074 mem_2 = copy_rtx (temp_operands[2]);
21075 mem_3 = copy_rtx (temp_operands[4]);
21076 mem_4 = copy_rtx (temp_operands[6]);
21077 gcc_assert (code == UNKNOWN);
21078 }
21079
21080 extract_base_offset_in_addr (mem_1, &base, &offset_1);
21081 extract_base_offset_in_addr (mem_3, &base, &offset_3);
21082 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
21083 && offset_3 != NULL_RTX);
21084
21085 /* Adjust offset so it can fit in LDP/STP instruction. */
21086 msize = GET_MODE_SIZE (mode);
21087 stp_off_upper_limit = msize * (0x40 - 1);
21088 stp_off_lower_limit = - msize * 0x40;
21089
21090 off_val_1 = INTVAL (offset_1);
21091 off_val_3 = INTVAL (offset_3);
21092
21093 /* The base offset is optimally halfway between the two STP/LDP offsets. */
21094 if (msize <= 4)
21095 base_off = (off_val_1 + off_val_3) / 2;
21096 else
21097 /* However, due to issues with negative LDP/STP offset generation for
21098 larger modes (DF, DI and vector modes), we must not use negative
21099 addresses smaller than what 9 signed unadjusted bits can store. This
21100 provides the most range in this case. */
21101 base_off = off_val_1;
21102
21103 /* Adjust the base so that it is aligned with the addresses but still
21104 optimal. */
21105 if (base_off % msize != off_val_1 % msize)
21106 /* Fix the offset, bearing in mind we want to make it bigger, not
21107 smaller. */
21108 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21109 else if (msize <= 4)
21110 /* The negative range of LDP/STP is one larger than the positive range. */
21111 base_off += msize;
21112
21113 /* Check if base offset is too big or too small. We can attempt to resolve
21114 this issue by setting it to the maximum value and seeing if the offsets
21115 still fit. */
21116 if (base_off >= 0x1000)
21117 {
21118 base_off = 0x1000 - 1;
21119 /* We must still make sure that the base offset is aligned with respect
21120 to the address. But it may not be made any bigger. */
21121 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21122 }
21123
21124 /* Likewise for the case where the base is too small. */
21125 if (base_off <= -0x1000)
21126 {
21127 base_off = -0x1000 + 1;
21128 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
21129 }
21130
21131 /* Offset of the first STP/LDP. */
21132 new_off_1 = off_val_1 - base_off;
21133
21134 /* Offset of the second STP/LDP. */
21135 new_off_3 = off_val_3 - base_off;
21136
21137 /* The offsets must be within the range of the LDP/STP instructions. */
21138 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
21139 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
21140 return false;
21141
21142 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
21143 new_off_1), true);
21144 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
21145 new_off_1 + msize), true);
21146 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
21147 new_off_3), true);
21148 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
21149 new_off_3 + msize), true);
21150
21151 if (!aarch64_mem_pair_operand (mem_1, mode)
21152 || !aarch64_mem_pair_operand (mem_3, mode))
21153 return false;
21154
21155 if (code == ZERO_EXTEND)
21156 {
21157 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
21158 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
21159 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
21160 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
21161 }
21162 else if (code == SIGN_EXTEND)
21163 {
21164 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
21165 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
21166 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
21167 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
21168 }
21169
21170 if (load)
21171 {
21172 operands[0] = temp_operands[0];
21173 operands[1] = mem_1;
21174 operands[2] = temp_operands[2];
21175 operands[3] = mem_2;
21176 operands[4] = temp_operands[4];
21177 operands[5] = mem_3;
21178 operands[6] = temp_operands[6];
21179 operands[7] = mem_4;
21180 }
21181 else
21182 {
21183 operands[0] = mem_1;
21184 operands[1] = temp_operands[1];
21185 operands[2] = mem_2;
21186 operands[3] = temp_operands[3];
21187 operands[4] = mem_3;
21188 operands[5] = temp_operands[5];
21189 operands[6] = mem_4;
21190 operands[7] = temp_operands[7];
21191 }
21192
21193 /* Emit adjusting instruction. */
21194 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
21195 /* Emit ldp/stp instructions. */
21196 t1 = gen_rtx_SET (operands[0], operands[1]);
21197 t2 = gen_rtx_SET (operands[2], operands[3]);
21198 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21199 t1 = gen_rtx_SET (operands[4], operands[5]);
21200 t2 = gen_rtx_SET (operands[6], operands[7]);
21201 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
21202 return true;
21203 }
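/* Worked example of the offset adjustment above (hypothetical values):
   for SImode (msize == 4) with sorted offsets 0x100 .. 0x10c,
   off_val_1 == 0x100 and off_val_3 == 0x108, so

       base_off = (0x100 + 0x108) / 2 = 0x104,

   which is already aligned with off_val_1, so base_off += 4 -> 0x108;
   then new_off_1 = 0x100 - 0x108 = -8 and new_off_3 = 0.  Both lie
   within [-0x100, 0xfc], so the emitted sequence is roughly

       add  scratch, base, 0x108
       stp  w1, w1, [scratch, -8]
       stp  w1, w1, [scratch]

   (register names are purely illustrative).  */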
21204
21205 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
21206 it isn't worth branching around empty masked ops (including masked
21207 stores). */
21208
21209 static bool
21210 aarch64_empty_mask_is_expensive (unsigned)
21211 {
21212 return false;
21213 }
21214
21215 /* Return 1 if pseudo register should be created and used to hold
21216 GOT address for PIC code. */
21217
21218 bool
21219 aarch64_use_pseudo_pic_reg (void)
21220 {
21221 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
21222 }
21223
21224 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
21225
21226 static int
21227 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
21228 {
21229 switch (XINT (x, 1))
21230 {
21231 case UNSPEC_GOTSMALLPIC:
21232 case UNSPEC_GOTSMALLPIC28K:
21233 case UNSPEC_GOTTINYPIC:
21234 return 0;
21235 default:
21236 break;
21237 }
21238
21239 return default_unspec_may_trap_p (x, flags);
21240 }
21241
21242
21243 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
21244 return the log2 of that value. Otherwise return -1. */
21245
21246 int
21247 aarch64_fpconst_pow_of_2 (rtx x)
21248 {
21249 const REAL_VALUE_TYPE *r;
21250
21251 if (!CONST_DOUBLE_P (x))
21252 return -1;
21253
21254 r = CONST_DOUBLE_REAL_VALUE (x);
21255
21256 if (REAL_VALUE_NEGATIVE (*r)
21257 || REAL_VALUE_ISNAN (*r)
21258 || REAL_VALUE_ISINF (*r)
21259 || !real_isinteger (r, DFmode))
21260 return -1;
21261
21262 return exact_log2 (real_to_integer (r));
21263 }
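/* For example (hypothetical constants): 8.0 yields 3, 1.0 yields 0,
   and 0.75, -4.0 or 3.0 all yield -1 (not a positive integral power
   of 2).  */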
21264
21265 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
21266 power of 2 (i.e. 1/2^n), return the exponent n; e.g. for X == 1/2^n
21267 return n. Otherwise return -1. */
21268
21269 int
21270 aarch64_fpconst_pow2_recip (rtx x)
21271 {
21272 REAL_VALUE_TYPE r0;
21273
21274 if (!CONST_DOUBLE_P (x))
21275 return -1;
21276
21277 r0 = *CONST_DOUBLE_REAL_VALUE (x);
21278 if (exact_real_inverse (DFmode, &r0)
21279 && !REAL_VALUE_NEGATIVE (r0))
21280 {
21281 int ret = exact_log2 (real_to_integer (&r0));
21282 if (ret >= 1 && ret <= 32)
21283 return ret;
21284 }
21285 return -1;
21286 }
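/* For example (hypothetical constants): 0.25 yields 2 (since 1/0.25 == 4
   == 2^2) and 0.5 yields 1, while 1.0 yields -1 (the exponent 0 is
   outside the accepted range [1, 32]) and 0.3 yields -1 (its reciprocal
   is not an exact power of 2).  */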
21287
21288 /* If X is a vector of equal CONST_DOUBLE values and that value is
21289 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
21290
21291 int
21292 aarch64_vec_fpconst_pow_of_2 (rtx x)
21293 {
21294 int nelts;
21295 if (GET_CODE (x) != CONST_VECTOR
21296 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
21297 return -1;
21298
21299 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
21300 return -1;
21301
21302 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
21303 if (firstval <= 0)
21304 return -1;
21305
21306 for (int i = 1; i < nelts; i++)
21307 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
21308 return -1;
21309
21310 return firstval;
21311 }
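/* For example (hypothetical vector constants): { 4.0, 4.0, 4.0, 4.0 }
   yields 2, whereas { 4.0, 8.0, 4.0, 8.0 } and { 1.0, 1.0 } yield -1
   (the elements differ, or the common value 1.0 gives a non-positive
   log2).  */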
21312
21313 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
21314 to float.
21315
21316 __fp16 always promotes through this hook.
21317 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
21318 through the generic excess precision logic rather than here. */
21319
21320 static tree
21321 aarch64_promoted_type (const_tree t)
21322 {
21323 if (SCALAR_FLOAT_TYPE_P (t)
21324 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
21325 return float_type_node;
21326
21327 return NULL_TREE;
21328 }
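/* For example (illustrative), given

       __fp16 a, b;
       ... a + b ...

   the addition is performed in float, and the result is narrowed back
   only if it is assigned to an __fp16 object, matching the ACLE
   storage-only semantics of __fp16.  */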
21329
21330 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
21331
21332 static bool
21333 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
21334 optimization_type opt_type)
21335 {
21336 switch (op)
21337 {
21338 case rsqrt_optab:
21339 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
21340
21341 default:
21342 return true;
21343 }
21344 }
21345
21346 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
21347
21348 static unsigned int
21349 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
21350 int *offset)
21351 {
21352 /* Polynomial invariant 1 == (VG / 2) - 1. */
21353 gcc_assert (i == 1);
21354 *factor = 2;
21355 *offset = 1;
21356 return AARCH64_DWARF_VG;
21357 }
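/* As a concrete reading of the invariant above (illustrative): VG holds
   the number of 64-bit granules in an SVE vector, so for 128-bit vectors
   VG == 2 and indeterminate 1 == 2/2 - 1 == 0, while for 256-bit vectors
   VG == 4 and indeterminate 1 == 1.  */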
21358
21359 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
21360 if MODE is HFmode, and punt to the generic implementation otherwise. */
21361
21362 static bool
21363 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
21364 {
21365 return (mode == HFmode
21366 ? true
21367 : default_libgcc_floating_mode_supported_p (mode));
21368 }
21369
21370 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
21371 if MODE is HFmode, and punt to the generic implementation otherwise. */
21372
21373 static bool
21374 aarch64_scalar_mode_supported_p (scalar_mode mode)
21375 {
21376 return (mode == HFmode
21377 ? true
21378 : default_scalar_mode_supported_p (mode));
21379 }
21380
21381 /* Set the value of FLT_EVAL_METHOD.
21382 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
21383
21384 0: evaluate all operations and constants, whose semantic type has at
21385 most the range and precision of type float, to the range and
21386 precision of float; evaluate all other operations and constants to
21387 the range and precision of the semantic type;
21388
21389 N, where _FloatN is a supported interchange floating type:
21390 evaluate all operations and constants, whose semantic type has at
21391 most the range and precision of _FloatN type, to the range and
21392 precision of the _FloatN type; evaluate all other operations and
21393 constants to the range and precision of the semantic type;
21394
21395 If we have the ARMv8.2-A extensions then we support _Float16 in native
21396 precision, so we should set this to 16. Otherwise, we support the type,
21397 but want to evaluate expressions in float precision, so set this to
21398 0. */
21399
21400 static enum flt_eval_method
21401 aarch64_excess_precision (enum excess_precision_type type)
21402 {
21403 switch (type)
21404 {
21405 case EXCESS_PRECISION_TYPE_FAST:
21406 case EXCESS_PRECISION_TYPE_STANDARD:
21407 /* We can calculate either in 16-bit range and precision or
21408 32-bit range and precision. Make that decision based on whether
21409 we have native support for the ARMv8.2-A 16-bit floating-point
21410 instructions or not. */
21411 return (TARGET_FP_F16INST
21412 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
21413 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
21414 case EXCESS_PRECISION_TYPE_IMPLICIT:
21415 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
21416 default:
21417 gcc_unreachable ();
21418 }
21419 return FLT_EVAL_METHOD_UNPREDICTABLE;
21420 }
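/* A sketch of the effect (illustrative source code): with native FP16
   instructions (TARGET_FP_F16INST) an expression such as

       _Float16 x = a * b + c;   /+ a, b, c are _Float16 +/

   can be evaluated entirely in half precision (FLT_EVAL_METHOD == 16),
   whereas without them a, b and c are first converted to float, the
   arithmetic is done in single precision and the result converted back
   (FLT_EVAL_METHOD == 0).  */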
21421
21422 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
21423 scheduled for speculative execution. Reject the long-running division
21424 and square-root instructions. */
21425
21426 static bool
21427 aarch64_sched_can_speculate_insn (rtx_insn *insn)
21428 {
21429 switch (get_attr_type (insn))
21430 {
21431 case TYPE_SDIV:
21432 case TYPE_UDIV:
21433 case TYPE_FDIVS:
21434 case TYPE_FDIVD:
21435 case TYPE_FSQRTS:
21436 case TYPE_FSQRTD:
21437 case TYPE_NEON_FP_SQRT_S:
21438 case TYPE_NEON_FP_SQRT_D:
21439 case TYPE_NEON_FP_SQRT_S_Q:
21440 case TYPE_NEON_FP_SQRT_D_Q:
21441 case TYPE_NEON_FP_DIV_S:
21442 case TYPE_NEON_FP_DIV_D:
21443 case TYPE_NEON_FP_DIV_S_Q:
21444 case TYPE_NEON_FP_DIV_D_Q:
21445 return false;
21446 default:
21447 return true;
21448 }
21449 }
21450
21451 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
21452
21453 static int
21454 aarch64_compute_pressure_classes (reg_class *classes)
21455 {
21456 int i = 0;
21457 classes[i++] = GENERAL_REGS;
21458 classes[i++] = FP_REGS;
21459 /* PR_REGS isn't a useful pressure class because many predicate pseudo
21460 registers need to go in PR_LO_REGS at some point during their
21461 lifetime. Splitting it into two halves has the effect of making
21462 all predicates count against PR_LO_REGS, so that we try whenever
21463 possible to restrict the number of live predicates to 8. This
21464 greatly reduces the amount of spilling in certain loops. */
21465 classes[i++] = PR_LO_REGS;
21466 classes[i++] = PR_HI_REGS;
21467 return i;
21468 }
21469
21470 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
21471
21472 static bool
21473 aarch64_can_change_mode_class (machine_mode from,
21474 machine_mode to, reg_class_t)
21475 {
21476 unsigned int from_flags = aarch64_classify_vector_mode (from);
21477 unsigned int to_flags = aarch64_classify_vector_mode (to);
21478
21479 bool from_sve_p = (from_flags & VEC_ANY_SVE);
21480 bool to_sve_p = (to_flags & VEC_ANY_SVE);
21481
21482 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
21483 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
21484
21485 /* Don't allow changes between partial SVE modes and other modes.
21486 The contents of partial SVE modes are distributed evenly across
21487 the register, whereas GCC expects them to be clustered together. */
21488 if (from_partial_sve_p != to_partial_sve_p)
21489 return false;
21490
21491 /* Similarly reject changes between partial SVE modes that have
21492 different patterns of significant and insignificant bits. */
21493 if (from_partial_sve_p
21494 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
21495 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
21496 return false;
21497
21498 if (BYTES_BIG_ENDIAN)
21499 {
21500 /* Don't allow changes between SVE data modes and non-SVE modes.
21501 See the comment at the head of aarch64-sve.md for details. */
21502 if (from_sve_p != to_sve_p)
21503 return false;
21504
21505 /* Don't allow changes in element size: lane 0 of the new vector
21506 would not then be lane 0 of the old vector. See the comment
21507 above aarch64_maybe_expand_sve_subreg_move for a more detailed
21508 description.
21509
21510 In the worst case, this forces a register to be spilled in
21511 one mode and reloaded in the other, which handles the
21512 endianness correctly. */
21513 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
21514 return false;
21515 }
21516 return true;
21517 }
21518
21519 /* Implement TARGET_EARLY_REMAT_MODES. */
21520
21521 static void
21522 aarch64_select_early_remat_modes (sbitmap modes)
21523 {
21524 /* SVE values are not normally live across a call, so it should be
21525 worth doing early rematerialization even in VL-specific mode. */
21526 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
21527 if (aarch64_sve_mode_p ((machine_mode) i))
21528 bitmap_set_bit (modes, i);
21529 }
21530
21531 /* Override the default target speculation_safe_value. */
21532 static rtx
21533 aarch64_speculation_safe_value (machine_mode mode,
21534 rtx result, rtx val, rtx failval)
21535 {
21536 /* Maybe we should warn if falling back to hard barriers. They are
21537 likely to be noticeably more expensive than the alternative below. */
21538 if (!aarch64_track_speculation)
21539 return default_speculation_safe_value (mode, result, val, failval);
21540
21541 if (!REG_P (val))
21542 val = copy_to_mode_reg (mode, val);
21543
21544 if (!aarch64_reg_or_zero (failval, mode))
21545 failval = copy_to_mode_reg (mode, failval);
21546
21547 emit_insn (gen_despeculate_copy (mode, result, val, failval));
21548 return result;
21549 }
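/* Usage sketch (illustrative; the builtin is the generic GCC one):
   source code such as

       if (i < len)
	 val = __builtin_speculation_safe_value (array[i]);

   is expanded through this hook.  With -mtrack-speculation the copy is
   guarded using the speculation tracker register; otherwise the default
   implementation above is used, which relies on a full speculation
   barrier.  */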
21550
21551 /* Implement TARGET_ESTIMATED_POLY_VALUE.
21552 Look into the tuning structure for an estimate.
21553 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
21554 Advanced SIMD 128 bits. */
21555
21556 static HOST_WIDE_INT
21557 aarch64_estimated_poly_value (poly_int64 val)
21558 {
21559 enum aarch64_sve_vector_bits_enum width_source
21560 = aarch64_tune_params.sve_width;
21561
21562 /* If we still don't have an estimate, use the default. */
21563 if (width_source == SVE_SCALABLE)
21564 return default_estimated_poly_value (val);
21565
21566 HOST_WIDE_INT over_128 = width_source - 128;
21567 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
21568 }
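/* Worked example (hypothetical tuning): if the tuning structure gives
   sve_width == 256, then over_128 == 128 and a poly_int64 value of
   2 + 2x (coeffs[0] == 2, coeffs[1] == 2) is estimated as
   2 + 2 * 128 / 128 == 4.  */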
21569
21570
21571 /* Return true for types that could be supported as SIMD return or
21572 argument types. */
21573
21574 static bool
21575 supported_simd_type (tree t)
21576 {
21577 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
21578 {
21579 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
21580 return s == 1 || s == 2 || s == 4 || s == 8;
21581 }
21582 return false;
21583 }
21584
21585 /* Return true for types that currently are supported as SIMD return
21586 or argument types. */
21587
21588 static bool
21589 currently_supported_simd_type (tree t, tree b)
21590 {
21591 if (COMPLEX_FLOAT_TYPE_P (t))
21592 return false;
21593
21594 if (TYPE_SIZE (t) != TYPE_SIZE (b))
21595 return false;
21596
21597 return supported_simd_type (t);
21598 }
21599
21600 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
21601
21602 static int
21603 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
21604 struct cgraph_simd_clone *clonei,
21605 tree base_type, int num)
21606 {
21607 tree t, ret_type, arg_type;
21608 unsigned int elt_bits, vec_bits, count;
21609
21610 if (!TARGET_SIMD)
21611 return 0;
21612
21613 if (clonei->simdlen
21614 && (clonei->simdlen < 2
21615 || clonei->simdlen > 1024
21616 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
21617 {
21618 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21619 "unsupported simdlen %d", clonei->simdlen);
21620 return 0;
21621 }
21622
21623 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
21624 if (TREE_CODE (ret_type) != VOID_TYPE
21625 && !currently_supported_simd_type (ret_type, base_type))
21626 {
21627 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
21628 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21629 "GCC does not currently support mixed size types "
21630 "for %<simd%> functions");
21631 else if (supported_simd_type (ret_type))
21632 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21633 "GCC does not currently support return type %qT "
21634 "for %<simd%> functions", ret_type);
21635 else
21636 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21637 "unsupported return type %qT for %<simd%> functions",
21638 ret_type);
21639 return 0;
21640 }
21641
21642 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
21643 {
21644 arg_type = TREE_TYPE (t);
21645
21646 if (!currently_supported_simd_type (arg_type, base_type))
21647 {
21648 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
21649 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21650 "GCC does not currently support mixed size types "
21651 "for %<simd%> functions");
21652 else
21653 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21654 "GCC does not currently support argument type %qT "
21655 "for %<simd%> functions", arg_type);
21656 return 0;
21657 }
21658 }
21659
21660 clonei->vecsize_mangle = 'n';
21661 clonei->mask_mode = VOIDmode;
21662 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
21663 if (clonei->simdlen == 0)
21664 {
21665 count = 2;
21666 vec_bits = (num == 0 ? 64 : 128);
21667 clonei->simdlen = vec_bits / elt_bits;
21668 }
21669 else
21670 {
21671 count = 1;
21672 vec_bits = clonei->simdlen * elt_bits;
21673 if (vec_bits != 64 && vec_bits != 128)
21674 {
21675 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
21676 "GCC does not currently support simdlen %d for type %qT",
21677 clonei->simdlen, base_type);
21678 return 0;
21679 }
21680 }
21681 clonei->vecsize_int = vec_bits;
21682 clonei->vecsize_float = vec_bits;
21683 return count;
21684 }
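/* For example (illustrative declaration):

       #pragma omp declare simd
       double f (double x);

   gives elt_bits == 64 with no simdlen specified, so two clones are
   created: one using 64-bit vectors (simdlen 1) and one using 128-bit
   Advanced SIMD vectors (simdlen 2).  */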
21685
21686 /* Implement TARGET_SIMD_CLONE_ADJUST. */
21687
21688 static void
21689 aarch64_simd_clone_adjust (struct cgraph_node *node)
21690 {
21691 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
21692 use the correct ABI. */
21693
21694 tree t = TREE_TYPE (node->decl);
21695 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
21696 TYPE_ATTRIBUTES (t));
21697 }
21698
21699 /* Implement TARGET_SIMD_CLONE_USABLE. */
21700
21701 static int
21702 aarch64_simd_clone_usable (struct cgraph_node *node)
21703 {
21704 switch (node->simdclone->vecsize_mangle)
21705 {
21706 case 'n':
21707 if (!TARGET_SIMD)
21708 return -1;
21709 return 0;
21710 default:
21711 gcc_unreachable ();
21712 }
21713 }
21714
21715 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
21716
21717 static int
21718 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
21719 {
21720 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
21721 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
21722 return 0;
21723 return 1;
21724 }
21725
21726 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
21727
21728 static const char *
21729 aarch64_get_multilib_abi_name (void)
21730 {
21731 if (TARGET_BIG_END)
21732 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
21733 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
21734 }
21735
21736 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
21737 global-variable-based guard, use the default; otherwise
21738 return a null tree. */
21739 static tree
21740 aarch64_stack_protect_guard (void)
21741 {
21742 if (aarch64_stack_protector_guard == SSP_GLOBAL)
21743 return default_stack_protect_guard ();
21744
21745 return NULL_TREE;
21746 }
21747
21748 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
21749 section at the end if needed. */
21750 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
21751 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
21752 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
21753 void
21754 aarch64_file_end_indicate_exec_stack ()
21755 {
21756 file_end_indicate_exec_stack ();
21757
21758 unsigned feature_1_and = 0;
21759 if (aarch64_bti_enabled ())
21760 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
21761
21762 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
21763 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
21764
21765 if (feature_1_and)
21766 {
21767 /* Generate .note.gnu.property section. */
21768 switch_to_section (get_section (".note.gnu.property",
21769 SECTION_NOTYPE, NULL));
21770
21771 /* PT_NOTE header: namesz, descsz, type.
21772 namesz = 4 ("GNU\0")
21773 descsz = 16 (Size of the program property array)
21774 [(12 + padding) * Number of array elements]
21775 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
21776 assemble_align (POINTER_SIZE);
21777 assemble_integer (GEN_INT (4), 4, 32, 1);
21778 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
21779 assemble_integer (GEN_INT (5), 4, 32, 1);
21780
21781 /* PT_NOTE name. */
21782 assemble_string ("GNU", 4);
21783
21784 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
21785 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
21786 datasz = 4
21787 data = feature_1_and. */
21788 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
21789 assemble_integer (GEN_INT (4), 4, 32, 1);
21790 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
21791
21792 /* Pad the size of the note to the required alignment. */
21793 assemble_align (POINTER_SIZE);
21794 }
21795 }
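/* The output is roughly the following assembly when both BTI and PAC
   are enabled (illustrative, assuming LP64 and hence 8-byte alignment):

       .section .note.gnu.property
       .p2align 3
       .word 4                    // namesz ("GNU\0")
       .word 16                   // descsz
       .word 5                    // NT_GNU_PROPERTY_TYPE_0
       .string "GNU"
       .word 0xc0000000           // GNU_PROPERTY_AARCH64_FEATURE_1_AND
       .word 4                    // datasz
       .word 3                    // BTI | PAC
       .p2align 3
*/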
21796 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
21797 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
21798 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
21799
21800 /* Target-specific selftests. */
21801
21802 #if CHECKING_P
21803
21804 namespace selftest {
21805
21806 /* Selftest for the RTL loader.
21807 Verify that the RTL loader copes with a dump from
21808 print_rtx_function. This is essentially just a test that class
21809 function_reader can handle a real dump, but it also verifies
21810 that lookup_reg_by_dump_name correctly handles hard regs.
21811 The presence of hard reg names in the dump means that the test is
21812 target-specific, hence it is in this file. */
21813
21814 static void
21815 aarch64_test_loading_full_dump ()
21816 {
21817 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
21818
21819 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
21820
21821 rtx_insn *insn_1 = get_insn_by_uid (1);
21822 ASSERT_EQ (NOTE, GET_CODE (insn_1));
21823
21824 rtx_insn *insn_15 = get_insn_by_uid (15);
21825 ASSERT_EQ (INSN, GET_CODE (insn_15));
21826 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
21827
21828 /* Verify crtl->return_rtx. */
21829 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
21830 ASSERT_EQ (0, REGNO (crtl->return_rtx));
21831 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
21832 }
21833
21834 /* Run all target-specific selftests. */
21835
21836 static void
21837 aarch64_run_selftests (void)
21838 {
21839 aarch64_test_loading_full_dump ();
21840 }
21841
21842 } // namespace selftest
21843
21844 #endif /* #if CHECKING_P */
21845
21846 #undef TARGET_STACK_PROTECT_GUARD
21847 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
21848
21849 #undef TARGET_ADDRESS_COST
21850 #define TARGET_ADDRESS_COST aarch64_address_cost
21851
21852 /* This hook determines whether unnamed bitfields affect the alignment
21853 of the containing structure. The hook returns true if the structure
21854 should inherit the alignment requirements of an unnamed bitfield's
21855 type. */
21856 #undef TARGET_ALIGN_ANON_BITFIELD
21857 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
21858
21859 #undef TARGET_ASM_ALIGNED_DI_OP
21860 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
21861
21862 #undef TARGET_ASM_ALIGNED_HI_OP
21863 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
21864
21865 #undef TARGET_ASM_ALIGNED_SI_OP
21866 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
21867
21868 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
21869 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
21870 hook_bool_const_tree_hwi_hwi_const_tree_true
21871
21872 #undef TARGET_ASM_FILE_START
21873 #define TARGET_ASM_FILE_START aarch64_start_file
21874
21875 #undef TARGET_ASM_OUTPUT_MI_THUNK
21876 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
21877
21878 #undef TARGET_ASM_SELECT_RTX_SECTION
21879 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
21880
21881 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
21882 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
21883
21884 #undef TARGET_BUILD_BUILTIN_VA_LIST
21885 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
21886
21887 #undef TARGET_CALLEE_COPIES
21888 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
21889
21890 #undef TARGET_CAN_ELIMINATE
21891 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
21892
21893 #undef TARGET_CAN_INLINE_P
21894 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
21895
21896 #undef TARGET_CANNOT_FORCE_CONST_MEM
21897 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
21898
21899 #undef TARGET_CASE_VALUES_THRESHOLD
21900 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
21901
21902 #undef TARGET_CONDITIONAL_REGISTER_USAGE
21903 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
21904
21905 /* Only the least significant bit is used for initialization guard
21906 variables. */
21907 #undef TARGET_CXX_GUARD_MASK_BIT
21908 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
21909
21910 #undef TARGET_C_MODE_FOR_SUFFIX
21911 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
21912
21913 #ifdef TARGET_BIG_ENDIAN_DEFAULT
21914 #undef TARGET_DEFAULT_TARGET_FLAGS
21915 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
21916 #endif
21917
21918 #undef TARGET_CLASS_MAX_NREGS
21919 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
21920
21921 #undef TARGET_BUILTIN_DECL
21922 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
21923
21924 #undef TARGET_BUILTIN_RECIPROCAL
21925 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
21926
21927 #undef TARGET_C_EXCESS_PRECISION
21928 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
21929
21930 #undef TARGET_EXPAND_BUILTIN
21931 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
21932
21933 #undef TARGET_EXPAND_BUILTIN_VA_START
21934 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
21935
21936 #undef TARGET_FOLD_BUILTIN
21937 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
21938
21939 #undef TARGET_FUNCTION_ARG
21940 #define TARGET_FUNCTION_ARG aarch64_function_arg
21941
21942 #undef TARGET_FUNCTION_ARG_ADVANCE
21943 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
21944
21945 #undef TARGET_FUNCTION_ARG_BOUNDARY
21946 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
21947
21948 #undef TARGET_FUNCTION_ARG_PADDING
21949 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
21950
21951 #undef TARGET_GET_RAW_RESULT_MODE
21952 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
21953 #undef TARGET_GET_RAW_ARG_MODE
21954 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
21955
21956 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
21957 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
21958
21959 #undef TARGET_FUNCTION_VALUE
21960 #define TARGET_FUNCTION_VALUE aarch64_function_value
21961
21962 #undef TARGET_FUNCTION_VALUE_REGNO_P
21963 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
21964
21965 #undef TARGET_GIMPLE_FOLD_BUILTIN
21966 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
21967
21968 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
21969 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
21970
21971 #undef TARGET_INIT_BUILTINS
21972 #define TARGET_INIT_BUILTINS aarch64_init_builtins
21973
21974 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
21975 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
21976 aarch64_ira_change_pseudo_allocno_class
21977
21978 #undef TARGET_LEGITIMATE_ADDRESS_P
21979 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
21980
21981 #undef TARGET_LEGITIMATE_CONSTANT_P
21982 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
21983
21984 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
21985 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
21986 aarch64_legitimize_address_displacement
21987
21988 #undef TARGET_LIBGCC_CMP_RETURN_MODE
21989 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
21990
21991 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
21992 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
21993 aarch64_libgcc_floating_mode_supported_p
21994
21995 #undef TARGET_MANGLE_TYPE
21996 #define TARGET_MANGLE_TYPE aarch64_mangle_type
21997
21998 #undef TARGET_VERIFY_TYPE_CONTEXT
21999 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
22000
22001 #undef TARGET_MEMORY_MOVE_COST
22002 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
22003
22004 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
22005 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
22006
22007 #undef TARGET_MUST_PASS_IN_STACK
22008 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
22009
22010 /* This target hook should return true if accesses to volatile bitfields
22011 should use the narrowest mode possible. It should return false if these
22012 accesses should use the bitfield container type. */
22013 #undef TARGET_NARROW_VOLATILE_BITFIELD
22014 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
22015
22016 #undef TARGET_OPTION_OVERRIDE
22017 #define TARGET_OPTION_OVERRIDE aarch64_override_options
22018
22019 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
22020 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
22021 aarch64_override_options_after_change
22022
22023 #undef TARGET_OPTION_SAVE
22024 #define TARGET_OPTION_SAVE aarch64_option_save
22025
22026 #undef TARGET_OPTION_RESTORE
22027 #define TARGET_OPTION_RESTORE aarch64_option_restore
22028
22029 #undef TARGET_OPTION_PRINT
22030 #define TARGET_OPTION_PRINT aarch64_option_print
22031
22032 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
22033 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
22034
22035 #undef TARGET_SET_CURRENT_FUNCTION
22036 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
22037
22038 #undef TARGET_PASS_BY_REFERENCE
22039 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
22040
22041 #undef TARGET_PREFERRED_RELOAD_CLASS
22042 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
22043
22044 #undef TARGET_SCHED_REASSOCIATION_WIDTH
22045 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
22046
22047 #undef TARGET_PROMOTED_TYPE
22048 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
22049
22050 #undef TARGET_SECONDARY_RELOAD
22051 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
22052
22053 #undef TARGET_SHIFT_TRUNCATION_MASK
22054 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
22055
22056 #undef TARGET_SETUP_INCOMING_VARARGS
22057 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
22058
22059 #undef TARGET_STRUCT_VALUE_RTX
22060 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
22061
22062 #undef TARGET_REGISTER_MOVE_COST
22063 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
22064
22065 #undef TARGET_RETURN_IN_MEMORY
22066 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
22067
22068 #undef TARGET_RETURN_IN_MSB
22069 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
22070
22071 #undef TARGET_RTX_COSTS
22072 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
22073
22074 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22075 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
22076
22077 #undef TARGET_SCHED_ISSUE_RATE
22078 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
22079
22080 #undef TARGET_SCHED_VARIABLE_ISSUE
22081 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
22082
22083 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22084 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22085 aarch64_sched_first_cycle_multipass_dfa_lookahead
22086
22087 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
22088 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
22089 aarch64_first_cycle_multipass_dfa_lookahead_guard
22090
22091 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
22092 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
22093 aarch64_get_separate_components
22094
22095 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
22096 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
22097 aarch64_components_for_bb
22098
22099 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
22100 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
22101 aarch64_disqualify_components
22102
22103 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
22104 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
22105 aarch64_emit_prologue_components
22106
22107 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
22108 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
22109 aarch64_emit_epilogue_components
22110
22111 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
22112 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
22113 aarch64_set_handled_components
22114
22115 #undef TARGET_TRAMPOLINE_INIT
22116 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
22117
22118 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22119 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
22120
22121 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22122 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
22123
22124 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
22125 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
22126 aarch64_builtin_support_vector_misalignment
22127
22128 #undef TARGET_ARRAY_MODE
22129 #define TARGET_ARRAY_MODE aarch64_array_mode
22130
22131 #undef TARGET_ARRAY_MODE_SUPPORTED_P
22132 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
22133
22134 #undef TARGET_VECTORIZE_ADD_STMT_COST
22135 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
22136
22137 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
22138 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
22139 aarch64_builtin_vectorization_cost
22140
22141 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
22142 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
22143
22144 #undef TARGET_VECTORIZE_BUILTINS
22145 #define TARGET_VECTORIZE_BUILTINS
22146
22147 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
22148 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
22149 aarch64_builtin_vectorized_function
22150
22151 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
22152 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
22153 aarch64_autovectorize_vector_modes
22154
22155 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
22156 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
22157 aarch64_atomic_assign_expand_fenv
22158
22159 /* Section anchor support. */
22160
22161 #undef TARGET_MIN_ANCHOR_OFFSET
22162 #define TARGET_MIN_ANCHOR_OFFSET -256
22163
22164 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
22165 byte offset; we can do much more for larger data types, but have no way
22166 to determine the size of the access. We assume accesses are aligned. */
22167 #undef TARGET_MAX_ANCHOR_OFFSET
22168 #define TARGET_MAX_ANCHOR_OFFSET 4095
22169
22170 #undef TARGET_VECTOR_ALIGNMENT
22171 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
22172
22173 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
22174 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
22175 aarch64_vectorize_preferred_vector_alignment
22176 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
22177 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
22178 aarch64_simd_vector_alignment_reachable
22179
22180 /* vec_perm support. */
22181
22182 #undef TARGET_VECTORIZE_VEC_PERM_CONST
22183 #define TARGET_VECTORIZE_VEC_PERM_CONST \
22184 aarch64_vectorize_vec_perm_const
22185
22186 #undef TARGET_VECTORIZE_RELATED_MODE
22187 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
22188 #undef TARGET_VECTORIZE_GET_MASK_MODE
22189 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
22190 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
22191 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
22192 aarch64_empty_mask_is_expensive
22193 #undef TARGET_PREFERRED_ELSE_VALUE
22194 #define TARGET_PREFERRED_ELSE_VALUE \
22195 aarch64_preferred_else_value
22196
22197 #undef TARGET_INIT_LIBFUNCS
22198 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
22199
22200 #undef TARGET_FIXED_CONDITION_CODE_REGS
22201 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
22202
22203 #undef TARGET_FLAGS_REGNUM
22204 #define TARGET_FLAGS_REGNUM CC_REGNUM
22205
22206 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
22207 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
22208
22209 #undef TARGET_ASAN_SHADOW_OFFSET
22210 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
22211
22212 #undef TARGET_LEGITIMIZE_ADDRESS
22213 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
22214
22215 #undef TARGET_SCHED_CAN_SPECULATE_INSN
22216 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
22217
22218 #undef TARGET_CAN_USE_DOLOOP_P
22219 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
22220
22221 #undef TARGET_SCHED_ADJUST_PRIORITY
22222 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
22223
22224 #undef TARGET_SCHED_MACRO_FUSION_P
22225 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
22226
22227 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
22228 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
22229
22230 #undef TARGET_SCHED_FUSION_PRIORITY
22231 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
22232
22233 #undef TARGET_UNSPEC_MAY_TRAP_P
22234 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
22235
22236 #undef TARGET_USE_PSEUDO_PIC_REG
22237 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
22238
22239 #undef TARGET_PRINT_OPERAND
22240 #define TARGET_PRINT_OPERAND aarch64_print_operand
22241
22242 #undef TARGET_PRINT_OPERAND_ADDRESS
22243 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
22244
22245 #undef TARGET_OPTAB_SUPPORTED_P
22246 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
22247
22248 #undef TARGET_OMIT_STRUCT_RETURN_REG
22249 #define TARGET_OMIT_STRUCT_RETURN_REG true
22250
22251 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
22252 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
22253 aarch64_dwarf_poly_indeterminate_value
22254
22255 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
22256 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
22257 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
22258
22259 #undef TARGET_HARD_REGNO_NREGS
22260 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
22261 #undef TARGET_HARD_REGNO_MODE_OK
22262 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
22263
22264 #undef TARGET_MODES_TIEABLE_P
22265 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
22266
22267 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
22268 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
22269 aarch64_hard_regno_call_part_clobbered
22270
22271 #undef TARGET_INSN_CALLEE_ABI
22272 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
22273
22274 #undef TARGET_CONSTANT_ALIGNMENT
22275 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
22276
22277 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
22278 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
22279 aarch64_stack_clash_protection_alloca_probe_range
22280
22281 #undef TARGET_COMPUTE_PRESSURE_CLASSES
22282 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
22283
22284 #undef TARGET_CAN_CHANGE_MODE_CLASS
22285 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
22286
22287 #undef TARGET_SELECT_EARLY_REMAT_MODES
22288 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
22289
22290 #undef TARGET_SPECULATION_SAFE_VALUE
22291 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
22292
22293 #undef TARGET_ESTIMATED_POLY_VALUE
22294 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
22295
22296 #undef TARGET_ATTRIBUTE_TABLE
22297 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
22298
22299 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
22300 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
22301 aarch64_simd_clone_compute_vecsize_and_simdlen
22302
22303 #undef TARGET_SIMD_CLONE_ADJUST
22304 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
22305
22306 #undef TARGET_SIMD_CLONE_USABLE
22307 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
22308
22309 #undef TARGET_COMP_TYPE_ATTRIBUTES
22310 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
22311
22312 #undef TARGET_GET_MULTILIB_ABI_NAME
22313 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
22314
22315 #undef TARGET_FNTYPE_ABI
22316 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
22317
22318 #if CHECKING_P
22319 #undef TARGET_RUN_TARGET_SELFTESTS
22320 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
22321 #endif /* #if CHECKING_P */
22322
22323 #undef TARGET_ASM_POST_CFI_STARTPROC
22324 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
22325
22326 #undef TARGET_STRICT_ARGUMENT_NAMING
22327 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22328
22329 #undef TARGET_MD_ASM_ADJUST
22330 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
22331
22332 struct gcc_target targetm = TARGET_INITIALIZER;
22333
22334 #include "gt-aarch64.h"